#!/usr/local/bin/perl
# $Id: refstats.pl,v 1.3 1997/06/29 02:03:34 elkner Exp $

# refstats: Program to create statistics about URLs on a WWW server and
#           their referring URLs.
 
# It is a modified version of Benjamin Franz' RefStats 1.1.1
# ( http://www.netimages.com/~snowhare/utilities/ )

# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS 
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE 
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE.
#
# I offer it to the public domain and I ask, however, that this paragraph
# and my name be retained in any modified versions of the file you may
# make, and that you notify me of any improvements you make to the code.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any 
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 
# SPECIAL, INCIDENTAL,  OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 
# USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT 
# LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE

require "ctime.pl";
require "getopts.pl";

# Name and version of this program, as shown by -v and in the report footer.
$version = "RefStats 1.1.2-E";

# Command used to decompress *.gz / *.Z log files to standard output
# (usually '/usr/bin/zcat', '/usr/local/bin/zcat',
# '/usr/bin/gzip -c' or '/usr/local/bin/gzip -c').
$Zcat = "/usr/local/bin/zcat";

# Running totals: lines/references examined and files successfully opened.
$refscounter = 0;
$filecounter = 0;

# HTML fragment that starts a new report section; the caller appends the
# anchor name, the closing quote and the heading text.
$newsection = "<p><hr><p>\n<h2><a name=\"";

# Pre-set every field variable so that perl -w has nothing to complain about.
$Domain = $rfc931 = $authuser = $TimeDate = $Request = '';
$Status = $Bytes = $referrer = $Agent = $Method = $target = '';
$Protocal = '';

# Parse the command line (fills @Includes and the option globals).
&Initialize;

# First merge the statistics files of previous runs ...
foreach (@Includes) {
    $tmp = $_;
    next unless &OpenFile($_);
    &ReadOld($tmp);
}

# ... then add the log files named on the command line.
foreach (@ARGV) {
    next unless &OpenFile($_);
    &ReadLog($_);
}

# Only write a report when at least one input could be opened.
unless ($filecounter > 0) {
    warn "No files found for analyzing!\n";
}
else {
    &PrintReport;
}
# -------------------------- main ends ------------------------------
# OpenFile(NAME) - open a log or statistics file on the global handle REFSLOG.
#
# NAME is opened with two-argument open(), so '-' (standard input) and
# explicit "command |" pipes keep working.  Names ending in .gz or .Z are
# read through the decompressor configured in $Zcat.
#
# On success the file's time stamp is appended to @filetimes, its "yymm"
# month key to @mmtimes, $filecounter is incremented and 1 is returned.
# On failure a warning is printed and 0 is returned.
#
# Side effects: sets the globals $filetime, $mm and $yy.  The caller's
# argument is no longer modified.
sub OpenFile {
    my $name       = $_[0];
    my $source     = $name;
    # Anchor the suffix test: only a name that really ends in .gz/.Z is
    # compressed, not every path that merely contains such a substring.
    my $compressed = ($name =~ m/\.(gz|Z)$/) ? 1 : 0;

    if ( -r $name ) {
        $filetime = (stat($name))[9];
    }
    else {
        warn "$name is not readable: $!\n";
        # The zcat pipe would open "successfully" even though the file is
        # missing, and the file would be counted as analyzed.
        return 0 if $compressed;
        # No modification time available (e.g. '-' or a command pipe):
        # use the current time rather than the previous file's value.
        $filetime = time;
    }

    ($yy, $mm) = ('', '');
    if ($compressed) {
        # Quote the name for the shell so blanks and metacharacters in it
        # cannot break (or extend) the decompression command.
        my $quoted = $name;
        $quoted =~ s/'/'\\''/g;
        $source = "$Zcat '$quoted' |";

        # Assuming these files are real logfiles named like log.9601.gz:
        # take the last run of four digits in the name as yymm, but only
        # when the month part is plausible.
        if ($name =~ m/.*(\d\d)(\d\d)/ && $2 >= 1 && $2 <= 12) {
            ($yy, $mm) = ($1, $2);
        }
    }
    if ($mm eq '') {
        # Uncompressed log or stats file, or a compressed file without a
        # usable date in its name: derive yymm from the time stamp.
        # (For a stats file ReadOld replaces these values later on.)
        my @when = localtime($filetime);
        $mm = sprintf("%02d", $when[4] + 1);
        $yy = sprintf("%02d", $when[5] % 100);
    }

    if (open(REFSLOG, $source)) {
        push @filetimes, $filetime;
        $filecounter++;
        push @mmtimes, "${yy}${mm}";
        return 1;
    }
    warn "Can't open $name: $!\n";
    return 0;
}

# ReadLog - read the log file currently open on REFSLOG and count every
# (target, referrer) pair in %TargetCounter.
#
# The line format is selected by $LogType: 'combined' (extended common log
# format with referrer and agent) or 'referer' ("referrer -> target").
# Every line read increments $refscounter, even if it is skipped later.
# Keys of %TargetCounter are "target referrer" (single blank), HTML-escaped.
# Arguments are ignored; REFSLOG is closed on return.
sub ReadLog {
    while ($line = <REFSLOG>) {
        $refscounter++;
        # chomp, not chop: a last line without newline must not lose its
        # final character.
        chomp $line;

        if ($LogType eq 'combined') {
            ($Domain,$rfc931,$authuser,$TimeDate,$Request,
             $Status,$Bytes,$referrer,$Agent) = $line =~
                /^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^\"]*)\" (\S+) (\S+) \"([^\"]*)\" \"([^\"]*)\"/;
            # A malformed line leaves all fields undefined; it is skipped below.
            ($Method,$target,$Protocal) =
                defined($Request) ? split(/\s/,$Request,3) : ();
        }
        elsif ($LogType eq 'referer') {
            ($referrer,$target) = split(/ -> /,$line,2);
        }
        next if (! ($referrer && $target));

        $target =~ s/\%7[eE]/~/g;       # canonicalize every %7E / %7e as ~
        $target =~ s#//#/#g;            # remove any extra slashes
        $target =~ s#^ *$#/#;           # fix root ref if needed
        $target =~ s/#.+$//;            # combine #anchor refs with root doc
        $target =~ s/\?.*$// if ($STRIPTARGETPARMS);    # strip '?' parameters
        if ( $target ne "/" ) {
            $target =~ s#/$##;          # remove last /
        }
        # NOTE: these are user supplied substitutions (-u) run via string
        # eval; they execute arbitrary Perl given on the command line.
        foreach (@TargetRegexReplace) { eval $_; }
        next if ( ($EXCLUDEREFSTO) && ($target =~ m#$EXCLUDEREFSTO#io) );
        next if ($IncludeOnlyRefsTo && !($target=~m#$IncludeOnlyRefsTo#) );

        $referrer =~ s/\%7[eE]/~/g;     # canonicalize every %7E / %7e as ~
        $referrer =~ s#^(http://[^/]+):80/#$1/#;  # remove unneeded :80 port
        $referrer =~ s/#.+$//;          # combine #anchor refs with root doc
        $referrer =~ s/\?.*$// if $STRIPREFPARMS; # strip '?' parameters
        # Same caveat as above: -U expressions are evaluated as Perl code.
        foreach (@ReferrerRegexReplace) { eval $_; }
        next if ($EXCLUDEREFSFROM && ($referrer=~m#$EXCLUDEREFSFROM#oi) );

        $keyform = $target.' '.$referrer;
        # Escape '&' FIRST; otherwise the ampersands introduced by the
        # other entities would be escaped a second time (&amp;lt;).
        $keyform =~ s#\&#\&amp\;#g;     # prevent accidents with '&'
        $keyform =~ s#<#\&lt\;#g;       # prevent accidents with '<'
        $keyform =~ s#>#\&gt\;#g;       # prevent accidents with '>'
        $keyform =~ s#"#\&quot\;#g;     # prevent accidents with '"'
        $TargetCounter{$keyform}++;
    }
    close(REFSLOG);
}

# PrintSummary - write the "Summary" section of the report to OUT.
#
# Reads @filetimes, $refscounter, $MinRefs and $newsection; sets the globals
# @tmp, $lastmodtime, $filedate and $date.  The newest time stamp is also
# written as an HTML comment ("<!-- seconds -->"), which GetLastModTime
# parses when the report is fed back in with -i.
sub PrintSummary {
    # Time stamps are epoch seconds: compare them numerically.  The default
    # string sort would put "999999999" after "1000000000".
    @tmp = sort { $a <=> $b } @filetimes;
    $lastmodtime = $tmp[$#tmp];
    $filedate = &ctime($lastmodtime);
    $date = &ctime(time);
    print OUT
        $newsection,"summary\">Summary</a></h2>\n",
        "<b>Last analyzed:</b> ", $date, "<br>\n",
        "<b>Last log file modification:</b> ", $filedate, "<br>\n",
        "<!-- $lastmodtime -->\n",
        "<b>References examined:</b> ", $refscounter, "<br>\n",
        "<b>Required minimal references per URL:</b> ",$MinRefs,
        "<br>\n";
}

# PrintReport - write the complete HTML report.
#
# Reads %TargetCounter, @mmtimes, $OutputFile, $HTTPDSERVER, $MinRefs,
# $version and %NumberToMonth.  If $OutputFile is '-' the report goes to
# standard output; otherwise it is written to "$OutputFile.<pid>" and
# renamed to $OutputFile once it is complete, so a reader never sees a
# half-written report.  Dies if the output cannot be opened, written or
# renamed.
sub PrintReport {
    my $tmpfile = "$OutputFile.$$";

    $li="<li> <a href=\"";   # NOTE(review): not referenced in the visible code

    # Title: first month covered, plus the last one if there are several inputs.
    @tmp = sort @mmtimes;
    $yymm = $tmp[0];
    $mm = substr($yymm,2,2);
    $yy = substr($yymm,0,2);
    $line = "$NumberToMonth{$mm} $yy";
    if ( $#mmtimes > 0 ) {
        $yymm = $tmp[$#tmp];
        $mm = substr($yymm,2,2);
        $yy = substr($yymm,0,2);
        $line .= " - $NumberToMonth{$mm} $yy";
    }

    # Remove false hits caused by people using incorrect URLs
    # that are redirected by the server: if both "target/" and "target"
    # were referenced from the same page, drop the entry without slash.
    foreach $key (keys (%TargetCounter)) {
        ($target,$referrer)=split(/ /,$key,2);
        if ($TargetCounter{"$target/ $referrer"} && $TargetCounter{"$target $referrer"}) {
            $refscounter-=$TargetCounter{$key};
            $TargetCounter{$key}=-1;    # below any $MinRefs, never printed
        }
    }

    if ($OutputFile eq '-') {
        # ">-.<pid>" would create a file literally named "-.<pid>";
        # duplicate STDOUT instead so the documented default really works.
        open (OUT,">&STDOUT") ||
            die ("Could not open standard output for writing.\n$!");
    }
    else {
        open (OUT,">$tmpfile") ||
            die ("Could not open $tmpfile for writing.\n$!");
    }

    print OUT
        "<html><head><title>Referring URL Statistics for ",
        $HTTPDSERVER, "</title></head>\n",
        "<body>\n<h1 align=center>Referring URL Statistics for ",
        $HTTPDSERVER, "<br>\n",$line,"</h1>\n";
    &PrintSummary;
    print OUT
        # $newsection opens an <a name="...">; it has to be closed here.
        $newsection,"references\">Web Pages and Referring URLs:</a></h2>\n",
        "<dl>\n";

    # One <dt> per target, followed by its referrers ordered by hit count.
    $LastWebPage='';
    foreach $key (sort bytargetthenhits keys(%TargetCounter)) {
        ($target,$referrer)=split(/ /,$key,2);
        next if ($TargetCounter{$key} < $MinRefs);
        print OUT "<dt>$target</dt>\n" if ("$target" ne "$LastWebPage");
        print OUT
            "<dd><a href=\"$referrer\">$referrer</a>",
            " ($TargetCounter{$key} reference";
        print OUT "s" if ($TargetCounter{$key} > 1);
        print OUT ")</dd>\n";
        $LastWebPage=$target;
    }
    print OUT
        "</dl>\n",
        "<p>\n<hr>\n<p>",
        "Generated using <a href=\"http://irb.cs.uni-magdeburg.de/~elkner/webtools/refstats.shtml\">$version</a> from ",
        "<a href=\"mailto:elkner\@irb.cs.uni-magdeburg.de\">",
        "Jens Elkner</a>, \n",
        "a modified version of \n",
        "<a href=\"http://www.netimages.com/~snowhare/utilities/\">",
        "RefStats 1.1.1</a>\n",
        "</body></html>\n";

    # Buffered write errors (disk full, ...) only show up at close time.
    close(OUT) || die ("Could not finish writing the report.\n$!");
    if ($OutputFile ne '-') {
        rename($tmpfile, $OutputFile) ||
            die ("Could not rename $tmpfile to $OutputFile.\n$!");
    }
}

# bytargetthenhits - sort comparator for the keys of %TargetCounter.
#
# Keys have the form "target referrer".  Orders by target (string
# comparison, ascending) and, within the same target, by reference count
# (descending).  Compares the sort globals $a and $b.
sub bytargetthenhits {
	# Cut at the FIRST blank, exactly like PrintReport does.  A greedy
	# match up to the last whitespace would treat part of a referrer that
	# contains a blank as belonging to the target and tear the entries of
	# one target apart.
	($targeta) = split(/ /, $a, 2);
	($targetb) = split(/ /, $b, 2);

	$inequality = ($targeta cmp $targetb);
	return $inequality if ($inequality);
	return $TargetCounter{$b} <=> $TargetCounter{$a};
}

# GetLastModTime - continue reading REFSLOG until the "<!-- seconds -->"
# marker line is found, then replace the most recent entries of @filetimes
# and @mmtimes with the time stamp recorded there.
# Sets the globals $filetime, $mm and $yy; $_ is left on the last line read
# (undefined if the end of the file was reached without finding the marker).
sub GetLastModTime {
    while (<REFSLOG>) {
        if (m#<\!--\s*(\d+)\s*-->#) {
            $filetime = $1;

            # Build the two-digit month and year for the "yymm" key.
            ($mm, $yy) = (localtime($filetime))[4,5];
            $mm = sprintf("%02d", $mm + 1);     # localtime months start at 0
            $yy -= 100 if ($yy > 99);           # years are counted from 1900
            $yy = sprintf("%02d", $yy);

            # Replace the values OpenFile stored for this file.
            pop @filetimes;
            push @filetimes, $filetime;
            pop @mmtimes;
            push @mmtimes, $yy . $mm;
            last;
        }
    }
}

# ReadOld - merge a report written by an earlier run (currently open on
# REFSLOG) into %TargetCounter and $refscounter.
#
# Recognizes the lines produced by PrintSummary/PrintReport: the summary
# anchor (followed by the "<!-- seconds -->" time stamp), the "References
# examined" count, "<dt>target</dt>" and "<dd><a href=...> (N references)".
# The current target/referrer filters and target substitutions are applied
# to the old data as well.  Arguments are ignored; REFSLOG is closed on
# return.
sub ReadOld {
    # Do not inherit a target from a previously read file.
    $target = '';
    while (<REFSLOG>) {
        last if (m#</html>#i);  # check for end of html file
        chomp;                  # chomp: never eat a real character
        if (/<a name=\"summary\">/i) {
            &GetLastModTime;
            # GetLastModTime reads on through $_; at end of file it is undef.
            last unless defined $_;
        }
        $refscounter += $1 if (/<b>References examined:<\/b>\s+(\d+)/);
        if (m#^<dt>(.*)</dt>#) {
            $target = $1;
            # Apply the -u substitutions exactly once per target.  Running
            # them again for every following line would compound
            # non-idempotent rules (/x -> /pub/x -> /pub/pub/x ...).
            # NOTE: string eval of user supplied Perl from the command line.
            foreach (@TargetRegexReplace) { eval $_; }
            next;
        }
        next if (! $target);
        next if ( ($EXCLUDEREFSTO) && ($target =~ m#$EXCLUDEREFSTO#io) );
        next if ($IncludeOnlyRefsTo && !($target=~m#$IncludeOnlyRefsTo#) );
        if (m#^<dd><a href=\"([^"]*)\">.*</a> \((\d+) references?\)</dd>$#) {
            $referrer = $1;
            if ($EXCLUDEREFSFROM && ($referrer=~m#$EXCLUDEREFSFROM#oi) ) {
                next;
            }
            $TargetCounter{"$target $referrer"} += $2;
        }
    }
    close(REFSLOG);
}

# Version - print the program version on STDERR and terminate (via die).
sub Version {
    my $banner = <<"EndVersion";
This is $version.

It is Jens Elkner\'s modified version of refstats 1.1.1.
EndVersion
    die $banner;
}

# Usage - print the command line help on STDERR and terminate (via die).
sub Usage {
    my $help = <<"EndUsage";
Process a sequence of NCSA httpd common format referer_log files and output an HTML summary. 

Usage: refstats [-h] [-v] [-s] [-S] [-t] [-N HTTD-Server-Name] [-m MinRefs]
                [-u substExprList] [-U substExprList]
                [-x perlregex] [-X perlregex] 	
                [-r perlregex]
		[-i file] [-o file]
                [logfile ...] [logfile.gz ...] [logfile.Z ...]

Options:
  -h                    display this help message and quit.
  -v                    display version 

  -i fileList           A comma separated list of refstats files, generated
                        with the same Version of RefStats in previous runs.
  -m number             Minimum number of refs from a page to be included  
                        in the list (default = 2)
  -N name               HTTPD Server name for report
  -o file               Output file (default = Standard Output)
  -r regex              Only include refs to pages matching the perl regex 
  -s                    Do NOT strip Parameters of refs from any pages.
  -S                    Do NOT strip Parameters of refs to any pages.
  -t                    Set Type of the Logfile to combined (default = referer)
  -U substExprList      A @@ separated List of Perl substitute expressions
                        for doing Referer Substitutions/Replacements
  -u substExprList      A @@ separated List of Perl substitute expressions
                        for doing Target Substitutions/Replacements
  -x regex              Exclude refs from any pages matching the perl regex 
  -X regex              Exclude refs to any pages matching the perl regex

Perl subst expr  ...  everything after the s in s/PATTERN/REPLACEMENT/SWITCHES 
                      (e.g. #^/((disk1)|(disk2))/#/pub/#oi )
EndUsage
    die $help;
}

# Initialize - parse the command line and set up the configuration globals.
#
# Sets %NumberToMonth, @Includes, $OutputFile, $EXCLUDEREFSFROM,
# $EXCLUDEREFSTO, $STRIPREFPARMS, $STRIPTARGETPARMS, $IncludeOnlyRefsTo,
# $HTTPDSERVER, $LogType, $MinRefs, @TargetRegexReplace and
# @ReferrerRegexReplace.  Options are removed from @ARGV by Getopts, so
# only the log file names remain.  Calls Usage (which dies) for -h or an
# invalid command line, and Version (which dies) for -v.
sub Initialize {
    # Two-digit month number => month name, for the report title.
    %NumberToMonth=(
                    '01','January',
                    '02','February',
                    '03','March',
                    '04','April',
                    '05','May',
                    '06','June',
                    '07','July',
                    '08','August',
                    '09','September',
                    '10','October',
                    '11','November',
                    '12','December',
                    );

    # -t is a plain switch (see Usage).  Declaring it as 't:' made it
    # swallow the following word - usually the first log file name.
    $result = &Getopts('hi:N:m:o:r:sStU:u:vx:X:');
    &Usage if  $opt_h || $result == 0;
    &Version if $opt_v;

    if ($opt_i) {
        @Includes = split(",",$opt_i);
    }
    $OutputFile = $opt_o ? $opt_o : "-";
    $EXCLUDEREFSFROM=$opt_x if $opt_x;
    $EXCLUDEREFSTO=$opt_X if $opt_X;
    $STRIPREFPARMS = $opt_s ? 0 : 1;
    $STRIPTARGETPARMS = $opt_S ? 0 : 1;
    $IncludeOnlyRefsTo=$opt_r if $opt_r;
    $HTTPDSERVER = $opt_N ? $opt_N : "www.dom.ain";
    $LogType = $opt_t ? "combined" : "referer";

    # Test for definedness so that an explicit "-m 0" is honoured instead
    # of being silently replaced by the default.
    my $minrefs = defined($opt_m) ? $opt_m : 2;
    if ($minrefs =~ m#^\d+$#) {
        $MinRefs = $minrefs;
    }
    else {
        print STDERR "Invalid comand line switch usage, -m requires a ";
        print STDERR "number.\n";
        print STDERR "Comand line switch and argument $opt_m ignored\n";
        $MinRefs = 2;
    }

    # Turn every "@@"-separated expression into a complete substitution
    # statement.  ReadLog and ReadOld run these statements with eval, so
    # they are arbitrary Perl code taken from the command line.
    if ($opt_u) {
        foreach my $expr (split("@@",$opt_u)) {
            push @TargetRegexReplace, "\$target=~ s$expr;\n";
        }
    }
    if ($opt_U) {
        foreach my $expr (split("@@",$opt_U)) {
            push @ReferrerRegexReplace, "\$referrer=~ s$expr;\n";
        }
    }
}
