#!/usr/local/bin/perl # $Id: refstats.pl,v 1.3 1997/06/29 02:03:34 elkner Exp $ # refstats: Program to create statistics about URL's on a WWW server and # their refering URL's. # Its is a modified version of Benjamin Franz' RefStats 1.1.1 # ( http://www.netimages.com/~snowhare/utilities/ ) # THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS # OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A # PARTICULAR PURPOSE. # # I offer it to the public domain and I ask, however, that this paragraph # and my name be retained in any modified versions of the file you may # make, and that you notify me of any improvements you make to the code. # # Use of this software in any way or in any form, source or binary, # is not allowed in any country which prohibits disclaimers of any # implied warranties of merchantability or fitness for a particular # purpose or any disclaimers of a similar nature. # # IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, # SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE # USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT # LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE require "ctime.pl"; require "getopts.pl"; $version="RefStats 1.1.2-E"; # Location and name of the decompression program # (usually '/usr/bin/zcat', '/usr/local/bin/zcat', # '/usr/bin/gzip -c' or '/usr/local/bin/gzip -c') $Zcat="/usr/local/bin/zcat"; $refscounter=0; $filecounter=0; $newsection="


\n

0) { &PrintReport; } else { warn "No files found for analyzing!\n"; }
# -------------------------- main ends ------------------------------

# OpenFile($path)
#
# Open the log or statistics file $path for reading on the global REFSLOG
# handle and work out which month it belongs to.
#
#   $path - file name; names matching .gz or .Z are decompressed by
#           piping them through the program named in $Zcat.
#
# On success the file's modification time is pushed onto @filetimes, its
# "yymm" stamp onto @mmtimes, $filecounter is incremented and 1 is
# returned.  On failure a warning is printed and 0 is returned without
# touching those globals.
sub OpenFile {
    # Work on a copy: $_[0] aliases the caller's variable and must not be
    # turned into a pipeline string behind the caller's back.
    my $file = $_[0];

    if ( !-r $file ) {
        # Stop here; carrying on would reuse the previous file's $filetime
        # and could count a file that cannot be read.
        warn "$file is not readable: $!\n";
        return 0;
    }
    $filetime = (stat($file))[9];

    my $compressed = ($file =~ m/(\.gz|\.Z)/o);

    # Compressed files are assumed to be rotated logs that carry a yymm
    # stamp in their name; the last run of four digits is used.
    if ($compressed && $file =~ m/.*(\d\d)(\d\d)/) {
        ($yy, $mm) = ($1, $2);
    }
    # Otherwise (uncompressed log, stats file, or no stamp in the name) the
    # month comes from the modification time.  For a stats file the values
    # recorded below are replaced later on.
    else {
        my ($mon, $year) = (localtime($filetime))[4,5];
        $mm = sprintf("%02d", $mon + 1);
        $yy = sprintf("%02d", $year % 100);
    }

    my $opened;
    if ($compressed) {
        # List-form pipe open: the file name reaches the decompressor as a
        # single argument and is never interpreted by a shell.  $Zcat may
        # carry options ("gzip -c"), hence the split.
        $opened = open(REFSLOG, '-|', split(' ', $Zcat), $file);
    }
    else {
        $opened = open(REFSLOG, '<', $file);
    }
    if (!$opened) {
        warn "Can't open $file: $!\n";
        return 0;
    }

    push @filetimes, $filetime;
    push @mmtimes, "${yy}${mm}";
    $filecounter++;
    return 1;
}

# ReadLog()
#
# Read every line from the already opened REFSLOG handle, normalise the
# target/referrer pair it names and count it in %TargetCounter under the
# HTML-escaped key "target referrer".  Every line read increments
# $refscounter, whether or not it survives the filters.  REFSLOG is closed
# when the input is exhausted.
#
# Reads the globals $LogType, $STRIPTARGETPARMS, $STRIPREFPARMS,
# $EXCLUDEREFSTO, $IncludeOnlyRefsTo, $EXCLUDEREFSFROM,
# @TargetRegexReplace and @ReferrerRegexReplace.
sub ReadLog {
    while ($line = <REFSLOG>) {
        $refscounter++;
        # chomp, not chop: a final line without a newline must keep its
        # last character.
        chomp $line;

        if ($LogType eq 'combined') {
            ($Domain,$rfc931,$authuser,$TimeDate,$Request,
             $Status,$Bytes,$referrer,$Agent) = $line =~
                /^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^\"]*)\" (\S+) (\S+) \"([^\"]*)\" \"([^\"]*)\"/o;
            ($Method,$target,$Protocal) = split(/\s/, $Request, 3);
        }
        elsif ($LogType eq 'referer') {
            ($referrer,$target) = split(/ -> /, $line, 2);
        }
        next if (! ($referrer && $target));

        $target =~ s/\%7[eE]/~/g;                  # canonicalize every %7E / %7e as ~
        $target =~ s#//#/#g;                       # remove any extra slashes
        $target =~ s#^ *$#/#;                      # fix root ref if needed
        $target =~ s/#.+$//;                       # combine #anchor refs with root doc
        $target =~ s/\?.*$// if ($STRIPTARGETPARMS);   # strip '?' parameters
        if ( $target ne "/" ) {
            $target =~ s#/$##;                     # remove last /
        }
        # NOTE(review): each entry is Perl source run through string eval;
        # presumably built from the -u option and expected to rewrite
        # $target -- confirm against Initialize.  Operator-supplied code
        # only, never untrusted input.
        foreach (@TargetRegexReplace) {
            eval $_;
        }
        next if ( ($EXCLUDEREFSTO) && ($target =~ m#$EXCLUDEREFSTO#io) );
        next if ($IncludeOnlyRefsTo && !($target =~ m#$IncludeOnlyRefsTo#) );

        $referrer =~ s/\%7[eE]/~/g;                # canonicalize every %7E / %7e as ~
        $referrer =~ s#^(http://[^/]+):80/#$1/#;   # remove unneeded :80 port specification
        $referrer =~ s/#.+$//;                     # combine #anchor refs with root doc
        $referrer =~ s/\?.*$// if $STRIPREFPARMS;  # strip '?' parameters
        # NOTE(review): string eval again, presumably the -U expressions.
        foreach (@ReferrerRegexReplace) {
            eval $_;
        }
        if ($EXCLUDEREFSFROM && ($referrer =~ m#$EXCLUDEREFSFROM#oi) ) {
            next;
        }

        $keyform = $target.' '.$referrer;
        # HTML-escape the key.  The ampersand goes first so that the
        # entities produced by the later substitutions are not escaped a
        # second time.  It is written as \x26 so the entity names survive
        # tools that decode HTML entities in source text.
        $keyform =~ s/\x26/\x26amp;/g;             # prevent accidents with the ampersand
        $keyform =~ s/</\x26lt;/g;                 # prevent accidents with '<'
        $keyform =~ s/>/\x26gt;/g;                 # prevent accidents with '>'
        $keyform =~ s/"/\x26quot;/g;               # prevent accidents with '"'
        $TargetCounter{$keyform}++;
    }
    close(REFSLOG);
}

sub PrintSummary { @tmp = sort @filetimes; $lastmodtime=$tmp[$#tmp]; $filedate = &ctime($lastmodtime); $date = &ctime(time); print OUT $newsection,"summary\">Summary

\n", "Last analyzed: ", $date, "
\n", "Last log file modification: ", $filedate, "
\n", "\n", "References examined: ", $refscounter, "
\n", "Required minimal references per URL: ",$MinRefs, "
\n"; } sub PrintReport { $li="
  • 0 ) { $yymm = $tmp[$#tmp]; $mm = substr($yymm,2,2); $yy = substr($yymm,0,2); $line .= " - $NumberToMonth{$mm} $yy"; } # Remove false hits caused by people using incorrect URLs # that are redirected by the server foreach $key (keys (%TargetCounter)) { ($target,$referrer)=split(/ /,$key,2); if ($TargetCounter{"$target/ $referrer"} && $TargetCounter{"$target $referrer"}) { $refscounter-=$TargetCounter{$key}; $TargetCounter{$key}=-1; } } open (OUT,">$OutputFile.$$") || die ("Could not open $OutputFile for writing.\n$!"); print OUT "Referring URL Statistics for ", $HTTPDSERVER, "\n", "\n

    Referring URL Statistics for ", $HTTPDSERVER, "
    \n",$line,"

    \n"; &PrintSummary; print OUT $newsection,"references\">Web Pages and Referring URLs:\n", "
    \n"; $LastWebPage=''; foreach $key (sort bytargetthenhits keys(%TargetCounter)) { ($target,$referrer)=split(/ /,$key,2); next if ($TargetCounter{$key} < $MinRefs); print OUT "
    $target
    \n" if ("$target" ne "$LastWebPage"); print OUT "
    $referrer", " ($TargetCounter{$key} reference"; print OUT "s" if ($TargetCounter{$key} > 1); print OUT ")
    \n"; $LastWebPage=$target; } print OUT "
    \n", "

    \n


    \n

    ", "Generated using $version from ", "", "Jens Elkner, \n", "a modified version of \n", "", "RefStats 1.1.1\n", "\n"; close(OUT); if ($OutputFile ne '-') { rename "$OutputFile.$$", "$OutputFile"; } } sub bytargetthenhits { ($targeta)=($a=~m#^(.+)\s#o); ($targetb)=($b=~m#^(.+)\s#o); $inequality=($targeta cmp $targetb); if ($inequality) { $inequality; } else { $TargetCounter{$b}<=>$TargetCounter{$a}; } } sub GetLastModTime { while () { next if ( $_ !~ m#<\!--\s*(\d+)\s*-->#); $filetime = $1; pop @filetimes; push @filetimes, $filetime; ($mm,$yy) = (localtime($filetime))[4,5]; $mm++; $mm = "0$mm" if ( $mm < 10 ); $yy -= 100 if ($yy > 99); $yy = "0$yy" if ( $yy < 10 ); pop @mmtimes; push @mmtimes, "${yy}${mm}"; last; } } sub ReadOld { while () { last if (m##oi); # check for end of html file chop; &GetLastModTime if (//oi); $refscounter += $1 if (/References examined:<\/b>\s+(\d+)/); if (m#^

    (.*)
    #o) { $target = $1; next; } next if (! $target); foreach (@TargetRegexReplace) { eval $_; } next if ( ($EXCLUDEREFSTO) && ($target =~ m#$EXCLUDEREFSTO#io) ); next if ($IncludeOnlyRefsTo && !($target=~m#$IncludeOnlyRefsTo#) ); if (m#^
    .* \((\d+) references?\)
    $#o) { $referrer = $1; if ($EXCLUDEREFSFROM && ($referrer=~m#$EXCLUDEREFSFROM#oi) ) { next; } $TargetCounter{"$target $referrer"} += $2; } } close(REFSLOG); } sub Version { die <<"EndVersion"; This is $version. It is Jens Elkner\'s modified version of refstats 1.1.1. EndVersion } sub Usage { die <<"EndUsage"; Process a sequence of NCSA httpd common format referer_log files and output an HTML summary. Usage: refstats [-h] [-v] [-s] [-S] [-t] [-N HTTD-Server-Name] [-m MinRefs] [-u substExprList] [-U substExprList] [-x perlregex] [-X perlregex] [-r perlregex] [-i file] [-o file] [logfile ...] [logfile.gz ...] [logfile.Z ...] Options: -h display this help message and quit. -v display version -i fileList A comma separated list of refstats files, generated with the same Version of RefStats in previous runs. -m number Minimum number of refs from a page to be included in the list (default = 2) -N name HTTPD Server name for report -o file Output file (default = Standard Output) -r regex Only include refs to pages matching the perl regex -s Do NOT strip Parameters of refs from any pages. -S Do NOT strip Parameters of refs to any pages. -t Set Type of the Logfile to combined (default = referer) -U substExprList A @@ separated List of Perl substitute expressions for doing Referer Substitutions/Replacements -u substExprList A @@ separated List of Perl substitute expressions for doing Target Substitutions/Replacements -x regex Exclude refs from any pages matching the perl regex -X regex Exclude refs to any pages matching the perl regex Perl subst expr ... everything after the s in s/PATTERN/REPLACEMENT/SWITCHES (e.g. 
#^/((disk1)|(disk2))/#/pub/#oi ) EndUsage } sub Initialize { %NumberToMonth=( '01','January', '02','February', '03','March', '04','April', '05','May', '06','June', '07','July', '08','August', '09','September', '10','October', '11','November', '12','December', ); $result = &Getopts('hi:N:m:o:r:sSt:U:u:vx:X:'); &Usage if $opt_h || $result == 0; &Version if $opt_v; if ($opt_i) { @Includes = split(",",$opt_i); } $OutputFile = $opt_o ? $opt_o : "-"; $EXCLUDEREFSFROM=$opt_x if $opt_x; $EXCLUDEREFSTO=$opt_X if $opt_X; $STRIPREFPARMS = $opt_s ? 0 : 1; $STRIPTARGETPARMS = $opt_S ? 0 : 1; $IncludeOnlyRefsTo=$opt_r if $opt_r; $HTTPDSERVER = $opt_N ? $opt_N : "www.dom.ain"; $LogType = $opt_t ? "combined" : "referer"; $tmp = $opt_m ? $opt_m : 2; if ($tmp =~ m#^\d+$#) { $MinRefs = $tmp; } else { print STDERR "Invalid comand line switch usage, -m requires a "; print STDERR "number.\n"; print STDERR "Comand line switch and argument $opt_m ignored\n"; $MinRefs = 2; } if ($opt_u) { @tmp = split("@@",$opt_u); foreach (@tmp) { $RegexReplace = <