#!/usr/local/bin/perl

# $Id: proxycounter.pl,v 1.4 1997/10/12 08:27:00 elkner Exp $

# proxycounter: Program to create statistics from log files about proxies
#               which accessed the WWW server. 

# It reads the Apache custom log file created with the directive
# "CustomLog logs/file %{VIA}i" and, if available, agent log files.

# written by Jens Elkner (elkner@irb.cs.uni-magdeburg.de)

# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS 
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE 
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE.
#
# I offer it to the public domain and I ask, however, that this paragraph
# and my name be retained in any modified versions of the file you may
# make, and that you notify me of any improvements you make to the code.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any 
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 
# SPECIAL, INCIDENTAL,  OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 
# USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT 
# LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE

require "ctime.pl";
require "getopts.pl";

# Program identification; shown in the report footer and by option -v.
$Version = 'ProxyCounter 1.0';

# Command used to decompress .gz/.Z log files to standard output.
$Zcat = '/usr/local/bin/gzip -cd';

# Type of log file being parsed ('agent','combined'); option -t selects
# 'combined'.
$LogType = "agent";

# Name of server printed in the report title; option -N overrides it.
$HTTPDSERVER = "www.cs.uni-magdeburg.de";

# Table layout of the report: cellpadding size and border size.
($cellpadding, $border) = (2, 2);

# HTML fragments shared by the report sections: the opening of a section
# heading (the anchor name is appended by the caller) and the link back to
# the top of the page.
$newsection   = qq{<p><hr><p>\n<h2><a name="};
$return2index = qq{<p><a href="#top">Top</a>\n};

# Running totals: number of input files opened and number of proxy hops seen.
($filecounter, $proxycounter) = (0, 0);

# Number of hits per hop count.
%proxy_level = ();

&Initialize;

# Earlier HTML reports (option -i): merge their totals into this run.
for my $old_report (@OldLogs) {
    my $name = $old_report;
    &ReadOldLog($name) if &OpenFile($old_report);
}

# User agent log files (option -a).
for my $agent_log (@AgentLogs) {
    my $name = $agent_log;
    &ReadAgentLog($name) if &OpenFile($agent_log);
}

# VIA log files named on the command line.
for my $via_log (@ARGV) {
    &ReadLog if &OpenFile($via_log);
}

if ($filecounter <= 0) {
    warn "No files found for analyzing!\n";
}
else {
    # The agent logs and the VIA logs describe the same requests, so the
    # larger of the two totals is reported.
    $hits = $hits_old if $hits_old > $hits;
    $access = $access_old if $access_old > $access;
    &PrintReport;
}


# --------------------------------- That's it --------------------------------
# OpenFile(name): open the named log or stats file on the LOG handle.
#
# Files whose name contains ".gz" or ".Z" are read through $Zcat.
# On success the file's modification time is pushed onto @filetimes, its
# period ("yymm") onto @mmtimes, $filecounter is incremented and 1 is
# returned.  The globals $filetime, $mm and $yy are left set for the file.
# Returns 0 (after a warning) if the file is unreadable or cannot be opened;
# in that case none of the bookkeeping arrays is touched.
sub OpenFile {
    # Work on a copy: the elements of @_ alias the caller's variables.
    my $file = $_[0];
    my $target = $file;

    # An unreadable file is rejected right away.  Otherwise a piped open of
    # a missing archive would still "succeed" and be counted with the
    # modification time of the previously opened file.
    unless ( -r $file ) {
        warn "$file is not readable: $!\n";
        return 0;
    }
    $filetime = (stat($file))[9];

    my $compressed = ($file =~ m/(\.gz|\.Z)/) ? 1 : 0;

    # assuming these files are real logfiles (i.e. a.html.9601.gz would not
    # work for getting the right modification time): the last four
    # consecutive digits of the name are taken as yymm
    if ($compressed && $file =~ m/.*(\d\d\d\d)/) {
        my $yymm = $1;
        $yy = substr($yymm,0,2);
        $mm = substr($yymm,2,2);
    }
    # an uncompressed log file or stats file, or a compressed file without
    # a date in its name: derive the period from the modification time
    # (for a stats file, GetLastModTime replaces these entries later)
    else {
        ($mm,$yy) = (localtime($filetime))[4,5];
        $mm++;
        $mm = "0$mm" if ( $mm < 10 );
        $yy -= 100 if ($yy > 99);
        $yy = "0$yy" if ( $yy < 10 );
    }

    if ($compressed) {
        # Single-quote the name so blanks and shell metacharacters in it
        # cannot break or alter the decompression command.
        my $quoted = $file;
        $quoted =~ s/'/'\\''/g;
        $target = "$Zcat '$quoted' |";
    }

    if (open(LOG, $target)) {
        push @filetimes, $filetime;
        $filecounter++;
        push @mmtimes, "${yy}${mm}";
        return 1;
    }
    warn "Can't open $target: $!\n";
    return 0;
}
    
# ReadAgentLog: read a user agent log from the already opened LOG handle.
#
# The first $ignore lines are skipped.  Every remaining line that contains
# " via " is handed to Check4Proxies.  For $LogType 'combined' the agent
# field is first extracted from the combined log line.  The number of
# lines actually processed is added to $hits_old.  LOG is closed on return.
sub ReadAgentLog {
    local($linecounter) = 0;
    while (<LOG>) {
        # Count first, then skip: testing before the increment would leave
        # the counter at zero forever and skip the whole file.
        $linecounter++;
        next if $linecounter <= $ignore;
        chomp;
        # collapse every run of whitespace, not just the first one; the
        # combined-format pattern below expects single blanks
        s/\s+/\ /g;
        if ($LogType eq 'combined') {
            # combined logfile format:
            # $Host $rfc931 $auth $Time $Request $Status $Bytes $Referrer $Agent
            s/^\S+ \S+ \S+ \[[^\]\[]+\] \"[^\"]*\" \S+ \S+ \"[^\"]*\" \"([^\"]*)\"/$1/o;
        }
        # strip leading chr(0)
        s/^\x0*//;
        &Check4Proxies($_) if /\s+via\s+/;
    }
    close(LOG);
    # A file shorter than $ignore contributes nothing (never a negative count).
    $hits_old += $linecounter - $ignore if $linecounter > $ignore;
}

# GetToken(string): split off the leading token of a string.
#
# A token is the longest leading run of characters that are none of the
# tspecials [\(\)\<\>\@,;:\\\"\/\[\]\?=\{\}\ \t].
# Returns the two-element list (token, remainder); when the string does not
# start with a token the result is ("", string).
sub GetToken {
    if ($_[0] =~ m#^([^\(\)\<\>\@,;:\\\"\/\[\]\?=\{\}\ \t]+)(.*)$# ) {
        return ("$1", "$2");
    }
    return ("", "$_[0]");
}

# GetRawVersion(version): reduce a detailed version string to its
# "major.minor" prefix.  Squid NOVM versions ("1.NOVM.x") are reduced to
# "1.NOVM".  Anything else is returned unchanged.
sub GetRawVersion {
    my $raw = $_[0];
    return $1 if $raw =~ m#^([0-9]+\.[0-9]+)#;
    # fix for Squid NOVM
    $raw =~ s/^([0-9]+\.NOVM)\..*/$1/oi;
    return $raw;
}

# Check4Proxies(line): account for every proxy named in a user agent line.
#
# The line is scanned from right to left: the product following the last
# " via " is counted, then that " via ..." part is cut off and the scan
# repeats.  Updates $access_old, $proxycounter, %agent_counter,
# %agent_version_counter, %agent_detailed_version_counter, %old_log_style
# and %proxy_level (keyed by the number of hops found in this line).
sub Check4Proxies {
    my $text = $_[0];
    my $hops = 0;
    $access_old++;

    # Bring known non-conforming agent strings into product/version form.
    foreach ($text) {
        # fix CERN-HTTPD, WebTrack-HTTPP, KryptoWall, germany.net
        s/\s+proxy\s+gateway//gi;
        # fix Squid 1.0.x and Harvest 1.x to RFC2068 product rule
        s/\s+Cache\s+version\s+/\//gi;
        # fix NetCache to RFC2068 product rule
        s/\s+version\s+/\//gi;
        # fix DocuMagix HotCargo Express to RFC2068 product rule
        s/DocuMagix\s+HotCargo\s+Express/DocuMagix-HotCargo-Express/gi;
    }

    while ( $text =~ m#\s+via\s+#o ) {
        # everything behind the last " via "
        my $tail = $text;
        $tail =~ s/.*\s+via\s+(.*)/$1/oi;

        # RFC 2068 - 14.32: 1*( product | comment )
        # comment is ignored
        my ($product, $after_product) = &GetToken($tail);
        $product = "Unknown" unless $product;
        $after_product =~ s/^\///;
        my ($full_version) = &GetToken($after_product);
        $full_version = "???" if ($full_version eq "");
        my $short_version = &GetRawVersion($full_version);

        $agent_counter{$product}++;
        $agent_version_counter{"$product $short_version"}++;
        $agent_detailed_version_counter{"$product $full_version"}++;
        $old_log_style{"$product $full_version"}++;

        # we do not want to have any sub products, thus we go for the next
        # entry
        $text =~ s/(.*)\s+via\s+.*/$1/o;
        $hops++;
        $proxycounter++;
    }
    $proxy_level{$hops}++;
}

# GetComment(entry): reduce an entry to the text inside its first
# non-empty pair of parentheses.  Returns "" when the entry contains no
# such parenthesised part.
sub GetComment {
    my $text = $_[0];
    my $replaced = ($text =~ s/[^\(]*\(([^\)]+)\).*/$1/);
    return "" unless $replaced > 0;
    return $text;
}

# ReadLog: read a VIA log ("CustomLog ... %{VIA}i") from the opened LOG
# handle.
#
# Every line counts as one hit ($hits).  Lines starting with "-" (no Via
# header) and blank lines are not proxy hits.  All other lines are split at
# commas; each part is one hop whose parenthesised comment is taken as the
# proxy product.  Updates $access, $proxycounter, %agent_counter,
# %agent_version_counter, %agent_detailed_version_counter and %proxy_level.
# LOG is closed on return.
sub ReadLog {
    my ($agent, $version, $detailed_version, $comment);
    while (<LOG>) {
        $hits++;
        next if /^\-/;
        # chomp, not chop: a last line without newline must keep its final
        # character
        chomp;
        # a line without any entry is no proxy hit (it would otherwise be
        # counted as a hit with 0 hops)
        next if /^\s*$/;
        # RFC 2068 - 14.44: 1#( received-protocol received-by [ comment ] )
        $access++;
        my @entries = split(/,/);
        my $level = scalar(@entries);
        foreach my $entry (@entries) {
            $comment = &GetComment($entry);
            # fix for AOL TurboWeb
            $comment =~ s/AOL\s+TurboWeb/AOL-TurboWeb/g;
            # fix for IBM ICS
            $comment =~ s/IBM\s+ICS/IBM-ICS/g;
            $comment = "Unknown" unless $comment;
            # RFC 2068 - 14.32: 1*( product | comment )
            # comment is ignored
            ($agent,$detailed_version) = &GetToken($comment);
            $agent = "Unknown" unless $agent;
            $detailed_version =~ s/^\///;
            ($detailed_version,$comment) = &GetToken($detailed_version);
            $detailed_version = "???" if ($detailed_version eq "");
            $version = &GetRawVersion($detailed_version);
            $agent_counter{$agent}++;
            $agent_version_counter{"$agent $version"}++;
            $agent_detailed_version_counter{"$agent $detailed_version"}++;
            $proxycounter++;
        }
        $proxy_level{$level}++;
    }
    close(LOG);
}
    
# PrintReport: write the complete HTML report.
#
# Output goes to the OUT handle when $OutputFile is set (opened by
# Initialize on a temporary file "$path/$OutputFile.$$"), otherwise to the
# currently selected handle.  The heading shows the period covered, taken
# from the smallest and largest "yymm" entry of @mmtimes.  After the report
# is written the temporary file is closed and renamed to its final name.
sub PrintReport {
    select(OUT) if $OutputFile;
    my $li = "<li> <a href=\"";

    # Period covered: first and last entry of the sorted "yymm" strings.
    # (Plain string sort; periods from different centuries are not ordered
    # chronologically.)
    my @periods = sort @mmtimes;
    my $first = $periods[0];
    my $last = $periods[$#periods];
    my ($from_yy, $from_mm) = (substr($first,0,2), substr($first,2,2));
    my ($to_yy, $to_mm) = (substr($last,0,2), substr($last,2,2));
    # Use the computed range, not the month/year of the file opened last.
    my $line = "$NumberToMonth{$from_mm} $from_yy";
    $line .= " - $NumberToMonth{$to_mm} $to_yy"
        unless ($from_mm eq $to_mm && $from_yy eq $to_yy);

    print
        "<!doctype HTML public \"-//W3C//DTD HTML 3.2//EN\">\n",
        "<html><head>\n",
        "<title>Proxy Statistics for ", $HTTPDSERVER, "</title>\n",
        "</head>\n",
        "<BODY BGCOLOR=WHITE TEXT=Black TOPMARGIN=10 LEFTMARGIN=15>\n",
        "<a name=\"top\">\n",
        "<TABLE BORDER=0 CELLSPACING=0 CELLPADDING=3 WIDTH=100%>",
        "<TR><TD BGCOLOR=\"#1D098E\" WIDTH=100% align=center>",
        "<FONT COLOR=white SIZE=+2><STRONG>",
        "Proxy Statistics for ", $HTTPDSERVER, "</FONT></TABLE>\n<P>\n",
        "<CENTER><FONT SIZE=+2><B>", $line, "</B></FONT></CENTER>\n<P><HR>\n";
    &PrintSummary;
    print
        "<TABLE BORDER=0>\n<TR>\n<TD VALIGN=TOP><B>Logged Proxies:</B>\n<TD>",
        "<UL>\n",
        "$li#agent\">Summary</a>\n",
        "$li#version\">Summary by version</a>\n",
        "$li#detail\">Summary by detail of version</a>\n",
        "</ul>\n</TABLE>\n";
    &PrintAgent;
    &PrintAgentVersion;
    &PrintDetailedAgentVersion;
    print
        "<p>\n",
        "<TABLE BORDER=0>\n<TR>\n<TD VALIGN=TOP><B><SUP>*</SUP></B>\n<TD>",
        "log style <b>old</B> means, that the ",
        "statistics for the specified proxy are gathered via the user agent ",
        "log file. If no style is mentioned, the data for the specified proxy ",
        "have been extracted from the VIA log file - corresponding to RFC2068.",
        "</TABLE>\n<P>\n",
        "<TABLE Width=100%>\n<TR><TD BGCOLOR=\"#F0F0F0\"><FONT SIZE=-1>\n",
        "Generated with&nbsp;&nbsp;",
        # the URL must stay on one line: a line break inside the literal
        # would end up inside the href
        "<a href=\"http://irb.cs.uni-magdeburg.de/~elkner/webtools/proxycounter.shtml\"><B>",
        $Version,"</B></a>&nbsp;&nbsp;written by&nbsp;&nbsp;",
        "<a href=\"mailto:elkner\@irb.cs.uni-magdeburg.de\">",
        "Jens Elkner</a></FONT></TABLE>\n",
        "</body></html>\n";
    if ( $OutputFile ) {
        select(STDOUT);
        # Close (and thereby flush) the temporary file first; only a
        # completely written report may replace the previous one.
        if (close(OUT)) {
            rename("$path/$OutputFile.$$", "$path/$OutputFile")
                || warn "Can not rename $path/$OutputFile.$$ to $path/$OutputFile: $!\n";
        }
        else {
            warn "Can not write $path/$OutputFile.$$: $!\n";
        }
    }
}

# PrintSummary: print the key figures at the top of the report: time of
# analysis, newest log file modification time (also emitted as an HTML
# comment holding the raw timestamp, which GetLastModTime parses when the
# report is read back), total hits, proxy hits, average hop counts and the
# number of hits per hop count.
sub PrintSummary {
    # numeric sort: the time stamps are numbers, not strings
    my @times = sort { $a <=> $b } @filetimes;
    my $lastmodtime = $times[$#times];
    my $filedate = &ctime($lastmodtime);
    my $date = &ctime(time);
    print
        "<b>Last analyzed:</b> ", $date, "<br>\n",
        "<b>Last log file modification:</b> ", $filedate, "<br>\n",
        "<!-- $lastmodtime -->\n",
        "<BR>\n<b>Total Hits measured:</b> ", $hits, "<br><br>\n",
        "<b>Hits via one or more proxies:</b> ", $access," (",
        &GetPercent($access,$hits), "%)<br>\n";
    # Logs without any proxy hit (or without any hit at all) must not abort
    # the report with a division by zero; the average is then shown as 0.
    printf("<b>Average hops via proxy/proxy hit:</b> %5.2f<br>\n",
           $access ? $proxycounter/$access : 0);
    printf("<b>Average hops via proxy/hit:</b> %5.2f<P>\n",
           $hits ? $proxycounter/$hits : 0);
    foreach my $key (sort LevelsByHits keys(%proxy_level)) {
        print "<B>Hits via proxy with $key hop(s):</B> $proxy_level{$key}<BR>\n";
    }
    print "<P>\n";
}

# GetPercent(part, total): return part as a percentage of total, formatted
# with "%5.2f".  When total is 0 the marker string "99999999999999999" is
# returned instead.
sub GetPercent {
    # $_[0] = relative value, $_[1] = absolute value
    return "99999999999999999" if $_[1] == 0;
    # lexical, so the calculation leaves no trace in a global variable
    my $percent = 100 * $_[0] / $_[1];
    return sprintf("%5.2f", $percent);
}

# PrintAgent: print the "Summary" section: one table row per proxy product
# with its hit count and its share of all proxy hops ($proxycounter),
# ordered by AgentByHits.
sub PrintAgent {
    print
        $newsection, "agent\">Summary</a></h2>\n\n",
        # "cellpadding" spelled out correctly, as in the other two tables
        "<CENTER>\n<table border=$border cellpadding=$cellpadding>\n",
        "<tr><th>Hits <th>Percent <th>Proxy<BR>\n";
    foreach $key (sort AgentByHits keys(%agent_counter)) {
        print
            "<tr><td align=right>", $agent_counter{$key} , " ",
            "<td align=right>", &GetPercent($agent_counter{$key},$proxycounter),
            "% <td align=left>", $key, "<BR>\n";
    }
    print "</table>\n", $return2index, "</CENTER>\n";
}

# PrintAgentVersion: print the "Summary by version" section: one table row
# per "product version" key with its hit count and its share of all proxy
# hops ($proxycounter), ordered by VersionByHits.
sub PrintAgentVersion {
    my $heading = $newsection . "version\">Summary by version</a></h2>\n\n"
        . "<CENTER>\n"
        . "<table border=$border cellpadding=$cellpadding>\n"
        . "<tr><th>Hits <th>Percent <th>Proxy<BR>\n";
    print $heading;

    foreach $key (sort VersionByHits keys(%agent_version_counter)) {
        my $count = $agent_version_counter{$key};
        my $share = &GetPercent($count, $proxycounter);
        print "<tr><td align=right>" . $count
            . "<td align=right>" . $share
            . "% <td>" . $key . "<BR>\n";
    }

    print "</table>\n" . $return2index . "</CENTER>\n";
}

# PrintDetailedAgentVersion: print the "Summary by detail of version"
# section: one table row per "product detailed-version" key with its hit
# count, its share of all proxy hops ($proxycounter) and the log style
# ("old" when the key is present in %old_log_style), ordered by
# DetailedVersionByHits.
sub PrintDetailedAgentVersion {
    local($tmp);
    my $heading = $newsection . "detail\">Summary by detail of version</a></h2>\n\n"
        . "<CENTER>\n"
        . "<table border=$border cellpadding=$cellpadding>\n"
        . "<tr><th>Hits <th>Percent <th>Proxy <th>log style<SUP>*</SUP><BR>\n";
    print $heading;

    foreach $key (sort DetailedVersionByHits keys(%agent_detailed_version_counter)) {
        my $count = $agent_detailed_version_counter{$key};
        my $share = &GetPercent($count, $proxycounter);
        print "<tr><td align=right>" . $count . " "
            . "<td align=right>" . $share
            . "% <td align=left>" . $key . " ";
        if ($old_log_style{$key}) {
            $tmp = "old";
        }
        else {
            $tmp = "&nbsp;";
        }
        print "<td align=left>" . $tmp . "<BR>\n";
    }

    print "</table>\n" . $return2index . "</CENTER>\n";
}

# Sort comparator for keys of %agent_counter: most hits first, keys with
# equal hit counts in ascending string order.
sub AgentByHits {
    $tmp = ($agent_counter{$b} <=> $agent_counter{$a});
    return $tmp || ($a cmp $b);
}
# Sort comparator for keys of %agent_version_counter: most hits first, keys
# with equal hit counts in ascending string order.
sub VersionByHits {
    $tmp = ($agent_version_counter{$b} <=> $agent_version_counter{$a});
    return $tmp || ($a cmp $b);
}
# Sort comparator for keys of %agent_detailed_version_counter: most hits
# first, keys with equal hit counts in ascending string order.
sub DetailedVersionByHits {
    $tmp = ($agent_detailed_version_counter{$b}
            <=> $agent_detailed_version_counter{$a});
    return $tmp || ($a cmp $b);
}
# Sort comparator for keys of %proxy_level (hop counts): most hits first.
# Hop counts with equal hits are ordered numerically, so that 2 hops are
# listed before 10 hops.
sub LevelsByHits {
    return ($proxy_level{$b} <=> $proxy_level{$a}) || ($a <=> $b);
}

# GetLastModTime: read on from LOG up to the first line holding an HTML
# comment that consists of a number ("<!-- 876916800 -->").  That number is
# taken as the modification time of the logs behind the stats file: it
# replaces the last entry of @filetimes, and the "yymm" derived from it
# replaces the last entry of @mmtimes.  Sets the globals $filetime, $mm
# and $yy.  Reading stops right after that line.
sub GetLastModTime {
    while (<LOG>) {
        if (m#<\!--\s*(\d+)\s*-->#) {
            $filetime = $1;
            pop @filetimes;
            push @filetimes, $filetime;

            # month (0..11) and year (years since 1900) of the time stamp,
            # turned into two-character strings
            ($mm,$yy) = (localtime($filetime))[4,5];
            $mm += 1;
            if ( $mm < 10 ) {
                $mm = "0$mm";
            }
            if ($yy > 99) {
                $yy -= 100;
            }
            if ( $yy < 10 ) {
                $yy = "0$yy";
            }

            pop @mmtimes;
            push @mmtimes, "${yy}${mm}";
            last;
        }
    }
}

# GetVitals: read the summary figures of an earlier report from LOG and add
# them to the running totals: total hits to $hits, proxy hits to $access
# and the hits per hop count to %proxy_level.  Reading stops at the line
# containing "Logged Proxies".
sub GetVitals {
    while (<LOG>) {
        if (/Logged\s+Proxies/oi) {
            last;
        }
        if (/Total\s+Hits\s+measured:<\/b>\s*(\d+)/oi) {
            $hits += $1;
        }
        if (/Hits\s+via\s+one\s+or\s+more\s+proxies:<\/b>\s*(\d+)/oi) {
            $access += $1;
        }
        if (/Hits\s+via\s+proxy\s+with\s+(\d+)\s+hop\(s\):<\/b>\s*(\d+)/oi) {
            $proxy_level{$1} += $2;
        }
    }
}

# ReadOldLog: merge an earlier HTML report, read from the opened LOG
# handle, into the running totals.  The sections are recognised by their
# anchors ("top", "agent", "version", "detail") and handed to the matching
# reader.  Reading ends after the "detail" section or at "</html>".
# LOG is closed on return.
#
# Each reader consumes further lines from LOG and overwrites $_, so the
# checks below are independent "if" statements that test whatever line is
# current at that point.
sub ReadOldLog {
    while (<LOG>) {
        # chomp, not chop: a last line without newline keeps its final ">"
        chomp;
        last if (m#</html>#oi); # check for end of html file
        if (/<a name=\"top\">/oi) {
            &GetLastModTime;
            &GetVitals;
        }
        if (/<a name=\"agent\">/oi) {
            &ReadOldAgent;
        }
        if (/<a name=\"version\">/oi) {
            &ReadOldVersion;
        }
        if (/<a name=\"detail\">/oi) {
            &ReadOldDetailedVersion;
            last;
        }
    }
    close(LOG);
}

# ReadOldAgent: read the rows of the "Summary" table of an earlier report
# from LOG, up to the closing </table>.  For every row with a hit count and
# a proxy name the count is added to $proxycounter and to
# $agent_counter{name}.
sub ReadOldAgent {
    while (<LOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # row layout: hits, percent, proxy name
        my ($count, $name) =
            /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)/oi;
        next unless $count && $name;
        $proxycounter += $count;
        $agent_counter{$name} += $count;
    }
}

# ReadOldVersion: read the rows of the "Summary by version" table of an
# earlier report from LOG, up to the closing </table>.  For every row with
# a hit count and a "product version" name the count is added to
# $agent_version_counter{name}.
sub ReadOldVersion {
    while (<LOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # row layout: hits, percent, product and version
        my ($count, $name) =
            /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)/oi;
        next unless $count && $name;
        $agent_version_counter{$name} += $count;
    }
}

# ReadOldDetailedVersion: read the rows of the "Summary by detail of
# version" table of an earlier report from LOG, up to the closing </table>.
# For every row with a hit count and a name the count is added to
# $agent_detailed_version_counter{name}; when the log style column contains
# "old" it is added to $old_log_style{name} as well.
sub ReadOldDetailedVersion {
    while (<LOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # row layout: hits, percent, product and detailed version, log style
        my ($count, $name, $style) =
            /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)<[^>]+>([^<]+)/oi;
        # drop the white space around the name
        $name =~ s/\s*(.*\S+)\s*$/$1/;
        if ( $name && $count ) {
            $agent_detailed_version_counter{$name} += $count;
        }
        if ($style =~ m#old#oi) {
            $old_log_style{"$name"} += $count;
        }
    }
}
	
# Version: print the program version ($Version) and terminate via die.
sub Version {
    die "This is $Version.\n",
        "\n",
        "It is Jens Elkner's modified version of browsercounter 1.2.1.\n";
}

# Usage: print the command line help and terminate via die.
# The option list matches the option string given to Getopts in Initialize
# ('hvtN:i:a:o:p:n:').
sub Usage {
    die <<"EndUsage";
Process a sequence of proxy log files and output an HTML summary.
(Apache Directive: "CustomLog logs/pfile %{VIA}i").

Usage: proxycounter [-h] [-v] [-t] [-N HTTPD-Server-Name] [-p path]
                    [-i fList] [-a fList] [-n count] [-o afile]
                    [logfile ...] [logfile.gz ...] [logfile.Z ...]

Options:
  -h            Display this message and quit.
  -v            Display version 
  -t            Set type of the Logfile to combined (default = agent). Ignored,
                if option -a fList is not used.
  -N name       HTTPD Server name for report
  -i fList      List of html proxy stat files (afiles) to include
  -a fList      List of user agent log files to include (non HTTP1/1 compliant 
                proxies append their productname/version to the user agent
                header line - they do NOT use the VIA header line).
  -n count      Number of leading lines to skip in each user agent log file
                (default = 0)
  -o afile      Output file (default = Standard Output)
  -p path       System directory, where to store generated files

Terms: fList ... comma separated list of files WITHOUT any whitespaces

EndUsage
}

# Initialize: set up the month name table and evaluate the command line.
#
# Options (parsed by Getopts into $opt_x variables):
#   -h / parse error  print the usage text and quit
#   -v                print the version and quit
#   -i list           comma separated stats files      -> @OldLogs
#   -a list           comma separated agent log files  -> @AgentLogs
#   -p path           output directory                 -> $path (default ".")
#   -N name           server name                      -> $HTTPDSERVER
#   -t                log type "combined"              -> $LogType
#   -n count          lines to skip in agent logs      -> $ignore (default 0)
#   -o file           output file                      -> $OutputFile
# With -o (other than "-") the report is written to the temporary file
# "$path/$OutputFile.$$", opened here on the OUT handle; PrintReport renames
# it to its final name.
sub Initialize {
    %NumberToMonth=(
                    '01','January',
                    '02','February',
                    '03','March',
                    '04','April',
                    '05','May',
                    '06','June',
                    '07','July',
                    '08','August',
                    '09','September',
                    '10','October',
                    '11','November',
                    '12','December',
                    );
    $result = &Getopts('hvtN:i:a:o:p:n:');
    &Usage if  $opt_h || $result == 0;
    &Version if $opt_v;
    if ($opt_i) {
        @OldLogs = split(",",$opt_i);
    }
    if ($opt_a) {
        @AgentLogs = split(",",$opt_a);
    }
    $path = $opt_p ? $opt_p : ".";
    $HTTPDSERVER=$opt_N if $opt_N;
    $LogType="combined" if ($opt_t);
    $ignore = $opt_n ? $opt_n : 0;
    if ( $opt_o && ( $opt_o ne '-') ) {
        $OutputFile =  $opt_o;
        # the message names the file that is really opened (the temporary one)
        open(OUT,">$path/$OutputFile.$$") || die "Can not open $path/$OutputFile.$$: $!\n";
    }
}
