#!/usr/local/bin/perl

# $Id: agentcounter.pl,v 1.4 1997/07/16 02:15:47 elkner Exp elkner $

# agentcounter: Program to generate statistics from Squid's useragent log
#               files, i.e. about the user agents accessing a Squid proxy server.

# Derived from Jens Elkner's BrowserCounter 1.2.2-E
# ( http://irb.cs.uni-magdeburg.de/~elkner/webtools/ ) which is
# Derived from Benjamin Franz' BrowserCounter 1.2.1.
# ( http://www.netimages.com/~snowhare/utilities/browsercounter.html )

# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS 
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE 
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE.
#
# I offer it to the public domain and I ask, however, that this paragraph
# and my name be retained in any modified versions of the file you may
# make, and that you notify me of any improvements you make to the code.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any 
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 
# SPECIAL, INCIDENTAL,  OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 
# USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT 
# LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE

require "ctime.pl";
require "getopts.pl";

# Program identification; shown by -v and linked in the report footer.
$Version = 'AgentCounter 1.2.2';

# Command line used to decompress .gz/.Z log files to standard output.
$Zcat = '/usr/local/bin/gzip -cd';

# Type of log file being parsed ('agent','combined').
# Not referenced anywhere else in this file.
$LogType = 'agent';

# Default server name for the report title (overridden by -N).
$HTTPDSERVER = 'proxycache.cs.uni-magdeburg.de';

# HTML table attributes: cell padding and border width.
$cellpadding = 2;
$border      = 2;

# HTML fragments shared by all report sections.
$newsection   = qq{<p><hr><p>\n<h2><a name="};
$return2index = qq{<p><a href="#top">Top</a>\n};

# Number of input files that could be opened so far.
$filecounter = 0;

# Encoded timestamps of the first (@start) and last (@stop) entry per input.
@start = ();
@stop  = ();

# Main program: parse the options, merge old reports (-i), read the log
# files named on the command line, then analyze and print the report.
&Initialize();

# Statistics from previously generated reports.
foreach my $oldreport (@Includes) {
    my $name = $oldreport;      # copy: OpenFile receives an alias to the list element
    &ReadOld($name) if &OpenFile($oldreport);
}

# Fresh log files.
foreach my $logfile (@ARGV) {
    &ReadLog() if &OpenFile($logfile);
}

if ($filecounter > 0) {
    &AnalyzeAgents();
    &PrintReport();
}
else {
    warn "No files found for analyzing!\n";
    # Initialize already created the temporary output file when -o was
    # given; do not leave it behind when there is nothing to report.
    if ($OutputFile) {
        close(OUT);
        unlink("$OutputFile.$$");
    }
}


# --------------------------------- That's it --------------------------------
# OpenFile(name): open the named input on the global AGENTLOG handle.
#
# Names ending in .gz or .Z are read through the decompressor configured
# in $Zcat, "-" means standard input, anything else is opened as a plain
# file.  The argument is no longer modified (it usually aliases an
# element of @ARGV or @Includes).
#
# Returns 1 and increments $filecounter on success; warns and returns 0
# on failure.
sub OpenFile {
    my $file = $_[0];
    my $opened;

    if ($file =~ /\.(?:gz|Z)$/) {
        # List-form pipe open: the file name is handed to the decompressor
        # as a single argument, so spaces or shell metacharacters in the
        # name cannot break (or inject into) the command line.
        my @command = split(' ', $Zcat);
        $opened = open(AGENTLOG, '-|', @command, $file);
    }
    elsif ($file eq '-') {
        # Keep the two-argument open convention that "-" reads STDIN.
        $opened = open(AGENTLOG, '<&', \*STDIN);
    }
    else {
        $opened = open(AGENTLOG, '<', $file);
    }

    if ($opened) {
        $filecounter++;
        return 1;
    }
    warn "Can't open $file: $!\n";
    return 0;
}

# DeSpoofAgent: rewrite the global $Agent in place so that the real
# browser name precedes the identity it is spoofing.  Takes no arguments.
sub DeSpoofAgent {
    # WebTV announces itself as the second token, after the spoofed name.
    $Agent = "$1 spoofing as $Agent"
        if $Agent =~ m{^\S+\s+(WebTV/\S+)};

    # Browsers using the pseudo-standard "Mozilla (compatible; Real Name ...)".
    if ($Agent =~ m{^Mozilla.*\(compatible; *([^;)]+)}i) {
        ($spoofer = $1) =~ tr{/}{-};    # slashes become dashes
        $spoofer =~ s/\W+$//;           # trim trailing non-word characters
        $Agent = join(' spoofing as ', $spoofer, $Agent);
    }
}
    
# ReadLog: read the useragent log currently open on AGENTLOG.
#
# Lines are expected to look like:   host [timestamp] "agent string"
# Every line read is counted in $refscounter and its (de-spoofed,
# HTML-escaped) agent string is tallied in %rawagents; lines that do not
# parse are tallied as "Unknown".  The first and last parseable
# timestamps of the file are appended to @start and @stop.
# Closes AGENTLOG when done.
sub ReadLog {
    my ($first_stamp, $last_stamp);

    while (my $line = <AGENTLOG>) {
        $refscounter++;
        chomp $line;
        # Collapse whitespace runs so the single-space pattern below matches.
        $line =~ s/\s+/ /g;

        my $stamp;
        ($stamp, $Agent) = $line =~ /^\S+ \[([^\]\[]+)\] "([^"]*)"/;

        # Only parseable lines may define the period covered; an
        # unparsable first or last line must not clobber it.
        if (defined $stamp) {
            $first_stamp = $stamp unless defined $first_stamp;
            $last_stamp  = $stamp;
        }

        $Agent = '' unless defined $Agent;
        # Strip leading chr(0) before the '^'-anchored de-spoofing patterns.
        $Agent =~ s/^\x00+//;
        &DeSpoofAgent();

        # Lets not let children play with dangerous toys...
        # '&' has to be escaped first, otherwise the entities produced
        # for '<', '>' and '"' would be escaped a second time.
        $Agent =~ s/&/&amp;/g;
        $Agent =~ s/</&lt;/g;
        $Agent =~ s/>/&gt;/g;
        $Agent =~ s/"/&quot;/g;

        $Agent = "Unknown" if !$Agent;
        $rawagents{$Agent}++;
    }
    close(AGENTLOG);

    # An empty or completely unparsable file contributes no period.
    if (defined $first_stamp) {
        my $from  = &Date2String($first_stamp);
        my $until = &Date2String($last_stamp);
        push(@start, $from);
        push(@stop,  $until);
    }
}

# AnalyzeAgents: aggregate the per-agent hit counts in %rawagents into
#   %agentgroup    keyed by browser name,
#   %agentversion  keyed by "name version",
#   %baseagent     keyed by the agent string up to the first '(' or '['.
sub AnalyzeAgents {
    foreach $agent (keys %rawagents) {
        my $hits = $rawagents{$agent};

        # Everything in front of the first parenthesis/bracket, right-trimmed.
        $longagent = $agent;
        ($base) = ($longagent =~ m#^([^\(\[]+)#);
        $base =~ s#\s+$##;

        # "Name/1.23", "Name 1.23" or "Name v1.23"; otherwise fall back to
        # the first whitespace-separated word of the base string.
        ($name, $version) = ($base =~ m#^([^\d\/]+)[\s\/vV]+(\d[\.\d]+)#);
        ($name) = split(/\s+/, $base)
            if $name eq "" || $name =~ /^\ +/;

        $agentgroup{$name}              += $hits;
        $agentversion{"$name $version"} += $hits;
        $baseagent{$base}               += $hits;
    }
}
    
# PrintReport: write the complete HTML report.
#
# Output goes to the OUT handle when $OutputFile is set (the temporary
# file "$OutputFile.$$" opened by Initialize), otherwise to STDOUT.  After
# the report is written the temporary file is closed and renamed to
# $OutputFile.  Dies if the file cannot be completed or renamed.
sub PrintReport {
    my $li = "<li> <a href=\"";

    select(OUT) if $OutputFile;
    print
        "<!doctype HTML public \"-//W3C//DTD HTML 3.2//EN\">\n",
        "<html><head>\n",
        "<title>WWW Agent Statistics for ", $HTTPDSERVER, "</title>\n",
        "</head>\n",
        # The "top" anchor is closed so it does not swallow the page body.
        "<body>\n<a name=\"top\"></a>\n",
        "<h1 align=center>WWW Agent Statistics for ",$HTTPDSERVER,"</h1>\n",
        "<HR>\n";
    &PrintSummary();
    print
        "<ul>\n",
        "$li#broad\">Summary</a>\n",
        "$li#version\">Summary by version</a>\n",
        "$li#detail\">Summary by fine detail of version</a>\n",
        "$li#complete\">Detailed report</a>\n",
        "</ul>\n";
    &PrintBroadVersion();
    &PrintVersion();
    &PrintDetailedVersion();
    &PrintComplete();

    my $date = localtime(time);     # scalar context: human-readable date string
    print
        "<p><hr>\n<TABLE BORDER=0 WIDTH=100%>\n<TR>\n<TD>",
        "<FONT SIZE=-1>Generated by \n",
        "<a href=\"http://irb.cs.uni-magdeburg.de/~elkner/webtools/agentcounter.shtml\">",
        $Version,"</a>\n",
        "by <a href=\"http://irb.cs.uni-magdeburg.de/~elkner/\">Jens Elkner</A></FONT>\n",
        "<TD ALIGN=RIGHT><FONT SIZE=-1>Report Last Modified: $date</FONT>\n",
        "</TABLE></BODY>\n</HTML>\n";

    select(STDOUT);
    if ($OutputFile) {
        # Flush and close before renaming so that write errors are noticed
        # and the final file is never a partially written report.
        close(OUT)
            || die "Can not finish writing $OutputFile.$$: $!\n";
        rename("$OutputFile.$$", $OutputFile)
            || die "Can not rename $OutputFile.$$ to $OutputFile: $!\n";
    }
}

# PrintSummary: print the "Period Covered" and total hit count lines.
#
# @start and @stop hold timestamps encoded as "yyyy,mm,dd,time,offset";
# the earliest start and the latest stop define the period.  Entries that
# do not begin with a digit (results of unparsable dates) are ignored, and
# a missing boundary is reported as "unknown".
sub PrintSummary {
    my $describe = sub {
        my ($stamp) = @_;
        return 'unknown' unless defined $stamp;
        my ($yy, $mm, $dd, $time, $offset) = split(/,/, $stamp);
        return "$dd/$Number2Month{$mm}/$yy $time $offset";
    };

    # The encoding sorts chronologically as plain strings.
    my @begins = sort grep { defined($_) && /^\d/ } @start;
    my @ends   = sort grep { defined($_) && /^\d/ } @stop;

    my $period = $describe->($begins[0])
               . '&nbsp;&nbsp;<B>to</B>&nbsp;&nbsp;'
               . $describe->($ends[-1]);
    my $hits = defined $refscounter ? $refscounter : 0;

    print
        "<P>\n<b>Period Covered: </b>", $period, "<br>\n",
        "<b>Web Browser Hits measured:</b> ", $hits, "<br>\n",
        "<P>\n";
}

# GetPercent(part, total): return part as a percentage of total,
# formatted with "%5.2f".  A missing or zero total yields " 0.00"
# instead of a fatal division by zero.
sub GetPercent {
    my ($part, $total) = @_;
    return sprintf("%5.2f", 0) unless defined $total && $total != 0;
    return sprintf("%5.2f", 100 * $part / $total);
}

# PrintBroadVersion: print the "Summary" table, one row per browser name
# from %agentgroup, ordered by AgentByHits.
sub PrintBroadVersion {
    print $newsection . "broad\">Summary</a></h2>"
        . "<table border=$border cols=3 cellpadding=$cellpadding>"
        . "<tr><th>Hits <th>Percent <th>Browser<br>\n";

    for my $browser (sort AgentByHits keys(%agentgroup)) {
        my $hits  = $agentgroup{$browser};
        my $share = &GetPercent($hits, $refscounter);
        print "<tr><td align=right>$hits <td align=right>$share %<td align=left>$browser<br>\n";
    }

    print "</table>\n" . $return2index;
}

# PrintVersion: print the "Summary by version" table, one row per
# "name version" key of %agentversion, ordered by VersionByHits.
sub PrintVersion {
    my $row = "<tr><td align=right>%s <td align=right>%s %%<td align=left>%s<br>\n";

    print join('',
        $newsection, "version\">Summary by version</a></h2>\n\n",
        "<table border=$border cellpadding=$cellpadding>\n",
        "<tr><th>Hits <th>Percent <th>Browser<br>\n");

    for my $entry (sort VersionByHits keys(%agentversion)) {
        printf($row,
               $agentversion{$entry},
               &GetPercent($agentversion{$entry}, $refscounter),
               $entry);
    }

    print join('', "</table>\n", $return2index);
}

# PrintDetailedVersion: print the "Summary by fine detail of version"
# table, one row per key of %baseagent, ordered by BaseByHits.
sub PrintDetailedVersion {
    my $heading = $newsection
                . "detail\">Summary by fine detail of version</a></h2>\n\n"
                . "<table border=$border cellpadding=$cellpadding>"
                . "<tr><th>Hits <th>Percent <th>Browser<br>\n";
    print $heading;

    foreach my $base (sort BaseByHits keys(%baseagent)) {
        my $count = $baseagent{$base};
        print "<tr><td align=right>" . $count . " "
            . "<td align=right>" . &GetPercent($count, $refscounter) . " "
            . "%<td align=left>" . $base . "<br>\n";
    }

    print "</table>\n" . $return2index;
}

# PrintComplete: print the "Detailed report" table, one row per full agent
# string in %rawagents, in plain string sort order.  ReadOldStats parses
# this table when an old report is merged.
sub PrintComplete {
    my @agents = sort keys(%rawagents);

    print qq{${newsection}complete">Detailed report</a></h2>\n\n},
          qq{<table border=$border cellpadding=$cellpadding>\n},
          qq{<tr><th>Hits <th>Percent <th>Browser<br>\n};

    for my $agent (@agents) {
        my $hits    = $rawagents{$agent};
        my $percent = &GetPercent($hits, $refscounter);
        print qq{<tr><td align=right>$hits <td align=right>$percent %<td align=left>$agent<br>\n};
    }

    print qq{</table>\n$return2index};
}

# Sort comparator: %agentgroup keys by descending hit count, ties by name.
sub AgentByHits {
    $tmp = ($agentgroup{$b} <=> $agentgroup{$a});
    return $tmp || ($a cmp $b);
}
# Sort comparator: %agentversion keys by descending hit count, ties by name.
sub VersionByHits {
    $tmp = ($agentversion{$b} <=> $agentversion{$a});
    return $tmp || ($a cmp $b);
}
# Sort comparator: %baseagent keys by descending hit count, ties by name.
sub BaseByHits {
    $tmp = ($baseagent{$b} <=> $baseagent{$a});
    return $tmp || ($a cmp $b);
}

# Date2String(text): find the first timestamp in text and return it in the
# sortable form "yyyy,mm,dd,hh:mm:ss,offset".
#
# Both the log format  "16/Jul/1997:02:15:47 +0200"  and the report
# format  "16/Jul/1997 02:15:47 +0200"  are recognized; the month number
# is looked up in %Month2Number.  Returns the empty string when text
# contains no timestamp (previously the stale result of an earlier match
# was used).  Neither $_ nor the argument is modified.
sub Date2String {
    my $text = defined $_[0] ? $_[0] : '';

    my ($dd, $mon, $yy, $time, $offset) = $text =~
        m{(\d+)/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/(\d+)[: ](\d+:\d+:\d+)\s+([+-]\d+)};
    return '' unless defined $dd;

    return "$yy,$Month2Number{$mon},$dd,$time,$offset";
}

# ReadOld: read a previously generated report from AGENTLOG.
#
# The "Period Covered" line contributes its two timestamps to @start and
# @stop; once the "complete" anchor is found, the table that follows is
# merged by ReadOldStats and reading stops.  The argument passed by the
# caller is not used.  Closes AGENTLOG when done.
sub ReadOld {
    while (my $line = <AGENTLOG>) {
        chomp $line;
        last if $line =~ m#</html>#i;           # end of html file

        if ($line =~ /Period\s+Covered/i) {     # old period covered
            # Text before and after the first "to"; $until stays
            # undefined when the line contains no "to" at all.
            my ($from, $until) = split(/to/, $line, 2);
            if (defined $until) {
                my $begin = &Date2String($from);
                my $end   = &Date2String($until);
                push(@start, $begin) if defined $begin && length $begin;
                push(@stop,  $end)   if defined $end   && length $end;
            }
        }

        if ($line =~ /<a name=\"complete\">/i) {
            &ReadOldStats();
            last;
        }
    }
    close(AGENTLOG);
}

# ReadOldStats: merge the "Detailed report" table of an old report, read
# from AGENTLOG, into %rawagents and $refscounter.
#
# Reading stops at the closing </table>.  Only rows that really match the
# layout  <tr><td>HITS <td>PERCENT %<td>AGENT<br>  are counted; a row
# that fails to match is skipped instead of re-adding the captures left
# over from the previous row.
sub ReadOldStats {
    while (my $row = <AGENTLOG>) {
        last if $row =~ m#</table>#i;
        next unless $row =~ m#<tr>#;
        next unless $row =~
            m#<tr><[^>]+>\s*(\d+)\s*<[^>]+>.*<[^>]+>\s*(.*?)\s*<br>\s*$#i;

        my ($hits, $agent) = ($1, $2);
        next unless length $agent;

        $rawagents{$agent} += $hits;
        $refscounter       += $hits;
    }
}
	
# Version: terminate the program (via die) with the version banner.
sub Version {
    die "This is $Version.\n"
      . "\n"
      . "It is Jens Elkner's modified version of browsercounter 1.2.2-E.\n";
}

# Usage: terminate the program (via die) with the command line help text.
sub Usage {
    my @help = (
        'Process a sequence of Squids useragent log files and output an HTML summary. ',
        '',
        'Usage: agentcounter [-h] [-v] [-N Proxy-Server-Name] [-i fileList] [-o afile]',
        '                    [logfile ...] [logfile.gz ...] [logfile.Z ...]',
        '',
        'Options:',
        '  -h            Display this message and quit.',
        '  -v            Display version ',
        '  -N name       Proxy Server name for report',
        '  -i fileList   a comma separated list of old statfiles files for inclusion ',
        '  -o afile      Output file (default = Standard Output)',
    );
    die join("\n", @help), "\n";
}

# Initialize: set up the month lookup tables and process the command line.
#
#   -h       print the usage text and quit (also on an option error)
#   -v       print the version and quit
#   -N name  sets $HTTPDSERVER
#   -i list  comma separated list of old reports, stored in @Includes
#   -o file  sets $OutputFile and opens the temporary file
#            "$OutputFile.$$" on the OUT handle ("-" means standard output)
#
# Dies if the temporary output file cannot be created.
sub Initialize {
    %Number2Month = (
        '01', 'Jan',  '02', 'Feb',  '03', 'Mar',  '04', 'Apr',
        '05', 'May',  '06', 'Jun',  '07', 'Jul',  '08', 'Aug',
        '09', 'Sep',  '10', 'Oct',  '11', 'Nov',  '12', 'Dec',
    );
    # The inverse table is derived so the two can never disagree.
    %Month2Number = reverse %Number2Month;

    my $result = &Getopts('hi:N:o:v');
    &Usage()   if $opt_h || !$result;
    &Version() if $opt_v;

    $HTTPDSERVER = $opt_N if $opt_N;
    @Includes = split(/,/, $opt_i) if $opt_i;

    if ($opt_o && ($opt_o ne '-')) {
        $OutputFile = $opt_o;
        # Three-argument open: the name is taken literally, whatever
        # characters or surrounding whitespace it contains.
        open(OUT, '>', "$OutputFile.$$")
            || die "Can not open $OutputFile.$$: $!\n";
    }
}
