#!/usr/local/bin/perl

# $Id: browsercounter.pl,v 1.7 1997/10/12 08:33:56 elkner Exp $

# browsercounter: Program to create statistics from log files about WWW Agents 
#                 accessing WWW server. 

# written by Jens Elkner (elkner@irb.cs.uni-magdeburg.de)

# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS 
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE 
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE.
#
# I offer it to the public domain and I ask, however, that this paragraph
# and my name be retained in any modified versions of the file you may
# make, and that you notify me of any improvements you make to the code.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any 
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, 
# SPECIAL, INCIDENTAL,  OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 
# USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT 
# LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE

require "ctime.pl";
require "getopts.pl";

# ------------------------------ Configuration -------------------------------
# Program name and release; printed in the report footer and by -v.
$Version = "BrowserCounter 1.3-E";

# Command that writes a decompressed .gz/.Z file to standard output.
$Zcat = "/usr/local/bin/gzip -cd";

# Type of log file being parsed ('agent','combined'); -t selects 'combined'.
$LogType = 'agent';

# Name of server shown in the report title; -N overrides it.
$HTTPDSERVER = 'www.cs.uni-magdeburg.de';

# Cell padding and border size of the generated tables.
$cellpadding = 2;
$border = 2;

# HTML fragments shared by the report sections.
$newsection = "<p><hr><p>\n<h2><a name=\"";
$return2index = "<p><a href=\"#top\">Top</a>\n";

# Number of input files that could be opened.
$filecounter = 0;

# --------------------------------- Driver -----------------------------------
# The defaults above must be in place before the command line is evaluated.
&Initialize;

# First merge the counters of previously generated reports (-i) ...
foreach my $statsfile (@Includes) {
    $tmp = $statsfile;
    &ReadOld($tmp) if &OpenFile($statsfile);
}

# ... then count the agents found in the log files named on the command line.
foreach my $logfile (@ARGV) {
    &ReadLog if &OpenFile($logfile);
}

if ($filecounter <= 0) {
    warn "No files found for analyzing!\n";
}
else {
    &PrintReport;
}


# --------------------------------- That's it --------------------------------
# OpenFile(name): open a log or statistics file for reading on AGENTLOG.
#
# Names ending in .gz or .Z are read through the $Zcat command, everything
# else is opened directly.  On success the file's modification time is
# appended to @filetimes, its month ("yymm") to @mmtimes, $filecounter is
# incremented and 1 is returned.  On failure a warning is printed, nothing
# is recorded and 0 is returned.  The caller's argument is never modified.
sub OpenFile {
    my $file = $_[0];
    my ($mode, $target);

    # Stop right here for unreadable files: carrying on would record the
    # $filetime of the previous file and, for compressed names, count a pipe
    # to a failing decompressor as a successfully opened log.
    unless ( -r $file ) {
        warn "$file is not readable: $!\n";
        return 0;
    }
    $filetime = (stat($file))[9];

    # Default month: taken from the modification time.  For a stats file
    # these entries are replaced later with the time stored inside the file.
    ($mm,$yy) = (localtime($filetime))[4,5];
    $mm = sprintf("%02d", $mm + 1);
    $yy = sprintf("%02d", $yy % 100);

    if ($file =~ m/\.(gz|Z)$/) {
        # Compressed logs are assumed to be rotated logs that carry their
        # month in the name (e.g. agent_log.9601.gz); the last run of four
        # digits is taken as yymm.  Names without a plausible month keep the
        # modification-time default instead of producing a garbage label.
        if ($file =~ m/.*(\d\d)(\d\d)/ && $2 >= 1 && $2 <= 12) {
            ($yy,$mm) = ($1,$2);
        }
        # The command line is run by the shell, so quote the file name.
        (my $quoted = $file) =~ s/'/'\\''/g;
        ($mode, $target) = ('-|', "$Zcat '$quoted'");
    }
    else {
        # Explicit read mode: the name is never interpreted as a mode or pipe.
        ($mode, $target) = ('<', $file);
    }

    unless (open(AGENTLOG, $mode, $target)) {
        warn "Can't open $file: $!\n";
        return 0;
    }
    push @filetimes, $filetime;
    push @mmtimes, "${yy}${mm}";
    $filecounter++;
    return 1;
}

# ReadLog: read the log opened on AGENTLOG line by line, extract the user
# agent string according to $LogType and hand it to AnalyzeAgent.  Every
# line read counts as one hit in $refscounter.  Closes AGENTLOG when done.
sub ReadLog {
    my $entry;
    while ($entry = <AGENTLOG>) {
        $refscounter++;
        chomp $entry;
        # collapse every run of whitespace into a single blank
        $entry =~ s#\s+# #go;

        my $agent;
        if ($LogType eq 'combined') {
            # the user agent is the ninth field (last quoted string)
            $agent = ($entry =~ /^(\S+) (\S+) (\S+) \[([^\]\[]+)\] \"([^\"]*)\" (\S+) (\S+) \"([^\"]*)\" \"([^\"]*)\"/o)[8];
        }
        elsif ($LogType eq 'agent') {
            # agent logs contain nothing but the user agent string
            $agent = $entry;
        }

        # strip leading chr(0)
        $agent =~ s/^\x0*//;
        &AnalyzeAgent($agent);
    }
    close(AGENTLOG);
}

# GetToken(string): split off the leading HTTP token.
#
# A token is the longest leading run of characters that are not tspecials
#   [\(\)\<\>\@,;:\\\"\/\[\]\?=\{\}\ \t]
# Returns the list (token, remainder).  If the string does not start with a
# token, the token is "" and the remainder is the unchanged string.
sub GetToken {
    my $input = $_[0];
    if ($input =~ m#^([^\(\)\<\>\@,;:\\\"\/\[\]\?=\{\}\ \t]+)(.*)$# ) {
        return ("$1", "$2");
    }
    return ("", "$input");
}

# GetComment(string): return the text inside the first "(...)" group that is
# preceded by at least one other character.  A string without such a group
# is returned unchanged.
sub GetComment {
    (my $text = $_[0]) =~ s/[^\(]+\(([^\)]+)\).*/$1/;
    return $text;
}

# GetRawVersion(version): reduce a detailed version such as "4.0b1" to its
# leading "major.minor" part ("4.0").  Anything that does not start with
# digits, a dot and digits is returned as is.
sub GetRawVersion {
    my $detailed = $_[0];
    return $1 if $detailed =~ m#^([0-9]+\.[0-9]+)#;
    return $detailed;
}

# AnalyzeAgent(user_agent): count one hit for the browser named in the given
# User-Agent string.
#
# The leading product ("name/version") is parsed first.  If the following
# comment announces another product as "(compatible; NAME VERSION; ...)",
# the hit is counted for that product and the leading product is remembered
# in %spoofer_value as what it was "spoofing as".  Updates %agent_counter,
# %agent_version_counter and %agent_detailed_version_counter.
sub AnalyzeAgent {
    my $raw = $_[0];

    # RFC 2068 - 14.32: 1*( product | comment )
    my ($name, $rest) = &GetToken($raw);
    $name = "Unknown" unless $name;
    $rest =~ s/^\///;
    my ($detailed, $tail) = &GetToken($rest);
    $detailed = "???" if ($detailed eq "");
    my $short = &GetRawVersion($detailed);

    # Sub products are ignored; only the comment is inspected for the
    # pseudo-'standard' of 'compatible', as in
    # (compatible; Opera/2.12; Windows 95)
    my $comment = &GetComment($tail);
    unless ($comment =~ m#compatible;\s*([^;)]+);#oi ) {
        # an ordinary browser: count it under its own name
        $agent_counter{$name}++;
        $agent_version_counter{"$name $short"}++;
        $agent_detailed_version_counter{"$name $detailed"}++;
        return;
    }

    my $real = $1;
    $real =~ s/MSIE\ /MSIE\//;    # need to fix MSIE notation to product
                                  # (compatible; MSIE 4.0b1; Windows 95)
    my ($real_name, $real_rest) = &GetToken($real);
    $real_name = "Unknown" unless $real_name;
    $real_rest =~ s/^\///;
    my ($real_detailed) = &GetToken($real_rest);
    $real_detailed = "???" if ($real_detailed eq "");
    my $real_short = &GetRawVersion($real_detailed);

    $agent_counter{$real_name}++;
    $agent_version_counter{"$real_name $real_short"}++;
    $agent_detailed_version_counter{"$real_name $real_detailed"}++;
    $spoofer_value{"$real_name $real_detailed"} = "$name $short";
}
    
# PrintReport: write the complete HTML report.
#
# Output goes to the OUT handle when $OutputFile is set (OUT is expected to
# be open on the temporary file "$path/$OutputFile.$$"), otherwise to the
# currently selected handle.  The temporary file replaces the real output
# file only after it has been closed successfully.
sub PrintReport {
    select(OUT) if $OutputFile;
    my $li = "<li> <a href=\"";

    # Sort the "yymm" labels chronologically.  A plain string sort would put
    # 00.. before 99, so two-digit years below 70 are taken as 20yy.
    my @months = sort {
        (substr($a,0,2) < 70 ? "20" : "19") . $a
            cmp
        (substr($b,0,2) < 70 ? "20" : "19") . $b
    } @mmtimes;
    my ($first, $last) = ($months[0], $months[$#months]);

    # Covered period: a single month, or "first - last" when they differ.
    my $period = $NumberToMonth{substr($first,2,2)} . " " . substr($first,0,2);
    if ( $last ne $first ) {
        $period .= " - " . $NumberToMonth{substr($last,2,2)} . " " . substr($last,0,2);
    }

    # Note: the markup below is read back by ReadOld and its helpers when the
    # report is used as input (-i); keep anchors and table layout unchanged.
    print
        "<!doctype HTML public \"-//W3C//DTD HTML 3.2//EN\">\n",
        "<html><head>\n",
        "<title>W3 Agent Statistics for ", $HTTPDSERVER, "</title>\n",
        "</head>\n",
        "<BODY BGCOLOR=WHITE TEXT=Black TOPMARGIN=10 LEFTMARGIN=15>\n",
        "<a name=\"top\">\n",
        "<TABLE BORDER=0 CELLSPACING=0 CELLPADDING=3 WIDTH=100%>\n",
        "<TR><TD BGCOLOR=\"#1D098E\" WIDTH=100% align=center>\n",
        "<FONT COLOR=white SIZE=+2><STRONG>",
        "WWW Agent Statistics for ", $HTTPDSERVER, "</FONT></TABLE>\n<P>\n",
        "<CENTER><FONT SIZE=+2><B>", $period, "</B></FONT></CENTER>\n<P><HR>\n";
    &PrintSummary;
    print
        "<UL>\n",
        "$li#agent\">Summary</a>\n",
        "$li#version\">Summary by version</a>\n",
        "$li#detail\">Summary by detail of version</a>\n",
        "</ul>\n";
    &PrintAgent;
    &PrintAgentVersion;
    &PrintDetailedAgentVersion;
    print
        "<p>\n",
        "<TABLE Width=100%>\n<TR><TD BGCOLOR=\"#F0F0F0\"><FONT SIZE=-1>\n",
        "Generated with&nbsp;&nbsp;",
        "<a href=\"http://irb.cs.uni-magdeburg.de/~elkner/webtools/browsercounter.shtml\"><B>",
        $Version,"</B></a>&nbsp;&nbsp;written by&nbsp;&nbsp;",
        "<a href=\"mailto:elkner\@irb.cs.uni-magdeburg.de\">",
        "Jens Elkner</a></FONT></TABLE>\n",
        "</body></html>\n";

    if ( $OutputFile ) {
        my $tmpfile = "$path/$OutputFile.$$";
        select(STDOUT);
        # Buffered write errors (e.g. disk full) only show up at close time,
        # so close first and keep the previous report if anything failed.
        if (!close(OUT)) {
            warn "Can not write $tmpfile: $!\n";
            unlink($tmpfile);
        }
        elsif (!rename($tmpfile, "$path/$OutputFile")) {
            warn "Can not rename $tmpfile to $path/$OutputFile: $!\n";
        }
    }
}

# PrintSummary: print the report header lines: time of this analysis, the
# newest modification time among all input files and the total number of
# hits.  The newest modification time is also embedded as an HTML comment
# ("<!-- seconds -->"), which GetLastModTime reads back from old reports.
sub PrintSummary {
    # Pick the numerically largest timestamp.  (Sorting the epoch values as
    # strings orders "1000000000" before "999999999".)
    my $newest;
    foreach my $stamp (@filetimes) {
        $newest = $stamp if !defined($newest) || $stamp > $newest;
    }
    $lastmodtime = $newest;
    $filedate = &ctime($lastmodtime);
    $date = &ctime(time);
    print
        "<b>Last analyzed:</b> ", $date, "<br>\n",
        "<b>Last log file modification:</b> ", $filedate, "<br>\n",
        "<!-- $lastmodtime -->\n",
        "<br><b>Web Browser Hits measured:</b> ", $refscounter, "<br>\n",
        "<P>\n";
}

# GetPercent(part, total): return part as a percentage of total, formatted
# with "%5.2f" (two decimals, padded to at least five characters).
# A total of zero yields " 0.00" instead of a division-by-zero error.
sub GetPercent {
    my ($part, $total) = @_;
    my $share = $total ? 100 * $part / $total : 0;
    return sprintf("%5.2f", $share);
}

# PrintAgent: print the "agent" section, a table with one row per browser
# name (hits, percentage of $refscounter, name), most frequent first.
# The row layout is parsed again by ReadOldAgent.
sub PrintAgent {
    print
        $newsection, "agent\">Summary</a></h2>\n\n",
        # attribute name fixed: it was misspelled "cellpading" and ignored
        "<CENTER>\n<table border=$border cellpadding=$cellpadding>\n",
        "<tr><th>Hits <th>Percent <th>Browser<BR>\n";
    foreach $key (sort AgentByHits keys(%agent_counter)) {
        print
            "<tr><td align=right>", $agent_counter{$key} , " ",
            "<td align=right>", &GetPercent($agent_counter{$key},$refscounter),
            "% <td align=left>", $key, "<BR>\n";
    }
    print "</table>\n", $return2index, "</CENTER>\n";
}

# PrintAgentVersion: print the "version" section, a table with one row per
# "browser major.minor" pair (hits, percentage of $refscounter, name), most
# frequent first.  The row layout is parsed again by ReadOldVersion.
sub PrintAgentVersion {
    my $heading = $newsection . "version\">Summary by version</a></h2>\n\n"
                . "<CENTER>\n"
                . "<table border=$border cellpadding=$cellpadding>\n"
                . "<tr><th>Hits <th>Percent <th>Browser<BR>\n";
    print $heading;

    my @ranked = sort VersionByHits keys(%agent_version_counter);
    foreach my $name (@ranked) {
        my $hits = $agent_version_counter{$name};
        print "<tr><td align=right>", $hits,
              "<td align=right>", &GetPercent($hits,$refscounter),
              "% <td>", $name, "<BR>\n";
    }

    print "</table>\n", $return2index, "</CENTER>\n";
}

# PrintDetailedAgentVersion: print the "detail" section, a table with one
# row per "browser detailed-version" pair, most frequent first.  The fourth
# column shows the product the browser announced itself as (taken from
# %spoofer_value) or a non-breaking space.  The row layout is parsed again
# by ReadOldDetailedVersion.
sub PrintDetailedAgentVersion {
    my $heading = $newsection . "detail\">Summary by detail of version</a></h2>\n\n"
                . "<CENTER>\n"
                . "<table border=$border cellpadding=$cellpadding>\n"
                . "<tr><th>Hits <th>Percent <th>Browser <th>spoofing as<BR>\n";
    print $heading;

    my @ranked = sort DetailedVersionByHits keys(%agent_detailed_version_counter);
    foreach my $name (@ranked) {
        my $hits = $agent_detailed_version_counter{$name};
        my $alias = $spoofer_value{$name} || "&nbsp;";
        print "<tr><td align=right>", $hits, " ",
              "<td align=right>", &GetPercent($hits,$refscounter),
              "% <td align=left>", $name, " ",
              "<td align=left>", $alias, "<BR>\n";
    }

    print "</table>\n", $return2index, "</CENTER>\n";
}

# Sort comparator: browser names by descending hit count in %agent_counter,
# ties broken by ascending name.
sub AgentByHits {
    return $agent_counter{$b} <=> $agent_counter{$a} || $a cmp $b;
}
# Sort comparator: "browser version" keys by descending hit count in
# %agent_version_counter, ties broken by ascending key.
sub VersionByHits {
    return $agent_version_counter{$b} <=> $agent_version_counter{$a} || $a cmp $b;
}
# Sort comparator: "browser detailed-version" keys by descending hit count in
# %agent_detailed_version_counter, ties broken by ascending key.
sub DetailedVersionByHits {
    return $agent_detailed_version_counter{$b} <=> $agent_detailed_version_counter{$a}
        || $a cmp $b;
}

# GetLastModTime: read on from AGENTLOG until a line with the time stamp
# comment "<!-- seconds -->" is found.  That time replaces the last entry of
# @filetimes and the month derived from it ("yymm") replaces the last entry
# of @mmtimes.  Reading stops right after that line (or at end of file).
sub GetLastModTime {
    while (<AGENTLOG>) {
        if (m#<\!--\s*(\d+)\s*-->#) {
            $filetime = $1;
            pop @filetimes;
            push @filetimes, $filetime;

            # month 1..12 and two-digit year, both zero padded
            ($mm,$yy) = (localtime($filetime))[4,5];
            $mm = sprintf("%02d", $mm + 1);
            $yy -= 100 if ($yy > 99);
            $yy = sprintf("%02d", $yy);

            pop @mmtimes;
            push @mmtimes, "${yy}${mm}";
            last;
        }
    }
}

# ReadOld: read a previously generated report from AGENTLOG and merge its
# numbers into the current counters.  The argument passed by the caller is
# not used; the file must already be open on AGENTLOG.
#
# The section readers called here continue reading from AGENTLOG and leave
# the last line they read in $_, so the tests below always look at the most
# recently read line.  Reading ends at "</html>" or after the "detail"
# section; AGENTLOG is closed afterwards.
sub ReadOld {
    while (<AGENTLOG>) {
        chop;
        last if (m#</html>#oi);    # check for end of html file
        &GetLastModTime         if (/<a name=\"top\">/oi);
        &ReadOldAgent           if (/<a name=\"agent\">/oi);
        &ReadOldVersion         if (/<a name=\"version\">/oi);
        if (/<a name=\"detail\">/oi) {
            &ReadOldDetailedVersion;
            last;                  # nothing of interest follows
        }
    }
    close(AGENTLOG);
}

# ReadOldAgent: read the rows of the "agent" table of an old report from
# AGENTLOG up to "</table>".  Each row's hit count is added to
# $agent_counter{name} and to the total $refscounter.
sub ReadOldAgent {
    while (<AGENTLOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # <tr><td ..>HITS <td ..>PERCENT% <td ..>NAME<BR>
        my ($hits, $name) = /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)/oi;
        next unless $hits && $name;
        $refscounter += $hits;
        $agent_counter{$name} += $hits;
    }
}

# ReadOldVersion: read the rows of the "version" table of an old report from
# AGENTLOG up to "</table>" and add each row's hit count to
# $agent_version_counter{"name version"}.
sub ReadOldVersion {
    while (<AGENTLOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # <tr><td ..>HITS<td ..>PERCENT% <td>NAME VERSION<BR>
        my ($hits, $name) = /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)/oi;
        next unless $hits && $name;
        $agent_version_counter{$name} += $hits;
    }
}
	
# ReadOldDetailedVersion: read the rows of the "detail" table of an old
# report from AGENTLOG up to "</table>".  Each row's hit count is added to
# $agent_detailed_version_counter{browser}; a non-empty "spoofing as" column
# is stored in $spoofer_value{browser}.
sub ReadOldDetailedVersion {
    while (<AGENTLOG>) {
        last if /<\/table>/oi;
        next unless /<tr>/;
        # <tr><td ..>HITS <td ..>PERCENT% <td ..>BROWSER <td ..>SPOOFED<BR>
        my ($hits, $browser, $spoofed) =
            /<tr><[^>]+>\s*(\d+)\s*<[^>]+>[^>]+>\s*([^<]+)<[^>]+>([^<]+)/oi;
        $browser =~ s/\s*(.*\S+)\s*$/$1/;    # trim surrounding white space
        $spoofed =~ s/\&nbsp;//g;            # "&nbsp;" marks an empty column
        $agent_detailed_version_counter{$browser} += $hits
            if ( $browser && $hits );
        $spoofer_value{"$browser"} = "$spoofed" if $spoofed;
    }
}
	
# Version: terminate the program with the version and copyright notice
# (written to standard error by die).
sub Version {
    my $banner = "This is $Version.\n\n"
               . "(C) by Jens Elkner (elkner\@irb.cs.uni-magdeburg.de).\n"
               . "\n";
    die $banner;
}

# Usage: terminate the program with the command line help (written to
# standard error by die).
sub Usage {
    my $help = <<"EndUsage";
Process a sequence of NCSA httpd common format agent_log files and 
generate an HTML summary. 

Usage: browsercounter [-h] [-v] [-t] [-N HTTD-Server-Name]
                      [-i fList] [-o file] [-p path]
                      [logfile ...] [logfile.gz ...] [logfile.Z ...]

Options:
  -h            Display this message and quit.
  -v            Display version 
  -t            Set type of the Logfile to combined (default = agent)
  -N name       HTTPD Server name for report
  -i fList      list of stat files (afile) for inclusion 
  -o afile      Output file (default = Standard Output)
  -p path       System directory, where to store generated files

Terms: fList   ... comma separated List of files (whitespaces are NOT allowed).
       logfile ... common or combined format log file with user agent data 
EndUsage
    die $help;
}

# Initialize: set up the month name table and evaluate the command line.
#
# Options (parsed by Getopts into $opt_X):
#   -h  print the usage text and quit      -v  print the version and quit
#   -i  comma separated stats files  -> @Includes
#   -p  output directory             -> $path (default ".")
#   -N  server name                  -> $HTTPDSERVER
#   -t  combined log format          -> $LogType
#   -o  output file name             -> $OutputFile; the report is first
#       written to "$path/$OutputFile.<pid>", opened here as OUT
#       ("-" keeps the output on standard output)
sub Initialize {
    # two-digit month number => English month name
    %NumberToMonth = (
        '01' => 'January',   '02' => 'February', '03' => 'March',
        '04' => 'April',     '05' => 'May',      '06' => 'June',
        '07' => 'July',      '08' => 'August',   '09' => 'September',
        '10' => 'October',   '11' => 'November', '12' => 'December',
    );

    $result = &Getopts('hi:N:o:p:tv');
    &Usage if  $opt_h || $result == 0;
    &Version if $opt_v;

    @Includes = split(",",$opt_i) if $opt_i;
    $path = $opt_p || ".";
    $HTTPDSERVER = $opt_N if $opt_N;
    $LogType = "combined" if ($opt_t);

    if ($opt_o && ( $opt_o ne '-')) {
        $OutputFile = $opt_o;
        open(OUT,">$path/$OutputFile.$$") || die "Can not open $path/$OutputFile: $!\n";
    }
}
