#!/usr/local/bin/perl
# $Id: proxycounter.pl,v 1.4 1997/10/12 08:27:00 elkner Exp $
# proxycounter: Program to create statistics from log files about proxies
# which accessed the WWW server.
# It reads the Apache custom log file created with the directive
# "CustomLog logs/file %{VIA}i" and, if available, the agent_log files.
# written by Jens Elkner (elkner@irb.cs.uni-magdeburg.de)
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS
# OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE.
#
# I offer it to the public domain and I ask, however, that this paragraph
# and my name be retained in any modified versions of the file you may
# make, and that you notify me of any improvements you make to the code.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
# SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
# USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT
# LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE
require "ctime.pl";
require "getopts.pl";
# ---------------------------- configuration / state --------------------------
# Program name and version string.
$Version="ProxyCounter 1.0";
# Command prefix OpenFile uses to read compressed (.gz/.Z) log files through
# a pipe; the file name is appended to it.
$Zcat="/usr/local/bin/gzip -cd";
# Type of log file being parsed ('agent','combined').
# ReadAgentLog extracts the agent field from each line when this is 'combined'.
$LogType='agent';
# Name of the server; printed in the report heading by PrintReport.
$HTTPDSERVER='www.cs.uni-magdeburg.de';
# Table cellpadding size.
# NOTE(review): not referenced in the visible part of the file - confirm use.
$cellpadding=2;
# Table border size.
# NOTE(review): not referenced in the visible part of the file - confirm use.
$border=2;
# Text printed at the start of every report section (PrintAgent,
# PrintAgentVersion, PrintDetailedAgentVersion); each caller continues it
# with its own anchor name.
# NOTE(review): the literal looks as if its markup was lost - verify it
# against a pristine copy of the script.
$newsection="
\n
Top\n";
# Number of files OpenFile opened successfully; the report is only printed
# when this is greater than zero.
$filecounter=0;
# Total number of proxy entries counted over all parsed lines.
$proxycounter=0;
# Maps a hop count (number of proxies on one request) to its number of hits.
%proxy_level=();
&Initialize;

# Files listed in @OldLogs are handed to ReadOldLog. OpenFile may rewrite
# its argument (it aliases $_), so the untouched name is saved in $tmp first.
foreach (@OldLogs) {
    $tmp = $_;
    next unless &OpenFile($_);
    &ReadOldLog($tmp);
}

# Agent log files: same pattern, parsed by ReadAgentLog.
foreach (@AgentLogs) {
    $tmp = $_;
    next unless &OpenFile($_);
    &ReadAgentLog($tmp);
}

# Everything left on the command line is parsed by ReadLog.
foreach (@ARGV) {
    if (&OpenFile($_)) {
        &ReadLog;
    }
}

if ($filecounter <= 0) {
    warn "No files found for analyzing!\n";
}
else {
    # Use the agent-log totals when they exceed the totals gathered so far.
    if ($hits_old > $hits) {
        $hits = $hits_old;
    }
    if ($access_old > $access) {
        $access = $access_old;
    }
    &PrintReport;
}
# --------------------------------- That's it --------------------------------
# OpenFile(NAME)
#   Opens NAME for reading on the global filehandle LOG and records its
#   time information.
#
#   NAME    path of a log or statistics file. Names containing ".gz" or ".Z"
#           are read through "$Zcat NAME |"; in that case the caller's
#           variable is rewritten to that pipe command (NAME is used through
#           its @_ alias).
#   Returns 1 if LOG is open, 0 otherwise (a warning is printed).
#
#   On success the file's modification time is pushed onto @filetimes, its
#   two-digit year and month ("yymm") onto @mmtimes, and $filecounter is
#   incremented. The globals $filetime, $yy, $mm (and $yymm for compressed
#   files) are set as a side effect.
sub OpenFile {
    # An unreadable file is rejected right away. Previously only a warning
    # was printed and processing went on, so the $filetime of the previous
    # file was reused and a compressed name still opened a pipe and was
    # counted as a valid log file.
    unless ( -r $_[0] ) {
        warn "$_[0] is not readable: $!\n";
        return 0;
    }
    $filetime = (stat($_[0]))[9];

    # assuming these files are real logfiles (i.e. a.html.9601.gz would not
    # work for getting the right modification time)
    if ($_[0] =~ m/(\.gz|\.Z)/o) {
        # the last run of four digits in the name is taken as "yymm"
        $yymm = $_[0];
        $yymm =~ s/.*(\d\d\d\d).*/$1/;
        $mm = substr($yymm,2,2);
        $yy = substr($yymm,0,2);
        $_[0] = "$Zcat $_[0] |";
    }
    # this has to be an uncompressed log file or stats file
    # if it is a stats file, @filetimes and @mmtimes are replaced
    else {
        ($mm,$yy) = (localtime($filetime))[4,5];
        # localtime months are 0-based and years are counted from 1900
        $mm = sprintf("%02d", $mm + 1);
        $yy = sprintf("%02d", $yy % 100);
    }

    unless (open(LOG,"$_[0]")) {
        warn "Can't open $_[0]: $!\n";
        return 0;
    }
    push @filetimes, $filetime;
    $filecounter++;
    push @mmtimes, "${yy}${mm}";
    return 1;
}
# ReadAgentLog(NAME)
#   Reads the agent log that OpenFile opened on LOG, passes every line that
#   contains " via " to Check4Proxies, closes LOG and adds the number of
#   lines processed to $hits_old.
#
#   NAME    name of the log file (not used by the body).
#
#   The first $ignore lines of the file are skipped. With $LogType set to
#   'combined' the agent field is extracted from each line first.
sub ReadAgentLog {
    my $linecounter = 0;
    while (<LOG>) {
        # Count the line before deciding to skip it; testing first meant the
        # counter never advanced once $ignore was greater than zero.
        $linecounter++;
        next if $linecounter <= $ignore;
        chomp;
        # NOTE(review): without /g only the first run of whitespace is
        # collapsed - confirm that this is intended.
        s/\s+/\ /;
        if ($LogType eq 'combined') {
            # combined logfile format:
            # $Host $rfc931 $auth $Time $Request $Status $Bytes $Referrer $Agent
            s/^\S+ \S+ \S+ \[[^\]\[]+\] \"[^\"]*\" \S+ \S+ \"[^\"]*\" \"([^\"]*)\"/$1/o;
        }
        # strip leading chr(0)
        s/^\x0*//;
        &Check4Proxies($_) if /\s+via\s+/;
    }
    close(LOG);
    # A file shorter than $ignore lines contributes nothing (never a
    # negative amount).
    $hits_old += $linecounter - $ignore if $linecounter > $ignore;
}
# GetToken(TEXT)
#   Splits TEXT into its leading token and everything that follows it.
#   A token is the longest leading run of characters that are not one of
#   the tspecials  ( ) < > @ , ; : \ " / [ ] ? = { }  space or tab.
#   Returns the list (token, remainder); token is "" when TEXT does not
#   start with a token character, in which case remainder is TEXT itself.
sub GetToken {
    my ($text) = @_;
    if ($text =~ m#^([^\(\)\<\>\@,;:\\\"\/\[\]\?=\{\}\ \t]+)(.*)$# ) {
        return ("$1", "$2");
    }
    return ("", "$text");
}
# GetRawVersion(VERSION)
#   Reduces a detailed version string to its coarse form.
#   A leading "<digits>.<digits>" is returned alone ("1.1.10" -> "1.1").
#   Otherwise a leading "<digits>.NOVM" followed by a dot and more text
#   (Squid NOVM releases) is cut down to "<digits>.NOVM"; any other string
#   is returned unchanged.
sub GetRawVersion {
    my ($raw) = @_;
    return "$1" if $raw =~ m#^([0-9]+\.[0-9]+)#;
    # fix for Squid NOVM
    $raw =~ s/^([0-9]+\.NOVM)\..*/$1/oi;
    return $raw;
}
# Check4Proxies(LINE)
#   Scans one user-agent string for " via <product>[/<version>]" entries and
#   counts every proxy found.
#
#   LINE    the agent string; ReadAgentLog only passes lines that contain
#           " via ".
#
#   Updates the globals $access_old, $proxycounter, %agent_counter,
#   %agent_version_counter, %agent_detailed_version_counter, %old_log_style
#   and %proxy_level. No meaningful return value.
sub Check4Proxies {
local($line) = $_[0];
# number of " via " entries found on this line
local($level) = 0;
# only $tmp receives ""; the remaining variables start out undefined
local($tmp,$agent,$version,$detailed_version,$comment) = "";
# one more agent-log line that contains a " via " entry
$access_old++;
# The substitutions below rewrite known non-conforming agent strings so that
# the product can be split off as "name/version".
# fix CERN-HTTPD, WebTrack-HTTPP, KryptoWall, germany.net
$line =~ s/\s+proxy\s+gateway//gi;
# fix Squid 1.0.x and Harvest 1.x to RFC2068 product rule
$line =~ s/\s+Cache\s+version\s+/\//gi;
# fix NetCache to RFC2068 product rule
$line =~ s/\s+version\s+/\//gi;
# fix DocuMagix HotCargo Express to RFC2068 product rule
$line =~ s/DocuMagix\s+HotCargo\s+Express/DocuMagix-HotCargo-Express/gi;
$tmp = $line;
# Each pass handles the LAST " via " entry of the line and then removes it
# from $line, so the entries are processed from right to left.
# NOTE(review): the loop test and the removal below are case-sensitive while
# the extraction uses /i - confirm that the difference is intended.
while ( $tmp =~ m#\s+via\s+#o ) {
# keep only the text after the last " via "
$tmp =~ s/.*\s+via\s+(.*)/$1/oi;
# RFC 2068 - 14.32: 1*( product | comment )
# comment is ignored
($agent,$detailed_version) = &GetToken($tmp);
$agent = "Unknown" unless $agent;
# drop the "/" between product name and version
$detailed_version =~ s/^\///;
($detailed_version,$comment) = &GetToken($detailed_version);
$detailed_version = "???" if ($detailed_version eq "");
$version = &GetRawVersion($detailed_version);
$agent_counter{$agent}++;
$agent_version_counter{"$agent $version"}++;
$agent_detailed_version_counter{"$agent $detailed_version"}++;
# remember that this entry was seen in an agent log; the detailed report
# prints "old" in its log style column for such entries
$old_log_style{"$agent $detailed_version"}++;
# we do not want to have any sub products, thus we go for the next
# entry
$line =~ s/(.*)\s+via\s+.*/$1/o;
$tmp = $line;
$proxycounter++;
$level++;
}
# one more hit that passed through $level proxies
$proxy_level{$level}++;
}
# GetComment(TEXT)
#   Extracts the text of a parenthesised comment from TEXT.
#   The substitution replaces everything from the start of its match up to
#   the end of the line by the text between "(" and ")". Returns the
#   resulting string, or "" when TEXT contains no non-empty "(...)".
sub GetComment {
    my ($text) = @_;
    my $replaced = ($text =~ s/[^\(]*\(([^\)]+)\).*/$1/);
    return $replaced > 0 ? $text : "";
}
# ReadLog
#   Reads the Via log that OpenFile opened on LOG and closes it afterwards.
#   Every line counts as one hit ($hits). Lines starting with "-" carry no
#   Via information and are skipped; every other line counts as one proxied
#   access ($access) and is split at commas into its Via entries.
#   For each entry the product named in the parenthesised comment is counted
#   in %agent_counter, %agent_version_counter and
#   %agent_detailed_version_counter, and $proxycounter is incremented.
#   %proxy_level is incremented for the number of entries on the line.
sub ReadLog {
    my @p = ();
    my $level = 0;
    my ($agent, $version, $detailed_version, $comment);
    # The loop has to read from LOG; with an empty condition it never
    # fetched a line and never terminated.
    while (<LOG>) {
        $hits++;
        next if /^\-/;
        # chomp instead of chop: a final line without a newline must not
        # lose its last character
        chomp;
        # RFC 2068 - 14.44: 1#( received-protocol received-by [ comment ] )
        $access++;
        @p = split(/,/);
        $level = $#p + 1;
        foreach (@p) {
            $comment = &GetComment($_);
            # fix for AOL TurboWeb
            $comment =~ s/AOL\s+TurboWeb/AOL-TurboWeb/g;
            # fix for IBM ICS
            $comment =~ s/IBM\s+ICS/IBM-ICS/g;
            # print "$_\n" unless $comment;
            $comment = "Unknown" unless $comment;
            # RFC 2068 - 14.32: 1*( product | comment )
            # comment is ignored
            ($agent,$detailed_version) = &GetToken($comment);
            $agent = "Unknown" unless $agent;
            # drop the "/" between product name and version
            $detailed_version =~ s/^\///;
            ($detailed_version,$comment) = &GetToken($detailed_version);
            $detailed_version = "???" if ($detailed_version eq "");
            $version = &GetRawVersion($detailed_version);
            $agent_counter{$agent}++;
            $agent_version_counter{"$agent $version"}++;
            $agent_detailed_version_counter{"$agent $detailed_version"}++;
            # print "\"$agent\" \"$version\" \"$detailed_version\"\n";
            $proxycounter++;
        }
        $proxy_level{$level}++;
    }
    close(LOG);
}
# PrintReport
#   Writes the complete report. When $OutputFile is set, OUT is selected as
#   the default output handle first. The visible statements print a heading
#   containing $HTTPDSERVER and $line, call PrintSummary, print an index of
#   the three sections and call PrintAgent, PrintAgentVersion and
#   PrintDetailedAgentVersion, followed by an explanation of the "log style"
#   column. Finally, when $OutputFile is set, "$path/$OutputFile.$$" is
#   renamed to "$path/$OutputFile", STDOUT is selected again and OUT closed.
#
#   NOTE(review): the statement beginning with $li=" below has unbalanced
#   quotes and the string literals of this sub appear to have lost their
#   markup; the sub does not parse in this form. Restore it from a pristine
#   copy of the script before changing any code here.
sub PrintReport {
local(@tmp);
local($yymm,$from_mm,$from_yy,$to_mm,$to_yy,$line);
select(OUT) if $OutputFile;
$li=" 0 ) {
$yymm = $tmp[$#tmp];
$to_mm = substr($yymm,2,2);
$to_yy = substr($yymm,0,2);
$line .= " - $NumberToMonth{$mm} $yy" unless ($from_mm == $to_mm && $from_yy == $to_yy);
}
print
"\n",
"\n",
"Proxy Statistics for ", $HTTPDSERVER, "\n",
"\n",
"\n",
"\n",
"",
"| ",
"",
"Proxy Statistics for ", $HTTPDSERVER, " |
\n\n",
"
", $line, "\n
\n";
&PrintSummary;
print
"\n\n| Logged Proxies:\n | ",
"\n",
"$li#agent\">Summary\n",
"$li#version\">Summary by version\n",
"$li#detail\">Summary by detail of version\n",
" \n |
\n";
&PrintAgent;
&PrintAgentVersion;
&PrintDetailedAgentVersion;
print
"\n",
"
\n\n| *\n | ",
"log style old means, that the ",
"statistics for the specified proxy are gathered via the user agent ",
"log file. If no style is mentioned, the data for the specified proxy ",
"have been extracted from the VIA log file - corresponding to RFC2068.",
" |
\n\n",
"
\n",
"\n";
if ( $OutputFile ) {
rename "$path/$OutputFile.$$", "$path/$OutputFile";
select(STDOUT);
close(OUT);
}
}
# PrintSummary
#   Prints the summary part of the report: time of analysis, modification
#   time of the newest input file, total hits, hits via proxies (absolute
#   and in percent), average hop counts and one line per hop level.
#   Reads @filetimes, $hits, $access, $proxycounter and %proxy_level and
#   sets the global $lastmodtime to the newest modification time.
#
#   String literals that continue on the next line contain a literal
#   newline; their continuation lines must stay in column 0.
sub PrintSummary {
    # Time stamps are numbers: the default sort compares strings and puts
    # a 10-digit epoch value before a 9-digit one.
    my @sorted_times = sort { $a <=> $b } @filetimes;
    $lastmodtime = $sorted_times[$#sorted_times];
    my $filedate = &ctime($lastmodtime);
    my $date     = &ctime(time);

    # Logs without any hit or without any proxied hit must not abort the
    # report with a division by zero.
    my $hops_per_proxy_hit = $access ? $proxycounter / $access : 0;
    my $hops_per_hit       = $hits   ? $proxycounter / $hits   : 0;

    print
        "Last analyzed: ", $date, "
\n",
        "Last log file modification: ", $filedate, "
\n",
        "\n",
        "
\nTotal Hits measured: ", $hits, "
\n",
        "Hits via one or more proxies: ", $access," (",
        &GetPercent($access,$hits), "%)
\n";
    printf("Average hops via proxy/proxy hit: %5.2f
\n",
        $hops_per_proxy_hit);
    printf("Average hops via proxy/hit: %5.2f\n",
        $hops_per_hit);
    foreach my $key (sort LevelsByHits keys(%proxy_level)) {
        print "Hits via proxy with $key hop(s): $proxy_level{$key}
\n";
    }
    print "
\n";
}
# GetPercent(PART, TOTAL)
#   Returns PART as a percentage of TOTAL, formatted with "%5.2f"
#   (two decimals, padded to at least five characters).
#   $_[0] = relative value, $_[1] = absolute value
#   When TOTAL is zero the marker string "99999999999999999" is returned
#   instead of dividing.
sub GetPercent {
    my ($part, $total) = @_;
    return "99999999999999999" if $total == 0;
    # $percent is a package global here as it was before; it is still
    # assigned in case code outside this sub reads it.
    $percent = sprintf("%5.2f", 100 * $part / $total);
    return $percent;
}
# PrintAgent
#   Prints the "Summary" section: one row per proxy product with its hit
#   count and its share of all counted proxies ($proxycounter), ordered by
#   AgentByHits. The section ends with $return2index.
#
#   String literals that continue on the next line contain a literal
#   newline; their continuation lines must stay in column 0.
sub PrintAgent {
    my @ranked = sort AgentByHits keys(%agent_counter);
    print
        $newsection, "agent\">Summary
\n\n",
        "\n\n",
        "| Hits | Percent | Proxy \n";
    for my $name (@ranked) {
        my $count = $agent_counter{$name};
        my $share = &GetPercent($count, $proxycounter);
        print
            " |
|---|
| ", $count, " ",
            " | ", $share,
            "% | ", $name, " \n";
    }
    print " |
\n", $return2index, "\n";
}
# PrintAgentVersion
#   Prints the "Summary by version" section: one row per "product version"
#   key of %agent_version_counter with its hit count and its share of all
#   counted proxies ($proxycounter), ordered by VersionByHits. The section
#   ends with $return2index.
#
#   String literals that continue on the next line contain a literal
#   newline; their continuation lines must stay in column 0.
sub PrintAgentVersion {
    my @ranked = sort VersionByHits keys(%agent_version_counter);
    print
        $newsection, "version\">Summary by version\n\n",
        "\n",
        "\n",
        "| Hits | Percent | Proxy \n";
    for my $name (@ranked) {
        my $count = $agent_version_counter{$name};
        my $share = &GetPercent($count, $proxycounter);
        print
            " |
|---|
| ", $count,
            " | ",
            $share,
            "% | ", $name, " \n";
    }
    print " |
\n", $return2index, "\n";
}
# PrintDetailedAgentVersion
#   Prints the "Summary by detail of version" section: one row per
#   "product detailed-version" key of %agent_detailed_version_counter with
#   its hit count, its share of all counted proxies ($proxycounter) and a
#   log style column that reads "old" for keys recorded in %old_log_style.
#   Rows are ordered by DetailedVersionByHits; the section ends with
#   $return2index.
#
#   String literals that continue on the next line contain a literal
#   newline; their continuation lines must stay in column 0.
sub PrintDetailedAgentVersion {
    # $tmp stays a localized global: the sort comparator assigns to it.
    local($tmp);
    my @ranked = sort DetailedVersionByHits keys(%agent_detailed_version_counter);
    print
        $newsection, "detail\">Summary by detail of version\n\n",
        "\n",
        "\n",
        "| Hits | Percent | Proxy | log style* \n";
    for my $name (@ranked) {
        my $count = $agent_detailed_version_counter{$name};
        my $share = &GetPercent($count, $proxycounter);
        print
            " |
|---|
| ", $count, " ",
            " | ", $share,
            "% | ", $name, " ";
        if ($old_log_style{$name}) {
            $tmp = "old";
        }
        else {
            $tmp = " ";
        }
        print " | ", $tmp, " \n";
    }
    print " |
\n", $return2index, "\n";
}
# AgentByHits
#   sort() comparator for the keys of %agent_counter: most hits first,
#   equal hit counts in ascending string order of the key.
#   The result is no longer stored in the package global $tmp, which other
#   parts of the script use as a scratch variable.
sub AgentByHits {
    return ($agent_counter{$b} <=> $agent_counter{$a}) || ($a cmp $b);
}
# VersionByHits
#   sort() comparator for the keys of %agent_version_counter: most hits
#   first, equal hit counts in ascending string order of the key.
#   The result is no longer stored in the package global $tmp, which other
#   parts of the script use as a scratch variable.
sub VersionByHits {
    return ($agent_version_counter{$b} <=> $agent_version_counter{$a})
        || ($a cmp $b);
}
# DetailedVersionByHits
#   sort() comparator for the keys of %agent_detailed_version_counter: most
#   hits first, equal hit counts in ascending string order of the key.
#   The result is no longer stored in the package global $tmp, which other
#   parts of the script use as a scratch variable.
sub DetailedVersionByHits {
    return ($agent_detailed_version_counter{$b} <=> $agent_detailed_version_counter{$a})
        || ($a cmp $b);
}
# LevelsByHits
#   sort() comparator for the keys of %proxy_level (hop counts): most hits
#   first. Equal hit counts are ordered by ascending hop count, compared
#   numerically so that 2 hops sort before 10 hops.
#   The result is no longer stored in the package global $tmp, which other
#   parts of the script use as a scratch variable.
sub LevelsByHits {
    return ($proxy_level{$b} <=> $proxy_level{$a}) || ($a <=> $b);
}
# GetLastModTime
#   Reads LOG up to the first line containing an HTML comment that holds
#   only a number ("<!-- 876643620 -->") and takes that number as the time
#   stamp of the file: the last entries of @filetimes and @mmtimes are
#   replaced by it and by its "yymm" form. The globals $filetime, $mm and
#   $yy are set as a side effect. Nothing changes when no such line exists.
sub GetLastModTime {
    # The loop has to read from LOG; with an empty condition it never
    # fetched a line and never terminated.
    while (<LOG>) {
        # print "GetMod: $_";
        next if ( $_ !~ m#<\!--\s*(\d+)\s*-->#);
        $filetime = $1;
        pop @filetimes;
        push @filetimes, $filetime;
        ($mm,$yy) = (localtime($filetime))[4,5];
        # localtime months are 0-based and years are counted from 1900
        $mm = sprintf("%02d", $mm + 1);
        $yy = sprintf("%02d", $yy % 100);
        pop @mmtimes;
        push @mmtimes, "${yy}${mm}";
        last;
    }
}
# GetVitals
#   Reads the summary part of a previously generated report from LOG and
#   adds its figures to the running totals: total hits to $hits, proxied
#   hits to $access and the per-hop-count hits to %proxy_level.
#   Reading stops at the line containing "Logged Proxies".
sub GetVitals {
    # The loop has to read from LOG; with an empty condition it never
    # fetched a line and never terminated.
    while (<LOG>) {
        # print "Vitals: $_";
        last if /Logged\s+Proxies/oi;
        $hits += $1 if /Total\s+Hits\s+measured:<\/b>\s*(\d+)/oi;
        $access += $1 if /Hits\s+via\s+one\s+or\s+more\s+proxies:<\/b>\s*(\d+)/oi;
        $proxy_level{$1} += $2 if /Hits\s+via\s+proxy\s+with\s+(\d+)\s+hop\(s\):<\/b>\s*(\d+)/oi;
    }
}
sub ReadOldLog {
local($agent,$version, $detailed) = "";
while () {
chop;
last if (m#