# -----------------------------------------------------------------------------------------------
# Author: Dean Stringer
# Description: Parses a log file (on STDIN) and reports on browser and bot
#              breakdown based on user-agent string
# -----------------------------------------------------------------------------------------------
# inspired by a script of Tim Bray's
#   http://www.tbray.org/ongoing/When/200x/2003/11/11/Late2003Browsers
# but about all that's left now is the regexp that extracts the UA string
# -----------------------------------------------------------------------------------------------
use strict;
use warnings;

# --- configuration -----------------------------------------------------------------------------
my $count       = 0;          # number of log lines from which a user-agent was extracted
my $showSummary = 1;
my $showStatsAs = "count";    # possible vals: 'percent' (share of all counted lines)
                              # or anything else, e.g. 'count' (raw hit counts)

# --- browsers ----------------------------------------------------------------------------------
my %browsers;                 # browser name => number of hits
my $showBrowsers  = 1;
my $totalBrowsers = 0;

# Browser name => case-insensitive regexp applied to the user-agent string.
my %knownBrowsers = (
    'Amaya'     => 'amaya',
    'IE'        => 'MSIE',
    'Mozilla'   => 'Mozilla.*Gecko',
    'Safari'    => 'Safari',
    'Opera'     => 'opera',
    'Lynx'      => 'lynx',
    'Netscape4' => 'Mozilla\/4\.(07|76)',
    'Konqueror' => 'Konqueror',
);

# Hash order is unpredictable, so the patterns are tried in this explicit
# order instead: the more specific ones first, the generic ones ('IE',
# 'Mozilla') last.  Otherwise a user-agent that satisfies two patterns (for
# example one containing both "Safari" and "Mozilla ... Gecko") could be
# classified differently from run to run.
my @browserOrder = qw(Opera Konqueror Safari Amaya Lynx Netscape4 IE Mozilla);
{
    # Keep the order list and the hash in step: drop names that have no
    # pattern, and append any pattern that was not given a position.
    my %isListed = map { $_ => 1 } @browserOrder;
    my @unlisted = grep { !$isListed{$_} } sort keys %knownBrowsers;
    @browserOrder = ((grep { exists $knownBrowsers{$_} } @browserOrder), @unlisted);
}

# --- bots --------------------------------------------------------------------------------------
my %bots;                     # bot name => number of hits
my $showBots  = 1;
my $totalBots = 0;

# Bot name => case-insensitive regexp applied to the user-agent string.
my %knownBots = (
    'TutorGig'          => 'http://www.tutorgig.com',
    'TurnItIn'          => 'http://www.turnitin.com',
    'Grub'              => 'http://grub.org',
    'Scooter'           => 'scooter',
    'W3C Linkchecker'   => 'W3C-checklink',
    'W3C Validator'     => 'W3C_validator',
    'W3C CSS Validator' => 'W3C_CSS_Validator',
    'Infoseek'          => 'infoseek',
    'Google'            => 'google',
    'Inktomi'           => 'Inktomi',
    'HTDig'             => 'htdig',
    'FrontPage'         => 'MS\s*FrontPage',
    'ZyBorg'            => 'ZyBorg',
    'Netcraft'          => 'Netcraft',
    'Xenu'              => 'Xenu',
    'IBM Almaden'       => 'http://www.almaden.ibm.com/cs/crawler',
    'FAST-WebCrawler'   => 'FAST-WebCrawler',
    'SurveyBot'         => 'SurveyBot',
    'MS DAV'            => '(Microsoft Data Access|Microsoft-WebDAV)',
);
my @botOrder = sort keys %knownBots;    # fixed order for matching and reporting

# --- unrecognised agents -----------------------------------------------------------------------
my %unKnown;                  # raw user-agent string => number of hits
my $showUnknown  = 1;
my $logUnknown   = 1;
my $totalUnknown = 0;

# Compile every pattern once, rather than once per log line.
my @browserMatchers = map { [ $_, qr/$knownBrowsers{$_}/i ] } @browserOrder;
my @botMatchers     = map { [ $_, qr/$knownBots{$_}/i ] } @botOrder;

# --- scan the log ------------------------------------------------------------------------------
LOGLINES:
while (my $line = <STDIN>) {
    # Requests for this feed are excluded from the statistics.
    next LOGLINES if $line =~ m@\"GET /ongoing/ongoing.rss@;

    # The user-agent is taken to be the last double-quoted field on the line;
    # lines without one are skipped.  Trailing whitespace (e.g. the CR of a
    # CRLF line ending) is tolerated.
    next LOGLINES unless $line =~ /^.*"([^\"]*)"\s*$/;
    my $agent = $1;
    $count++;

    foreach my $matcher (@browserMatchers) {
        my ($name, $pattern) = @$matcher;
        if ($agent =~ $pattern) {
            $browsers{$name}++;
            $totalBrowsers++;
            next LOGLINES;
        }
    }

    foreach my $matcher (@botMatchers) {
        my ($name, $pattern) = @$matcher;
        if ($agent =~ $pattern) {
            $bots{$name}++;
            $totalBots++;
            next LOGLINES;
        }
    }

    # Neither a known browser nor a known bot.  A "-" agent is counted but
    # not listed individually.
    if ($logUnknown && ($agent ne "-")) {
        $unKnown{$agent}++;
    }
    $totalUnknown++;
}

# --- reports -----------------------------------------------------------------------------------
if ($showBrowsers) {
    # Header row of browser names, then a row of values in the same columns.
    print "\n";
    foreach my $name (@browserOrder) {
        print $name . "\t";
    }
    print "\n";
    foreach my $name (@browserOrder) {
        my $hits = $browsers{$name} || 0;    # browsers never seen report 0
        if ($showStatsAs eq "percent") {
            # An empty log gives 0.0 rather than a division by zero.
            printf "%0.1f\t", ($count ? 100.0 * $hits / $count : 0);
        }
        else {
            print "$hits\t";
        }
    }
    print "\n";
}

if ($showBots) {
    print "\n";
    foreach my $bot (@botOrder) {
        my $hits = $bots{$bot} || 0;         # bots never seen report 0
        print "\n[$hits] : $bot";
    }
    print "\n";
}

if ($showUnknown) {
    # Most frequent first; ties broken alphabetically so output is stable.
    my @agents = sort { $unKnown{$b} <=> $unKnown{$a} || $a cmp $b } keys %unKnown;
    foreach my $key (@agents) {
        print "\n[$unKnown{$key}] : $key";
    }
}

# NOTE(review): bare count with no label or newline, kept as found; it looks
# like leftover debug output -- confirm whether it is still wanted.
print $totalUnknown;

if ($showSummary) {
    # Whole-number percentages of all counted lines (0 for an empty log).
    my $browserPC = $count ? int(100.0 * ($totalBrowsers / $count)) : 0;
    my $botPC     = $count ? int(100.0 * ($totalBots / $count))     : 0;
    my $unknownPC = $count ? int(100.0 * ($totalUnknown / $count))  : 0;
    # NOTE(review): the statement below is cut off in the visible source
    # (it appears to begin a here-document); its remainder is not shown, so
    # it is left exactly as found.
    print <