# ---------------------------------------------------------------------------------
# Author: Dean Stringer (deeknow @ pobox.com)
# Description/Purpose:
# Takes a URL specified in an FORM INPUT field, requests the page using
# LWP and parses it using HTML::Parser to extract the TABLE layout used
# in the page for debugging purposes.
#
# HTML::Parser does a good job of emaulating browser activity when reading
# and parsing fetched HTML, all we do in this script really is keep a
# track of table nesting and open/close tag pairs as the parser triggers
# the event handlers.
#
# Dependencies:
# HTML::Parser, LWP::UserAgent, CGI.pm
#
# Expected Parameters:
# url: full format (see invocation example/s)
#
# Returns/Output:
# TABLE layout of page with dummy content and a sequence of
# background colours to make it easy to differentiate between
# Tables.
#
# Sample Invocation:
# /cgi-bin/deTabliser.cgi?url=http://www.mysite.com/tablespage.html
#
# Error situations:
# o Request URL = self. This would result in an infinite loop of the script
# requesting itself. This is checked for and script dies if so.
# o File not Found (404) - will present error message and prompt for another URL
# wont actually parse the 404 HTML page results.
# o Table complexity shouldn't be a problem as far as processing effort or
# memory consumption is concerned as were parsing in stream mode rather
# than loading as a tree (ala DOM).
# ------------------------------------------------------------------------------------------------
# References:
# If your interested in using HTML::Parser you should grab HTML::Parser from
# CPAN and refer to the man page. There is also an online copy of the
# documentation at..
# http://www.perldoc.com/perl5.6.1/lib/HTML/Parser.html
# ---------------------------------------------------------------------------------
use strict;
use HTML::Parser; # Used to parse the page
use LWP::UserAgent; # Used to fetch the page
use CGI;
my $q = new CGI;
my $myself = $q->url(-relative=>1);
print $q->header('text/html');
my @tableColours = ( # Change these to any colours you fancy, they'll be chosen
"#ff6633", # using a mod ('%') operator so we can cycle though a different
"#ffaa33", # background colour for each nested table, makes things a little
"#ffcc33", # easier to read in the HTML formatted output
"#ffdd33", #
"#ffee33", # These colours are shades of Red-Yellow
"#ffff33"
);
my $tableColoursCount = (@tableColours);
my $debugMode = 1; # Show some extra debug info as we parse the source
my $tableCount = 0; # Total number of TABLEs seen during parse
my $tableDepth = 0; # Nested depth at point of parse, also used for formatting of output
my $maxTableDepth = 0; # Maximum nested depth reached
my $indent = ""; # Used to fomat HTML output as a sequence of Tab (\t) chars
my $thisTable = 0; # ID of table being processed
my @currentTable; # Use array to Push/Pop table numbers on/off a stack as we encounter
# their start and end tags, could have used some sort of linked-list
my $endTable = 0; # Used for debugging, indicates table last popped
my @parentTables; # Array used as stack to track parent of current table
my @tableRows; # Rows found for each Table
my @tableCols; # Columns found for each Table
my $contentSummary = ""; # Container for Summary report
my $tableLayout = ""; # Container for Table layout report
my $warningCount = 0; # Number of warnings found while parsing
my $contentWarnings = ""; # Container for Warning report
my @warnings; # Array of warning messages generated while parsing
my $countH1 = 0; # Number of H1 tags found (used for Warning check)
my $isFirstCell = 1; # Tracks whether is 1st cell in a table or not
my $thisColumn = 1; # Tracks current column number
my $file2Parse = "";
$file2Parse = $q->param('url'); # Ummmmm... the file to Parse :-)
# ---------------------------------------------------------------------------------
# Check we're not fetching ourself, otherwise we end up chasing our tail
# ---------------------------------------------------------------------------------
if ($file2Parse =~ /deTabliser\.cgi/) {
dieNice ("Cannot parse myself, otherwise I'll end up in an infinite loop.", "Can't Parse Self");
}
# ---------------------------------------------------------------------------------
# Check we've been given a URL to fetch, use default URL if not.
# ---------------------------------------------------------------------------------
unless(defined($file2Parse) || ($file2Parse ne "")) {
$file2Parse = "http://default.site.com/";
}
# ---------------------------------------------------------------------------------
# Setup some standard messages for later
# ---------------------------------------------------------------------------------
my $tryAgain = <
Hints: You need to supply a fully qualified URL when running a Snoop so please include the protocol (e.g. HTTP://) otherwise the fetch stage will get a 404 from the web server and will halt at that point.
END # --------------------------------------------------------------------------------- # perform LWP web request and bomb-out if cannot fetch page # --------------------------------------------------------------------------------- my $response = LWP::UserAgent->new->request( HTTP::Request->new( GET => $file2Parse ) ); unless($response->is_success) { dieNice ("Couldn't fetch file $file2Parse.$tryAgain$hints", "Fetch Failed"); } my $pageContent = $response->content; # --------------------------------------------------------------------------------- # We've managed to fetch the URL, now we can try parsing it # --------------------------------------------------------------------------------- my $parsedFile = HTML::Parser->new(api_version => 3); # --------------------------------------------------------------------------------- # Setup Handlers for events # --------------------------------------------------------------------------------- # For 'start' handler, we're only interested in the tag name and its attributes, # dont need to pass 'self' or 'attrseq' $parsedFile->handler( start => \&start_handler, "tagname,attr" ); $parsedFile->handler( end_1document => \&end_document, "" ); $parsedFile->handler( end => \&end_handler, "tagname" ); # --------------------------------------------------------------------------------- # Hold your Breath.... and Parse it !!!! # --------------------------------------------------------------------------------- $parsedFile->parse($pageContent) || dieNice" $!"; $parsedFile->eof; # signal end of document, otherwise the end_document # event doesnt get fired. (It will fire when using # parse_file # --------------------------------------------------------------------------------- # Show a summary if were in debug mode. # --------------------------------------------------------------------------------- if ($debugMode>0) { $contentSummary .= "\nFound $tableCount Tables " . " nested a maximum of $maxTableDepth deep.
There were $warningCount warnings...
deTabliser takes a URL you supply in the following form field, fetches the page, and attempts to display the tables used to lay out that page. Note: You must include the Protocol (e.g. http://) in front of the URL you want to query.
" . $tryAgain . "
Results: Layout | " .
"Summary | " .
"Warnings
<H1> element"; }
if ($countH1 eq 0) { $warningCount++; push @warnings, "No <H1> element"; }
}