# --------------------------------------------------------------------------------- # Author: Dean Stringer (deeknow @ pobox.com) # Description/Purpose: # Takes a URL specified in an FORM INPUT field, requests the page using # LWP and parses it using HTML::Parser to extract the TABLE layout used # in the page for debugging purposes. # # HTML::Parser does a good job of emaulating browser activity when reading # and parsing fetched HTML, all we do in this script really is keep a # track of table nesting and open/close tag pairs as the parser triggers # the event handlers. # # Dependencies: # HTML::Parser, LWP::UserAgent, CGI.pm # # Expected Parameters: # url: full format (see invocation example/s) # # Returns/Output: # TABLE layout of page with dummy content and a sequence of # background colours to make it easy to differentiate between # Tables. # # Sample Invocation: # /cgi-bin/deTabliser.cgi?url=http://www.mysite.com/tablespage.html # # Error situations: # o Request URL = self. This would result in an infinite loop of the script # requesting itself. This is checked for and script dies if so. # o File not Found (404) - will present error message and prompt for another URL # wont actually parse the 404 HTML page results. # o Table complexity shouldn't be a problem as far as processing effort or # memory consumption is concerned as were parsing in stream mode rather # than loading as a tree (ala DOM). # ------------------------------------------------------------------------------------------------ # References: # If your interested in using HTML::Parser you should grab HTML::Parser from # CPAN and refer to the man page. There is also an online copy of the # documentation at.. # http://www.perldoc.com/perl5.6.1/lib/HTML/Parser.html # --------------------------------------------------------------------------------- use strict; use HTML::Parser; # Used to parse the page use LWP::UserAgent; # Used to fetch the page use CGI; my $q = new CGI; my $myself = $q->url(-relative=>1); print $q->header('text/html'); my @tableColours = ( # Change these to any colours you fancy, they'll be chosen "#ff6633", # using a mod ('%') operator so we can cycle though a different "#ffaa33", # background colour for each nested table, makes things a little "#ffcc33", # easier to read in the HTML formatted output "#ffdd33", # "#ffee33", # These colours are shades of Red-Yellow "#ffff33" ); my $tableColoursCount = (@tableColours); my $debugMode = 1; # Show some extra debug info as we parse the source my $tableCount = 0; # Total number of TABLEs seen during parse my $tableDepth = 0; # Nested depth at point of parse, also used for formatting of output my $maxTableDepth = 0; # Maximum nested depth reached my $indent = ""; # Used to fomat HTML output as a sequence of Tab (\t) chars my $thisTable = 0; # ID of table being processed my @currentTable; # Use array to Push/Pop table numbers on/off a stack as we encounter # their start and end tags, could have used some sort of linked-list my $endTable = 0; # Used for debugging, indicates table last popped my @parentTables; # Array used as stack to track parent of current table my @tableRows; # Rows found for each Table my @tableCols; # Columns found for each Table my $contentSummary = ""; # Container for Summary report my $tableLayout = ""; # Container for Table layout report my $warningCount = 0; # Number of warnings found while parsing my $contentWarnings = ""; # Container for Warning report my @warnings; # Array of warning messages generated while parsing my $countH1 = 0; # Number of H1 tags found (used for Warning check) my $isFirstCell = 1; # Tracks whether is 1st cell in a table or not my $thisColumn = 1; # Tracks current column number my $file2Parse = ""; $file2Parse = $q->param('url'); # Ummmmm... the file to Parse :-) # --------------------------------------------------------------------------------- # Check we're not fetching ourself, otherwise we end up chasing our tail # --------------------------------------------------------------------------------- if ($file2Parse =~ /deTabliser\.cgi/) { dieNice ("Cannot parse myself, otherwise I'll end up in an infinite loop.", "Can't Parse Self"); } # --------------------------------------------------------------------------------- # Check we've been given a URL to fetch, use default URL if not. # --------------------------------------------------------------------------------- unless(defined($file2Parse) || ($file2Parse ne "")) { $file2Parse = "http://default.site.com/"; } # --------------------------------------------------------------------------------- # Setup some standard messages for later # --------------------------------------------------------------------------------- my $tryAgain = <     END my $hints = < 

Hints: You need to supply a fully qualified URL when running a Snoop so please include the protocol (e.g. HTTP://) otherwise the fetch stage will get a 404 from the web server and will halt at that point.

END # --------------------------------------------------------------------------------- # perform LWP web request and bomb-out if cannot fetch page # --------------------------------------------------------------------------------- my $response = LWP::UserAgent->new->request( HTTP::Request->new( GET => $file2Parse ) ); unless($response->is_success) { dieNice ("Couldn't fetch file $file2Parse.$tryAgain$hints", "Fetch Failed"); } my $pageContent = $response->content; # --------------------------------------------------------------------------------- # We've managed to fetch the URL, now we can try parsing it # --------------------------------------------------------------------------------- my $parsedFile = HTML::Parser->new(api_version => 3); # --------------------------------------------------------------------------------- # Setup Handlers for events # --------------------------------------------------------------------------------- # For 'start' handler, we're only interested in the tag name and its attributes, # dont need to pass 'self' or 'attrseq' $parsedFile->handler( start => \&start_handler, "tagname,attr" ); $parsedFile->handler( end_1document => \&end_document, "" ); $parsedFile->handler( end => \&end_handler, "tagname" ); # --------------------------------------------------------------------------------- # Hold your Breath.... and Parse it !!!! # --------------------------------------------------------------------------------- $parsedFile->parse($pageContent) || dieNice" $!"; $parsedFile->eof; # signal end of document, otherwise the end_document # event doesnt get fired. (It will fire when using # parse_file # --------------------------------------------------------------------------------- # Show a summary if were in debug mode. # --------------------------------------------------------------------------------- if ($debugMode>0) { $contentSummary .= "\n
\n" . "

TABLE Summary

\n

Found $tableCount Tables " . " nested a maximum of $maxTableDepth deep.

\n"; } # --------------------------------------------------------------------------------- # Show Warnings # --------------------------------------------------------------------------------- if (($debugMode>0) && ($warningCount>0)) { $contentWarnings = "\n
\n

Warnings

\n" . "

There were $warningCount warnings...

\n"; } # --------------------------------------------------------------------------------- # Assemble the results and other content and print # --------------------------------------------------------------------------------- my $content = "

deTabliser takes a URL you supply in the following form field, fetches the page, and attempts to display the tables used to lay out that page. Note: You must include the Protocol (e.g. http://) in front of the URL you want to query.

" . $tryAgain . "


Results: Layout | " . "Summary | " . "Warnings

" . "
" . "

TABLE Layout

" . $tableLayout . $contentSummary . $contentWarnings; print $content; exit; # ================================================================================================ # End of MAIN .... Start of Handlers # ================================================================================================ sub start_handler { # --------------------------------------------------------------------------------- # Handler for all start tags found in the document being parsed. If the tag # is one were interested in (table, tr, td, h1) then add some HTML to the # container variable $tableLayout which well use later to print out to the # browser in the report. # --------------------------------------------------------------------------------- my($thisTag, $attr) = @_; my %attributes = %$attr; SWITCH: { # switch checks in order of most common element to least common # ---------------------------------------------------------------------------------------- # , # ---------------------------------------------------------------------------------------- if (($thisTag eq "td") || ($thisTag eq "th")) { my $colSpan = ""; my $rowSpan = ""; my $numCols = 1; if (defined($attributes{colspan})) { $colSpan = " COLSPAN='" . $attributes{colspan} . "'"; $numCols = $attributes{colspan}; } if (defined($attributes{rowspan})) { $rowSpan = " ROWSPAN='" . $attributes{rowspan} . "'"; } $tableLayout .= ""; if ($isFirstCell) { $tableLayout .= "TB:$thisTable"; $isFirstCell = 0;} else { $tableLayout .= "blabla"; } # else { $tableLayout .= "$tableRows[$thisTable]:$thisColumn"; } unless ($tableRows[$thisTable]>1) { # Increment the number of Columns in the $tableCols[$thisTable] += $numCols; # Table, but can ignore if on the 2nd row } # as we will have already determined no. # on the 1st row. $thisColumn += $numCols; last SWITCH; } # ---------------------------------------------------------------------------------------- # # ---------------------------------------------------------------------------------------- if ($thisTag eq "tr") { $tableLayout .= "\n" . $indent . ""; $tableRows[$thisTable]++; $thisColumn = 1; last SWITCH; } # ---------------------------------------------------------------------------------------- # # ---------------------------------------------------------------------------------------- if ($thisTag eq "table") { push @parentTables, $thisTable; # Found a new TABLE so save the previous # one on the stack as a parent table $tableCount++; # increment the total count and add to $thisTable = $tableCount; # the stack of all TABLES ... push @currentTable, $tableCount; $tableRows[$tableCount] = 0; # and initialise column/row counters $tableCols[$tableCount] = 0; $tableLayout .= "\n" . $indent . "
"; $tableDepth++; if ($tableDepth > $maxTableDepth) { $maxTableDepth = $tableDepth; } $indent = "\t" x $tableDepth; $isFirstCell = 1; # reset so we know the next TD is the 1st last SWITCH; } # ---------------------------------------------------------------------------------------- #

,

# ---------------------------------------------------------------------------------------- if ($thisTag eq "h1") { $tableLayout .= "\n" . $indent . "

Heading-1

"; $countH1++; last SWITCH; } if ($thisTag eq "h2") { $tableLayout .= "\n" . $indent . "

Heading-2

"; last SWITCH; } } return; } sub end_handler { # --------------------------------------------------------------------------------- # Handler for all END tags found in the document being parsed. If the tag # is one were interested in (table, tr, td, h1) then add some HTML to the # container variable $tableLayout which well use later to print out to the # browser in the report. # --------------------------------------------------------------------------------- my $thisTag = shift; SWITCH: { # --------------------------------------------------------------------------------- #
# --------------------------------------------------------------------------------- if ($thisTag eq "table") { $tableDepth--; $indent = "\t" x $tableDepth; $tableLayout .= "\n" . $indent . ""; $tableLayout .= "\n" . $indent; $endTable = pop @currentTable; $thisTable = pop @parentTables; last SWITCH; } # --------------------------------------------------------------------------------- # # --------------------------------------------------------------------------------- if ($thisTag eq "tr") { $tableLayout .= ""; last SWITCH; } # --------------------------------------------------------------------------------- # , # --------------------------------------------------------------------------------- if (($thisTag eq "td") || ($thisTag eq "th")) { $tableLayout .= ""; last SWITCH; } } } sub end_document { # --------------------------------------------------------------------------------- # Handler triggered at the end of the document. Do some diagnostic checking # here of the various counters generated as we parsed the document and add # any oddities to the @warnings array as warning messages. # --------------------------------------------------------------------------------- if ($countH1 > 1) { $warningCount++; push @warnings, "More than one <H1> element"; } if ($countH1 eq 0) { $warningCount++; push @warnings, "No <H1> element"; } }