# -----------------------------------------------------------------------------------------------
# Author: Dean Stringer
# Description/Purpose:
#
#   Takes a source XML file and reads it into a DOM-like tree which
#   we can query using XPath (or at least the functionality available
#   in XML::XPath).
#
#   Usage: <script> <xml-file> <xpath-expression>
#
#   For an expression that yields a node-set, the number of matching nodes
#   is printed, followed by each node serialised as XML. For an expression
#   that yields a number, string or boolean (e.g. "count(//p)"), the scalar
#   value is printed instead.
#
# Useful References/Resources:
#   Intros to XPath (for anyone)...
#     http://html.about.com/library/weekly/aa110501a.htm
#     http://www.zvon.org/xxl/XPathTutorial/General/examples.html
#   XPath spec (for the brave)...
#     http://www.w3.org/TR/xpath
#
# -----------------------------------------------------------------------------------------------
# Dependencies:
#   Perl Modules: XML::XPath (bundle)
# -----------------------------------------------------------------------------------------------

use strict;
use warnings;

use XML::XPath;
use XML::XPath::XMLParser;
#use XML::XPath::Number;

# Usage text shown on any argument error.
my $syntax = "\nSyntax: $0 <xml-file> <xpath-expression>\n";

# Exactly two arguments are required: the file to parse and the expression.
unless (@ARGV == 2) {
    print STDERR $syntax;
    exit 1;    # non-zero so callers can detect the failure
}

my ($file2Parse, $expression) = @ARGV;

# -f is only true for an existing plain file, so it covers the existence check too.
unless (-f $file2Parse) {
    print STDERR "No such file $file2Parse $syntax";
    exit 1;
}

my $xp     = XML::XPath->new(filename => $file2Parse);
my $result = $xp->find($expression);

# find() returns a node-set only for location paths; functions such as
# count() return a scalar-like object that has no size/get_nodelist methods,
# so the two cases must be reported differently.
if ($result->isa('XML::XPath::NodeSet')) {
    print "Found " . $result->size . " nodes.\n";
    foreach my $node ($result->get_nodelist) {
        print XML::XPath::XMLParser::as_string($node);
    }
}
else {
    print $result->value, "\n";
}

exit;

# -----------------------------------------------------------------------------------------------
# Sample XPath expressions, these are useful for querying HTML sources
# see also: http://www.w3schools.com/xpath/xpath_examples.asp
# -----------------------------------------------------------------------------------------------
#   "//@id"                         # all id values
#   "//link[@id=20]"                # links with id=20
#   "//link[@id=20]/text()"         # text contents of all links with id = '20'
#   "html/body/table"               # all tables in the body root
#   "/html/head/meta"               # all meta data
#   "/html/head/*"                  # all of the head section
#   "//p"                           # any p element, no matter where it is contained
#   "//td"                          # all td elements
#   "//td/*"                        # contents of all td elements
#   "html/body/table/tr/td/table"   # 2nd-level tables (tables nested inside a top-level table cell)
#   "/*/*/*/*/*/table"              # tables which have 5 ancestors
#   "/html/body/table/tr/td[2]/*"   # contents of the 2nd td of the parent table
#   "/*/*/*/*/*/table[last()]"      # last table which has 5 ancestors - NO SUPPORT
#   "count(//p)"                    # no. of p elements
#   "//h1"                          # all H1s