# ---------------------------------------------------------------------------------
# Author: Dean Stringer
# Description/Purpose:
#
#   Takes a CSV formatted file as input, reads its 1st line expecting a row
#   that lists column names, then uses these as element names to be output to
#   STDOUT as XML records.
#
#   Perl Solutions considered for parsing the source CSV...
#       AnyData::Format::CSV
#       Text::CSV_XS
#       XML::SAXDriver::CSV
#       XML::CSV
#       XML::Excel
#
#   Excel option seems the most fragile, as we can't rely on MS keeping XLS in the
#   same format (in fact they are planning to move to XML). The SAXDriver option would
#   be best for serial handling of *large* CSV files.
#   Text::CSV_XS is an extension of Text::CSV by an author who's published a number
#   of modules on CPAN, and it seems the most mature of the above.
#
# Dependencies:
#   Perl Modules...
#       Text::CSV_XS    parsing the input CSV data
#       XML::Writer     writing the XML output
#       HTML::Entities  entity-encoding the cell values
#
# Expected Parameters:
#   filename to parse
#
# Returns:
#   XML document (on STDOUT)
#
# Sample Invocation:
#   perl <this-script> filename.txt > output.xml
#
# Error situations:
#   Will bomb-out w error if the sourcefile is not given, not found, cannot be
#   opened, or is empty (no header row). A data row that cannot be parsed as CSV
#   is reported on STDERR and skipped.
#
#   There is currently no error handling for missing fields, so if someone
#   removes a field in a row, that data will appear in the wrong output column.
#   Columns that have no usable header name (blank header, or more values in a
#   row than there are headers) are emitted as <columnN>, N being the 1-based
#   column position.
# ---------------------------------------------------------------------------------
use strict;
use warnings;
use XML::Writer;
use Text::CSV_XS;
use HTML::Entities;

my $file2Read = $ARGV[0];
my $syntax    = "\nSyntax: $0 <filename>\n";

# Guard against a missing argument before running file tests on it.
unless (defined $file2Read && -e $file2Read && -f $file2Read) {
    my $shown = defined $file2Read ? $file2Read : '';
    die "No such file $shown $syntax";
}

my $parentTagName = "phonebook";
my $nodeTagName   = "person";

my $csv = Text::CSV_XS->new(
    {
        'quote_char'  => '"',
        'escape_char' => '"',
        'sep_char'    => ',',
        'binary'      => '1'
    }
);

# Read from an explicit lexical handle. The original opened INFILE but then read
# through <>, which only worked because the filename was still in @ARGV.
open(my $in, '<', $file2Read) or die "Cant open file! ($file2Read: $!)";

# The 1st row is expected to hold the field header/names.
my $line = <$in>;
die "No header row found in $file2Read $syntax" unless defined $line;
$line =~ s/\r?\n\z//;    # drop the line ending (LF or Windows CR/LF)

# Prefer the real CSV parser so a quoted header containing a comma stays one
# column; fall back to a plain split when the parser rejects the header line
# (limit -1 keeps trailing empty columns so positions still line up).
my @fields = $csv->parse($line) ? $csv->fields : split(/,/, $line, -1);

# Cleanup field name formats, remove everything but A-Z chars so each name is
# usable as an XML element name. Starts at index 0: the first column must be
# cleaned too.
for my $i (0 .. $#fields) {
    $fields[$i] = '' unless defined $fields[$i];
    $fields[$i] =~ s/[^a-z]//gi;
    $fields[$i] = 'column' . ($i + 1) if $fields[$i] eq '';
}

my $writer = XML::Writer->new( NEWLINES => 1 );
$writer->xmlDecl();    # Send declaration so we can post-process w XSLT later
$writer->startTag($parentTagName);

while (defined($line = <$in>)) {
    if ($csv->parse($line)) {
        my @values = $csv->fields;
        $writer->startTag($nodeTagName);
        for my $i (0 .. $#values) {
            # Rows wider than the header get a positional element name rather
            # than an undefined one.
            my $tag = defined $fields[$i] ? $fields[$i] : 'column' . ($i + 1);
            $writer->startTag($tag);
            # NOTE(review): characters() is documented to escape markup
            # characters itself, so entity-encoding first looks like double
            # escaping ("&" would end up as "&amp;amp;"). Kept as-is because
            # downstream consumers may rely on the current output - confirm.
            $writer->characters(HTML::Entities::encode($values[$i]));
            $writer->endTag($tag);
        }
        $writer->endTag($nodeTagName);
    }
    else {
        my $err = $csv->error_input;
        print STDERR "parse() failed on argument: ", $err, "\n";
    }
}
close($in);

$writer->endTag($parentTagName);
$writer->end();
exit;