#!/usr/bin/perl -sw
=head1 NAME
htmltable2csv - Script to convert HTML tables to CSV
=head1 VERSION
0.01
=head1 SYNOPSIS
htmltable2csv file.html > file.csv
curl -sS some/url | htmltable2csv > file.csv
htmltable2csv --separator='
' file.html > file.csv
=head1 DESCRIPTION
The name says it all, except for the C<--separator> option, whose value is
inserted in the CSV output between tables. The default is a newline
character.
=begin comment
=head1 README
Script to convert HTML tables to CSV
=end comment
=head1 BUGS
Column and row spanning is currently ignored.
Character encoding is not taken into account. If the file is in any
encoding other than ISO Latin 1, it can easily be mangled.
This script is currently slow. It would probably run faster if I made it
use HTML::TableExtract or HTML::TableContentParser, but I couldn't be
bothered to learn the former's interface and the latter does not support
HTML entities, and I already know how to use the bloated monstrosity known
as 'HTML::DOM,' even though its interface is horribly clunky.
=head1 PREREQUISITES
HTML::DOM 0.010
Text::CSV_XS
=head1 SCRIPT CATEGORIES
Web
=head1 AUTHOR AND COPYRIGHT
Copyright (C) 2007 Father Chrysostomos (gro.napc ta tuorps [backwards])
This program is free software; you may redistribute it and/or modify
it under the same terms as perl.
=head1 SEE ALSO
L<HTML::DOM> and L<Text::CSV_XS>, which this script uses.
L and L, or
L, which this script would probably do well to use.
L, which inspired me to write this (when I found that UPS's
2008 zone charts are HTML files with C<.xls> extensions [!]).
=cut
# OK, here’s the code:
use strict;
use warnings;

use Text::CSV_XS;
use HTML::DOM;

# Separator printed between the CSV output of consecutive tables.
#
# The value comes from perl's own -s switch (see the #! line): an argument
# of the form --separator=STRING, given before any file name, is removed
# from @ARGV and stored in the package variable named "-separator".  A bare
# --separator (no "=") merely sets that variable to 1.  Because the variable
# name is not a valid identifier it has to be fetched through a symbolic
# reference, hence the locally relaxed strictures.
my $s = do { no strict 'refs'; ${'-separator'} };
defined $s or $s = "\n";

# binary => 1 lets fields contain embedded newlines and non-ASCII bytes.
my $tcx = Text::CSV_XS->new({ binary => 1 })
    or die "htmltable2csv: cannot create CSV writer: "
         . Text::CSV_XS->error_diag . "\n";

# HTML::TableContentParser would be a lighter alternative to HTML::DOM, but
# it doesn't support entities, so HTML::DOM is used to build a full tree.
my $doc = HTML::DOM->new;

# Feed the named files (or standard input) to the parser line by line.
while (my $line = <>) {
    $doc->write($line);
}

# Tell the parser that the input is complete so that any markup it is still
# buffering (e.g. the tail of a truncated document) reaches the tree.
$doc->close;

# Emit one CSV line per table row; tables are separated by $s.
my $not_first;
for my $table ($doc->getElementsByTagName('table')) {
    print $s if $not_first++;
    for my $row ($table->rows) {
        # Each cell contributes its text content as one CSV field.
        $tcx->combine(map { $_->as_text } $row->cells)
            or die "htmltable2csv: cannot encode row as CSV: "
                 . $tcx->error_diag . "\n";
        print $tcx->string, "\n";
    }
}
# That’s it! Short, isn’t it?