diff options
Diffstat (limited to 'lib/HTML')
-rw-r--r-- | lib/HTML/.AutoConvert.pm.swp | bin | 0 -> 20480 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert.pm | 249 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/.OpenOffice.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/.Run.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/.antiword.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/.poppler.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/.unrtf.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/.wvWare.pm.swp | bin | 0 -> 12288 bytes | |||
-rw-r--r-- | lib/HTML/AutoConvert/OpenOffice.pm | 96 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/Run.pm | 30 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/antiword.pm | 31 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/poppler.pm | 48 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/unrtf.pm | 25 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/wvWare.pm | 25 |
14 files changed, 504 insertions, 0 deletions
diff --git a/lib/HTML/.AutoConvert.pm.swp b/lib/HTML/.AutoConvert.pm.swp
new file mode 100644
index 0000000..f6927d2
--- /dev/null
+++ b/lib/HTML/.AutoConvert.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm
new file mode 100644
index 0000000..7df3b82
--- /dev/null
+++ b/lib/HTML/AutoConvert.pm
@@ -0,0 +1,249 @@
+package HTML::AutoConvert;
+
+use warnings;
+use strict;
+
+=head1 NAME
+
+HTML::AutoConvert - Best-effort conversion of arbitrary files to HTML.
+
+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
+=head1 SYNOPSIS
+
+    use HTML::AutoConvert;
+
+    my $converter = HTML::AutoConvert->new();
+    #or to turn on debugging
+    my $converter = HTML::AutoConvert->new('debug'=>1);
+
+    my $html = $converter->html_convert( $file );
+
+    #turn on or off debugging later
+    $converter->debug(1);
+
+=head1 DESCRIPTION
+
+Convert arbitrary file types to HTML.
+
+=for comment
+TODO: add an EXPORT section (head1) with
+doc on also using html_convert functional interface
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Options may be given as a hash or a hash reference; they are stored on the object.
+
+=cut
+
+sub new {
+  my $proto = shift;
+  my $class = ref($proto) || $proto;
+
+  my $opts = ref($_[0]) ? shift : { @_ };
+  my $self = $opts; #{};
+  bless ($self, $class);
+
+  $self->find_handlers;
+  $self;
+}
+
+=head2 html_convert FILENAME
+
+Convert the given filename to HTML. The HTML output is returned as a scalar.
+Handlers registered for the file's extension are tried lowest weight first;
+dies if there are none, or if every one of them fails.
+
+=cut
+
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+  $opt = { %$self, %$opt }; #constructor options, overridden by per-call ones
+  $self->{'file'} = $file;
+
+  my @handlers = $self->handlers
+    or die "no registered handlers for filetype ". $self->filetype( $file );
+
+  my( $converted, $html, $errors ) = ( 0, '', '' );
+  foreach my $handler ( @handlers ) {
+
+    my $module = 'HTML::AutoConvert::'. $handler->{'module'};
+    my $tmp_html = eval { $module->html_convert( $self->{'file'}, $opt ) };
+    if ( $@ ) {
+      my $tmp_err = "conversion with $module failed: $@\n";
+      warn $tmp_err if $self->{'debug'};
+      $errors .= $tmp_err;
+      next;
+    }
+
+    $converted = 1;
+    $html = $tmp_html;
+    last;
+  }
+
+  die "couldn't convert $file:\n$errors" unless $converted;
+  $html;
+}
+
+=head2 debug
+
+Get or set the debugging level
+
+=cut
+
+sub debug {
+  my $self = shift;
+  $self->{'debug'} = shift if @_;
+  $self->{'debug'};
+}
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 find_handlers
+
+Search for installed HTML::AutoConvert::* plugins.
+
+=cut
+
+sub find_handlers {
+  my $self = shift;
+
+  my %types;
+  foreach my $INC ( @INC ) {
+    warn "globbing $INC/HTML/AutoConvert/*.pm\n" if $self->{'debug'};
+    foreach my $file ( glob("$INC/HTML/AutoConvert/*.pm") ) {
+      warn "attempting to load handler info from $file\n" if $self->{'debug'};
+      $file =~ /\/(\w+)\.pm$/ or do {
+        warn "unrecognized file in $INC/HTML/AutoConvert/: $file\n";
+        next;
+      };
+      my $mod = $1;
+      my $info = eval "use HTML::AutoConvert::$mod; ".
+                      "\\%HTML::AutoConvert::$mod\::info;";
+      if ( $@ ) {
+        warn "error using HTML::AutoConvert::$mod (skipping): $@\n";
+        next;
+      }
+      unless ( keys %$info ) {
+        warn "no %info hash in HTML::AutoConvert::$mod, skipping\n" if $self->{'debug'};
+        next;
+      }
+      warn "got handler info from HTML::AutoConvert::$mod: $info\n" if $self->{'debug'};
+      if ( exists($info->{'disabled'}) && $info->{'disabled'} ) {
+        warn "skipping disabled handler HTML::AutoConvert::$mod" if $self->{'debug'};
+        next;
+      }
+
+      my $types = $info->{'types'};
+      $types = [ $types ] unless ref($types);
+
+      foreach my $type ( @$types ) {
+        $types{lc($type)}->{$mod} = { 'module' => $mod, %$info };
+      }
+
+    }
+  }
+
+  $self->{'handlers'} = \%types;
+
+}
+
+=head2 handlers
+
+Return the available handlers for the current file.
+
+=cut
+
+sub handlers {
+  my( $self ) = @_;
+
+  my $types = $self->{'handlers'};
+
+  my $type = $self->filetype;
+
+  sort { $a->{'weight'} <=> $b->{'weight'} }
+    values %{ $types->{lc($type)} };
+}
+
+
+
+
+=head2 filetype
+
+Determine the type of the current file.
+
+=cut
+
+#just use the file extension... could also use File::MMagic or something
+sub filetype {
+  my $self = shift;
+
+  my $file = $self->{'file'};
+  $file =~ /\.(\w{3,4})$/ or die "can't parse $file for extension";
+  lc($1);
+}
+
+=head1 AUTHOR
+
+Ivan Kohler, C<< <ivan-html-autoconvert at 420.am> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to C<bug-html-autoconvert at rt.cpan.org>, or through
+the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-AutoConvert>. I will be notified, and then you'll
+automatically be notified of progress on your bug as I make changes.
+
+=head1 SUPPORT
+
+You can find documentation for this module with the perldoc command.
+
+    perldoc HTML::AutoConvert
+
+You can also look for information at:
+
+=over 4
+
+=item * RT: CPAN's request tracker
+
+L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-AutoConvert>
+
+=item * AnnoCPAN: Annotated CPAN documentation
+
+L<http://annocpan.org/dist/HTML-AutoConvert>
+
+=item * CPAN Ratings
+
+L<http://cpanratings.perl.org/d/HTML-AutoConvert>
+
+=item * Search CPAN
+
+L<http://search.cpan.org/dist/HTML-AutoConvert>
+
+=back
+
+
+=head1 ACKNOWLEDGEMENTS
+
+
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2008 Freeside Internet Services, Inc.
+All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of HTML::AutoConvert
+
diff --git a/lib/HTML/AutoConvert/.OpenOffice.pm.swp b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
new file mode 100644
index 0000000..750ad68
--- /dev/null
+++ b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.Run.pm.swp b/lib/HTML/AutoConvert/.Run.pm.swp
new file mode 100644
index 0000000..8a90a9a
--- /dev/null
+++ b/lib/HTML/AutoConvert/.Run.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.antiword.pm.swp b/lib/HTML/AutoConvert/.antiword.pm.swp
new file mode 100644
index 0000000..79e92ef
--- /dev/null
+++ b/lib/HTML/AutoConvert/.antiword.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.poppler.pm.swp b/lib/HTML/AutoConvert/.poppler.pm.swp
new file mode 100644
index 0000000..2c011bf
--- /dev/null
+++ b/lib/HTML/AutoConvert/.poppler.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.unrtf.pm.swp b/lib/HTML/AutoConvert/.unrtf.pm.swp
new file mode 100644
index 0000000..9bc779c
--- /dev/null
+++ b/lib/HTML/AutoConvert/.unrtf.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.wvWare.pm.swp b/lib/HTML/AutoConvert/.wvWare.pm.swp
new file mode 100644
index 0000000..39e4317
--- /dev/null
+++ b/lib/HTML/AutoConvert/.wvWare.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm
new file mode 100644
index 0000000..e09a9e4
--- /dev/null
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -0,0 +1,96 @@
+package HTML::AutoConvert::OpenOffice;
+
+=head1 NAME
+
+HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert
+
+=head1 PREREQUISITES
+
+OpenOffice v2.3 or later
+
+(currently)
+Python
+Python-UNO
+
+(future)
+Perl OpenOffice::UNO
+
+=head1 SECURITY NOTE
+
+This module starts and leaves an OpenOffice instance running.
+
+The OpenOffice instance binds to and listens to a port on localhost for
+commands.  Anything which can talk to this port can instruct OpenOffice to
+read or write any file the current user has access to.
+
+By default, port 8100 is used.  You can choose a different port by passing an
+option to the new() constructor:
+
+  my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);
+
+=cut
+
+use strict;
+use vars qw( %info ); #$slept );
+use IPC::Run qw( run timeout io );
+use File::Slurp qw( slurp );
+
+%info = (
+  'types'  => [qw( doc rtf odt sxw )],
+  'weight' => 80,
+  'url'    => 'http://www.openoffice.org/',
+);
+
+#$slept = 0;
+
+#sub program { ( 'openoffice', '-headless' ); }
+
+#half-ass using DocumentConverter.py for now
+#need to recode with OpenOffice::UNO
+
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  $self->start_openoffice($opt);
+
+  my $program = 'DocumentConverter.py';
+
+  my $timeout = 60; #?
+
+  use File::Temp qw/ tempfile /;
+  my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  #hmm, it gets overwritten so $fh is bunk
+
+  my($out, $err) = ( '', '' );
+  local($SIG{CHLD}) = sub {};
+  run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $out$err\n";
+
+  my $html = slurp($outfile);
+  unlink $outfile; #don't leave the temporary file behind
+  $html;
+
+}
+
+sub start_openoffice {
+  my $self = shift;
+  my $opt = ref($_[0]) ? shift : { @_ };
+  my $port = $opt->{'openoffice_port'} || 8100;
+
+  my $cmd = [ 'openoffice', '-headless',
+              "-accept=socket,port=$port;urp",
+              #'-splash-pipe=5',
+            ];
+
+  local($SIG{CHLD}) = sub {};
+  run $cmd, \undef, '>/dev/null', '2>/dev/null'
+    or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n";
+
+  #it isn't ready to run commands right away :(
+  #it would be better if we could ping against UNO API somehow until ready...
+  #sleep 5 unless $slept++;
+
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/Run.pm b/lib/HTML/AutoConvert/Run.pm
new file mode 100644
index 0000000..2eada89
--- /dev/null
+++ b/lib/HTML/AutoConvert/Run.pm
@@ -0,0 +1,30 @@
+package HTML::AutoConvert::Run;
+
+=head1 NAME
+
+HTML::AutoConvert::Run - Base class for HTML::AutoConvert plugins that run an external program
+
+=cut
+
+use strict;
+use IPC::Run qw( run timeout );
+
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my @program = $self->program; #subclass supplies the command and leading args
+  my $program = $program[0];    #command name, used in the error message only
+
+  my $timeout = 60; #?
+
+  my( $html, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file ], \undef, \$html, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  $html; #whatever the program wrote to standard output
+
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/antiword.pm b/lib/HTML/AutoConvert/antiword.pm
new file mode 100644
index 0000000..4622a79
--- /dev/null
+++ b/lib/HTML/AutoConvert/antiword.pm
@@ -0,0 +1,31 @@
+package HTML::AutoConvert::antiword;
+
+=head1 NAME
+
+HTML::AutoConvert::antiword - antiword plugin for HTML::AutoConvert
+
+=head1 URL
+
+Antiword can be downloaded from http://www.winfield.demon.nl/index.html
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = (
+  'types'  => 'doc',
+  'weight' => 90,
+  'url'    => 'http://www.winfield.demon.nl/index.html',
+);
+
+sub program { ( 'antiword' ) }
+
+sub html_convert {
+  my $html = shift->SUPER::html_convert(@_);
+  $html =~ s/([&<>])/sprintf('&#%d;', ord $1)/ge; #escape the plain text for HTML
+  "<HTML><HEAD></HEAD><BODY><PRE>\n$html\n</PRE></BODY></HTML>";
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm
new file mode 100644
index 0000000..cca5b0d
--- /dev/null
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -0,0 +1,48 @@
+package HTML::AutoConvert::poppler;
+
+=head1 NAME
+
+HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert
+
+=head1 URL
+
+poppler can be downloaded from http://poppler.freedesktop.org/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = (
+  'types'  => 'pdf',
+  'weight' => 10,
+  'url'    => 'http://poppler.freedesktop.org/',
+);
+
+sub program { ( 'pdftohtml', '-stdout' ) }
+
+#false laziness w/OpenOffice.pm
+#sub html_convert {
+#  my( $self, $file ) = ( shift, shift );
+#  my $opt = ref($_[0]) ? shift : { @_ };
+#
+#  my $program = 'pdftohtml';
+#
+#  my $timeout = 60; #?
+#
+#  my($out, $err) = ( '', '' );
+#  local($SIG{CHLD}) = sub {};
+#  run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
+#    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+#
+#  ( my $outfile = $file ) =~ s/\.pdf$/.html/i
+#    or die "poppler.pm called with non-PDF file?!";
+#
+#  my $html = slurp($outfile);
+#
+#  $html;
+#
+#}
+
+1;
diff --git a/lib/HTML/AutoConvert/unrtf.pm b/lib/HTML/AutoConvert/unrtf.pm
new file mode 100644
index 0000000..034440e
--- /dev/null
+++ b/lib/HTML/AutoConvert/unrtf.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::unrtf;
+
+=head1 NAME
+
+HTML::AutoConvert::unrtf - unrtf plugin for HTML::AutoConvert
+
+=head1 URL
+
+unrtf can be downloaded from ftp://ftp.gnu.org/pub/gnu/unrtf/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = (
+  'types'  => 'rtf',
+  'weight' => 90,
+  'url'    => 'ftp://ftp.gnu.org/pub/gnu/unrtf/',
+);
+
+sub program { ( 'unrtf' ) }
+
+1;
diff --git a/lib/HTML/AutoConvert/wvWare.pm b/lib/HTML/AutoConvert/wvWare.pm
new file mode 100644
index 0000000..26a57cd
--- /dev/null
+++ b/lib/HTML/AutoConvert/wvWare.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::wvWare;
+
+=head1 NAME
+
+HTML::AutoConvert::wvWare - wvWare plugin for HTML::AutoConvert
+
+=head1 URL
+
+wvWare can be downloaded from http://wvware.sourceforge.net/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = (
+  'types'  => 'doc',
+  'weight' => 80,
+  'url'    => 'http://wvware.sourceforge.net/',
+);
+
+sub program { ( 'wvWare' ) }
+
+1;