diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/HTML/AutoConvert.pm | 39 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/OpenOffice.pm | 77 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/poppler.pm | 69 |
3 files changed, 143 insertions, 42 deletions
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm index 7df3b82..bdbc5fd 100644 --- a/lib/HTML/AutoConvert.pm +++ b/lib/HTML/AutoConvert.pm @@ -23,6 +23,8 @@ our $VERSION = '0.01'; #or to turn on debugging my $converter = HTML::AutoConvert->new('debug'=>1); + my $html = $converter->html_convert( $file ); + # OR my( $html, @images ) = $converter->html_convert( $file ); #turn on or off debugging later @@ -58,7 +60,22 @@ sub new { =head2 html_convert FILENAME -Convert the given filename to HTML. The HTML output is returned as a scalar. +Convert the given filename to HTML. + +In a scalar context, simply returns the HTML output as a scalar. + + my $html = $converter->html_convert( $file ); + +In a list context, returns a list consisting of the HTML output as a scalar, +followed by references for each image extracted, if any. Each image reference +is a list reference consisting of two elements: the first is the filename and +the second is the image itself. + + my( $html, @images ) = $converter->html_convert( $file ); + foreach my $image ( @images ) { + my( $filename, $data ) = @$image; + #... + } =cut @@ -72,10 +89,21 @@ sub html_convert { or die "no registered handlers for filetype ". $self->filetype( $file ); my( $converted, $html, $errors ) = ( 0, '', '' ); + my @imgs = (); foreach my $handler ( @handlers ) { my $module = 'HTML::AutoConvert::'. $handler->{'module'}; - my $tmp_html = eval { $module->html_convert( $self->{'file'} ) }; + + my $tmp_html = ''; + my @tmp_imgs = (); + if ( $handler->{'returns_images'} && wantarray ) { + ( $tmp_html, @tmp_imgs ) = + eval { $module->html_convert( $self->{'file'} ) }; + } else { + $tmp_html = + eval { $module->html_convert( $self->{'file'} ) }; + } + if ( $@ ) { my $tmp_err = "conversion with $module failed: $@\n"; warn $tmp_err if $self->{'debug'}; @@ -85,12 +113,17 @@ sub html_convert { $converted = 1; $html = $tmp_html; + @imgs = @tmp_imgs; last; } die "couldn't convert $file:\n$errors" unless $converted; - $html; + if ( wantarray ) { + ( $html, @imgs ); + } else { + $html; + } } diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm index e09a9e4..7b35595 100644 --- a/lib/HTML/AutoConvert/OpenOffice.pm +++ b/lib/HTML/AutoConvert/OpenOffice.pm @@ -34,24 +34,66 @@ use strict; use vars qw( %info ); #$slept ); use IPC::Run qw( run timeout io ); use File::Slurp qw( slurp ); +use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); %info = ( - 'types' => [qw( doc rtf odt sxw )], - 'weight' => 80, - 'url' => 'http://wvware.sourceforge.net/', + 'types' => [qw( doc rtf odt sxw )], + 'weight' => 10, + 'returns_images' => 1, + 'url' => 'http://www.openoffice.org/', ); #$slept = 0; #sub program { ( 'openoffice', '-headless' ); } -#half-ass using DocumentConverter.py for now -#need to recode with OpenOffice::UNO - sub html_convert { my( $self, $file ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; + my $outfile = $self->odconvert($file, 'html'); + my $html = slurp($outfile); + unlink($outfile) or warn "can't unlink $outfile"; + + return $html unless wantarray; + + my @images = $self->extract_images($file, $opt); + + ( $html, @images ); + +} + +#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/ +sub extract_images { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $zipfile = $self->odconvert($file, 'odt'); + my $zip = Archive::Zip->new(); + + unless ( $zip->read( $zipfile ) == AZ_OK ) { + die "error reading $zipfile for images"; + } + + my @members = $zip->membersMatching( '^Pictures/*' ); + + my @images = map { + ( my $filename = $_->fileName ) =~ s/^.*\///; + [ $filename, $zip->contents($_) ]; + } + @members; + + unlink($zipfile); + + @images; +} + +#half-ass using DocumentConverter.py for now +#need to recode with OpenOffice::UNO +sub odconvert { + my( $self, $file, $suffix ) = ( shift, shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + $self->start_openoffice($opt); my $program = 'DocumentConverter.py'; @@ -59,20 +101,27 @@ sub html_convert { my $timeout = 60; #? use File::Temp qw/ tempfile /; - my($fh, $outfile) = tempfile(SUFFIX => '.html'); + my($fh, $outfile) = tempfile(SUFFIX => ".$suffix"); #hmm, it gets overwritten so $fh is bunk my($out, $err) = ( '', '' ); local($SIG{CHLD}) = sub {}; - run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) - or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; - - my $html = slurp($outfile); - - $html; - + eval { + run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) + or do { + unlink($outfile) or warn "$!\n"; + die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; + }; + }; + if ( $@ ) { + unlink($outfile) or warn "$!\n"; + die "$program failed: $@\n"; + } + + $outfile; } + sub start_openoffice { my( $self ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm index cca5b0d..a75a54f 100644 --- a/lib/HTML/AutoConvert/poppler.pm +++ b/lib/HTML/AutoConvert/poppler.pm @@ -13,36 +13,55 @@ poppler can be downloaded from http://poppler.freedesktop.org/ use strict; use vars qw( %info ); use base 'HTML::AutoConvert::Run'; +use File::Temp qw( tempdir ); +use File::Slurp qw( slurp ); +use IPC::Run qw( run timeout ); %info = ( - 'types' => 'pdf', - 'weight' => 10, - 'url' => 'http://poppler.freedesktop.org/', + 'types' => 'pdf', + 'weight' => 10, + 'returns_images' => 1, + 'url' => 'http://poppler.freedesktop.org/', ); sub program { ( 'pdftohtml', '-stdout' ) } -#false laziness w/OpenOffice.pm -#sub html_convert { -# my( $self, $file ) = ( shift, shift ); -# my $opt = ref($_[0]) ? shift : { @_ }; -# -# my $program = 'pdftohtml'; -# -# my $timeout = 60; #? -# -# my($out, $err) = ( '', '' ); -# local($SIG{CHLD}) = sub {}; -# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) ) -# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; -# -# ( my $outfile = $file ) =~ s/\.pdf$/.html/i -# or die "poppler.pm called with non-PDF file?!"; -# -# my $html = slurp($outfile); -# -# $html; -# -#} +#some false laziness "in spirit" w/OpenOffice.pm +sub html_convert { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $html = $self->SUPER::html_convert($file, $opt); + return $html unless wantarray; + + my @images = $self->extract_images($file, $opt); + + ( $html, @images); +} + +sub extract_images { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $imgdir = tempdir( CLEANUP=>1 ).'/'; + + #some false laziness w/Run::html_convert :( + my @program = ( 'pdfimages' ); + my $program = $program[0]; + + my $timeout = 60; #? + + my( $out, $err ) = ( '', ''); + local($SIG{CHLD}) = sub {}; + run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) ) + or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n"; + + map { + ( my $filename = $_ ) =~ s/^.*\/\-?//; + [ $filename, scalar(slurp($_)) ]; + } + glob("$imgdir*"); + +} 1; |