diff options
author | ivan <ivan> | 2008-08-12 04:02:02 +0000 |
---|---|---|
committer | ivan <ivan> | 2008-08-12 04:02:02 +0000 |
commit | 9fbeda1dc776c602ce14d3874368d4620c079b60 (patch) | |
tree | 1c4eeceb1a881caa99dd2e2d600c0c637bb87df2 | |
parent | cba80d78f46ea7541c37efd54262ab1c0dff67e9 (diff) |
add image handling and prevent leaking temporary files (our own, at least; Archive::Zip might still be leaking some)
-rw-r--r-- | MANIFEST | 5 | ||||
-rw-r--r-- | Makefile.PL | 7 | ||||
-rw-r--r-- | TODO | 15 | ||||
-rw-r--r-- | lib/HTML/AutoConvert.pm | 39 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/OpenOffice.pm | 77 | ||||
-rw-r--r-- | lib/HTML/AutoConvert/poppler.pm | 69 | ||||
-rw-r--r-- | t/01-doc.t | 2 | ||||
-rw-r--r-- | t/34-doc_images-OpenOffice.t | 35 | ||||
-rw-r--r-- | t/46-pdf_images-poppler.t | 38 | ||||
-rw-r--r-- | t/HeatherElko.doc | bin | 0 -> 36864 bytes |
10 files changed, 237 insertions, 50 deletions
@@ -20,6 +20,9 @@ t/04-doc-OpenOffice.t t/14-rtf-OpenOffice.t t/15-rtf-unrtf.t t/26-pdf-poppler.t +t/34-doc_images-OpenOffice.t +t/46-pdf_images-poppler.t +t/attitude.pdf t/DiaryofaKillerCat.doc +t/HeatherElko.doc t/VEGAN_RECIPES.rtf -t/attitude.pdf diff --git a/Makefile.PL b/Makefile.PL index cd7c009..18af759 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -12,9 +12,10 @@ WriteMakefile( 'INSTALLSCRIPT' => '/usr/local/bin', 'INSTALLSITEBIN' => '/usr/local/bin', PREREQ_PM => { - 'Test::More' => 0, - 'IPC::Run' => 0, - 'File::Slurp' => 0, + 'Test::More' => 0, + 'IPC::Run' => 0, + 'File::Slurp' => 0, + 'Archive::Zip' => 0, }, dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, clean => { FILES => 'HTML-AutoConvert-*' }, @@ -1,6 +1,13 @@ -- DOC: images -- PDF: images -- RTF: images +- add the ability to supress starting our own OO and connect to one running + elsewhere + +- OpenOffice.pm: image converter seems to be leaving behind images in /tmp... + Archive::Zip? + +- auto-convert non-web images to jpg/gif/png? + +- wvWare/other backends besides OO and poppler: handle images? + +- OpenOffice.pm: poll via UNO to determine readiness rather than sleep (or not) -- OpenOffice.pm: poll via UNO to determine readiness rather than sleep - OpenOffice.pm: convert DocumentConverter.py to Perl using OpenOffice::UNO diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm index 7df3b82..bdbc5fd 100644 --- a/lib/HTML/AutoConvert.pm +++ b/lib/HTML/AutoConvert.pm @@ -23,6 +23,8 @@ our $VERSION = '0.01'; #or to turn on debugging my $converter = HTML::AutoConvert->new('debug'=>1); + my $html = $converter->html_convert( $file ); + # OR my( $html, @images ) = $converter->html_convert( $file ); #turn on or off debugging later @@ -58,7 +60,22 @@ sub new { =head2 html_convert FILENAME -Convert the given filename to HTML. The HTML output is returned as a scalar. +Convert the given filename to HTML. + +In a scalar context, simply returns the HTML output as a scalar. 
+ + my $html = $converter->html_convert( $file ); + +In a list context, returns a list consisting of the HTML output as a scalar, +followed by references for each image extracted, if any. Each image reference +is a list reference consisting of two elements: the first is the filename and +the second is the image itself. + + my( $html, @images ) = $converter->html_convert( $file ); + foreach my $image ( @images ) { + my( $filename, $data ) = @$image; + #... + } =cut @@ -72,10 +89,21 @@ sub html_convert { or die "no registered handlers for filetype ". $self->filetype( $file ); my( $converted, $html, $errors ) = ( 0, '', '' ); + my @imgs = (); foreach my $handler ( @handlers ) { my $module = 'HTML::AutoConvert::'. $handler->{'module'}; - my $tmp_html = eval { $module->html_convert( $self->{'file'} ) }; + + my $tmp_html = ''; + my @tmp_imgs = (); + if ( $handler->{'returns_images'} && wantarray ) { + ( $tmp_html, @tmp_imgs ) = + eval { $module->html_convert( $self->{'file'} ) }; + } else { + $tmp_html = + eval { $module->html_convert( $self->{'file'} ) }; + } + if ( $@ ) { my $tmp_err = "conversion with $module failed: $@\n"; warn $tmp_err if $self->{'debug'}; @@ -85,12 +113,17 @@ sub html_convert { $converted = 1; $html = $tmp_html; + @imgs = @tmp_imgs; last; } die "couldn't convert $file:\n$errors" unless $converted; - $html; + if ( wantarray ) { + ( $html, @imgs ); + } else { + $html; + } } diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm index e09a9e4..7b35595 100644 --- a/lib/HTML/AutoConvert/OpenOffice.pm +++ b/lib/HTML/AutoConvert/OpenOffice.pm @@ -34,24 +34,66 @@ use strict; use vars qw( %info ); #$slept ); use IPC::Run qw( run timeout io ); use File::Slurp qw( slurp ); +use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); %info = ( - 'types' => [qw( doc rtf odt sxw )], - 'weight' => 80, - 'url' => 'http://wvware.sourceforge.net/', + 'types' => [qw( doc rtf odt sxw )], + 'weight' => 10, + 'returns_images' => 1, + 'url' => 
'http://www.openoffice.org/', ); #$slept = 0; #sub program { ( 'openoffice', '-headless' ); } -#half-ass using DocumentConverter.py for now -#need to recode with OpenOffice::UNO - sub html_convert { my( $self, $file ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; + my $outfile = $self->odconvert($file, 'html'); + my $html = slurp($outfile); + unlink($outfile) or warn "can't unlink $outfile"; + + return $html unless wantarray; + + my @images = $self->extract_images($file, $opt); + + ( $html, @images ); + +} + +#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/ +sub extract_images { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $zipfile = $self->odconvert($file, 'odt'); + my $zip = Archive::Zip->new(); + + unless ( $zip->read( $zipfile ) == AZ_OK ) { + die "error reading $zipfile for images"; + } + + my @members = $zip->membersMatching( '^Pictures/*' ); + + my @images = map { + ( my $filename = $_->fileName ) =~ s/^.*\///; + [ $filename, $zip->contents($_) ]; + } + @members; + + unlink($zipfile); + + @images; +} + +#half-ass using DocumentConverter.py for now +#need to recode with OpenOffice::UNO +sub odconvert { + my( $self, $file, $suffix ) = ( shift, shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + $self->start_openoffice($opt); my $program = 'DocumentConverter.py'; @@ -59,20 +101,27 @@ sub html_convert { my $timeout = 60; #? use File::Temp qw/ tempfile /; - my($fh, $outfile) = tempfile(SUFFIX => '.html'); + my($fh, $outfile) = tempfile(SUFFIX => ".$suffix"); #hmm, it gets overwritten so $fh is bunk my($out, $err) = ( '', '' ); local($SIG{CHLD}) = sub {}; - run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) - or die "$program failed with exit status ". ( $? >> 8 ). 
": $out\n"; - - my $html = slurp($outfile); - - $html; - + eval { + run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) + or do { + unlink($outfile) or warn "$!\n"; + die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; + }; + }; + if ( $@ ) { + unlink($outfile) or warn "$!\n"; + die "$program failed: $@\n"; + } + + $outfile; } + sub start_openoffice { my( $self ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm index cca5b0d..a75a54f 100644 --- a/lib/HTML/AutoConvert/poppler.pm +++ b/lib/HTML/AutoConvert/poppler.pm @@ -13,36 +13,55 @@ poppler can be downloaded from http://poppler.freedesktop.org/ use strict; use vars qw( %info ); use base 'HTML::AutoConvert::Run'; +use File::Temp qw( tempdir ); +use File::Slurp qw( slurp ); +use IPC::Run qw( run timeout ); %info = ( - 'types' => 'pdf', - 'weight' => 10, - 'url' => 'http://poppler.freedesktop.org/', + 'types' => 'pdf', + 'weight' => 10, + 'returns_images' => 1, + 'url' => 'http://poppler.freedesktop.org/', ); sub program { ( 'pdftohtml', '-stdout' ) } -#false laziness w/OpenOffice.pm -#sub html_convert { -# my( $self, $file ) = ( shift, shift ); -# my $opt = ref($_[0]) ? shift : { @_ }; -# -# my $program = 'pdftohtml'; -# -# my $timeout = 60; #? -# -# my($out, $err) = ( '', '' ); -# local($SIG{CHLD}) = sub {}; -# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) ) -# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; -# -# ( my $outfile = $file ) =~ s/\.pdf$/.html/i -# or die "poppler.pm called with non-PDF file?!"; -# -# my $html = slurp($outfile); -# -# $html; -# -#} +#some false laziness "in spirit" w/OpenOffice.pm +sub html_convert { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? 
shift : { @_ }; + + my $html = $self->SUPER::html_convert($file, $opt); + return $html unless wantarray; + + my @images = $self->extract_images($file, $opt); + + ( $html, @images); +} + +sub extract_images { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $imgdir = tempdir( CLEANUP=>1 ).'/'; + + #some false laziness w/Run::html_convert :( + my @program = ( 'pdfimages' ); + my $program = $program[0]; + + my $timeout = 60; #? + + my( $out, $err ) = ( '', ''); + local($SIG{CHLD}) = sub {}; + run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) ) + or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n"; + + map { + ( my $filename = $_ ) =~ s/^.*\/\-?//; + [ $filename, scalar(slurp($_)) ]; + } + glob("$imgdir*"); + +} 1; @@ -1,5 +1,7 @@ #!perl +BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; }; + use Test::More tests => 2; use HTML::AutoConvert; diff --git a/t/34-doc_images-OpenOffice.t b/t/34-doc_images-OpenOffice.t new file mode 100644 index 0000000..ba32b2d --- /dev/null +++ b/t/34-doc_images-OpenOffice.t @@ -0,0 +1,35 @@ +#!perl + +BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; }; + +use Test::More tests => 5; + +use HTML::AutoConvert; + +my $c = new HTML::AutoConvert; + +my $force = 'OpenOffice'; +#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1; +my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'doc'} }; +delete($c->{'handlers'}{'doc'}{$_}) foreach @del; + +my( $html, @images ) = $c->html_convert('t/HeatherElko.doc'); + +ok( scalar(@images) == 2, 'got two images' ); + +#save em off +#foreach my $image (@images) { +# my( $file, $data) = @$image; +# open(FILE, ">t/$file") or die $!; +# print FILE $data; +# close FILE or die $!; +#} + +#check the names & lengths at least +is( $images[0]->[0], '10000000000000C80000009688B0FEF3.png', '1st image name'); +ok( length($images[0]->[1]) == 8704, '1st image size'); + +is( $images[1]->[0], '100000000000009D0000009F54B4BCB3.png', 
'2nd image name'); +ok( length($images[1]->[1]) == 2125, '2nd image size'); + + diff --git a/t/46-pdf_images-poppler.t b/t/46-pdf_images-poppler.t new file mode 100644 index 0000000..9bc1fc1 --- /dev/null +++ b/t/46-pdf_images-poppler.t @@ -0,0 +1,38 @@ +#!perl + +use Test::More tests => 9; + +use HTML::AutoConvert; + +my $c = new HTML::AutoConvert; + +my $force = 'poppler'; +#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1; +my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'pdf'} }; +delete($c->{'handlers'}{'pdf'}{$_}) foreach @del; + +my( $html, @images ) = $c->html_convert('t/attitude.pdf'); + +ok( scalar(@images) == 21, 'got 21 images' ); + +#save em off +#foreach my $image (@images) { +# my( $file, $data) = @$image; +# open(FILE, ">t/$file") or die $!; +# print FILE $data; +# close FILE or die $!; +#} + +#check the names & lengths at least +is( $images[0]->[0], '000.ppm', '1st image name'); +ok( length($images[0]->[1]) == 25949, '1st image size'); + +is( $images[1]->[0], '001.ppm', '2nd image name'); +ok( length($images[1]->[1]) == 43664, '1st image size'); + +is( $images[2]->[0], '002.ppm', '3rd image name'); +ok( length($images[2]->[1]) == 46833, '1st image size'); + +is( $images[9]->[0], '009.ppm', '10th image name'); +ok( length($images[9]->[1]) == 46374, '10th image size'); + diff --git a/t/HeatherElko.doc b/t/HeatherElko.doc Binary files differnew file mode 100644 index 0000000..4af5af7 --- /dev/null +++ b/t/HeatherElko.doc |