diff options
author | ivan <ivan> | 2008-08-12 04:02:02 +0000 |
---|---|---|
committer | ivan <ivan> | 2008-08-12 04:02:02 +0000 |
commit | 9fbeda1dc776c602ce14d3874368d4620c079b60 (patch) | |
tree | 1c4eeceb1a881caa99dd2e2d600c0c637bb87df2 /lib/HTML/AutoConvert/OpenOffice.pm | |
parent | cba80d78f46ea7541c37efd54262ab1c0dff67e9 (diff) |
Add image handling and stop leaking our own temporary files (Archive::Zip might still be leaking its own)
Diffstat (limited to 'lib/HTML/AutoConvert/OpenOffice.pm')
-rw-r--r-- | lib/HTML/AutoConvert/OpenOffice.pm | 77 |
1 file changed, 63 insertions, 14 deletions
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm index e09a9e4..7b35595 100644 --- a/lib/HTML/AutoConvert/OpenOffice.pm +++ b/lib/HTML/AutoConvert/OpenOffice.pm @@ -34,24 +34,66 @@ use strict; use vars qw( %info ); #$slept ); use IPC::Run qw( run timeout io ); use File::Slurp qw( slurp ); +use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); %info = ( - 'types' => [qw( doc rtf odt sxw )], - 'weight' => 80, - 'url' => 'http://wvware.sourceforge.net/', + 'types' => [qw( doc rtf odt sxw )], + 'weight' => 10, + 'returns_images' => 1, + 'url' => 'http://www.openoffice.org/', ); #$slept = 0; #sub program { ( 'openoffice', '-headless' ); } -#half-ass using DocumentConverter.py for now -#need to recode with OpenOffice::UNO - sub html_convert { my( $self, $file ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; + my $outfile = $self->odconvert($file, 'html'); + my $html = slurp($outfile); + unlink($outfile) or warn "can't unlink $outfile"; + + return $html unless wantarray; + + my @images = $self->extract_images($file, $opt); + + ( $html, @images ); + +} + +#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/ +sub extract_images { + my( $self, $file ) = ( shift, shift ); + my $opt = ref($_[0]) ? shift : { @_ }; + + my $zipfile = $self->odconvert($file, 'odt'); + my $zip = Archive::Zip->new(); + + unless ( $zip->read( $zipfile ) == AZ_OK ) { + die "error reading $zipfile for images"; + } + + my @members = $zip->membersMatching( '^Pictures/*' ); + + my @images = map { + ( my $filename = $_->fileName ) =~ s/^.*\///; + [ $filename, $zip->contents($_) ]; + } + @members; + + unlink($zipfile); + + @images; +} + +#half-ass using DocumentConverter.py for now +#need to recode with OpenOffice::UNO +sub odconvert { + my( $self, $file, $suffix ) = ( shift, shift, shift ); + my $opt = ref($_[0]) ? 
shift : { @_ }; + $self->start_openoffice($opt); my $program = 'DocumentConverter.py'; @@ -59,20 +101,27 @@ sub html_convert { my $timeout = 60; #? use File::Temp qw/ tempfile /; - my($fh, $outfile) = tempfile(SUFFIX => '.html'); + my($fh, $outfile) = tempfile(SUFFIX => ".$suffix"); #hmm, it gets overwritten so $fh is bunk my($out, $err) = ( '', '' ); local($SIG{CHLD}) = sub {}; - run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) - or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; - - my $html = slurp($outfile); - - $html; - + eval { + run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) ) + or do { + unlink($outfile) or warn "$!\n"; + die "$program failed with exit status ". ( $? >> 8 ). ": $out\n"; + }; + }; + if ( $@ ) { + unlink($outfile) or warn "$!\n"; + die "$program failed: $@\n"; + } + + $outfile; } + sub start_openoffice { my( $self ) = ( shift, shift ); my $opt = ref($_[0]) ? shift : { @_ }; |