use vars qw( %info ); #$slept );
use IPC::Run qw( run timeout io );
use File::Slurp qw( slurp );
+use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
%info = (
- 'types' => [qw( doc rtf odt sxw )],
- 'weight' => 80,
- 'url' => 'http://wvware.sourceforge.net/',
+ 'types' => [qw( doc rtf odt sxw )],
+ 'weight' => 10, # was 80 before this change; how the weight is consumed is not visible here
+ 'returns_images' => 1, # new key; presumably advertises that html_convert can also return images - TODO confirm against the code that reads %info
+ 'url' => 'http://www.openoffice.org/',
);
#$slept = 0;
#sub program { ( 'openoffice', '-headless' ); }
-#half-ass using DocumentConverter.py for now
-#need to recode with OpenOffice::UNO
-
+# Convert $file to HTML through OpenOffice.
+#
+# Arguments: the file name, followed by options given either as a hash ref
+# or as a key/value list.  The options are handed on to odconvert and
+# extract_images.
+#
+# Returns the HTML as a string in scalar context.  In list context returns
+# the HTML followed by whatever extract_images returns for the same file.
sub html_convert {
my( $self, $file ) = ( shift, shift );
my $opt = ref($_[0]) ? shift : { @_ };
+  # Hand the options on: odconvert passes them to start_openoffice, and
+  # without this the caller's options would be silently dropped.
+  my $outfile = $self->odconvert($file, 'html', $opt);
+  my $html = slurp($outfile);
+  unlink($outfile) or warn "can't unlink $outfile: $!";
+
+  return $html unless wantarray;
+
+  # Images are only extracted when the caller asked for a list, because
+  # extract_images runs a second conversion (to odt).
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images );
+
+}
+
+# Pull the embedded images out of $file by converting it to an OpenDocument
+# text file (a zip archive) and reading the members under Pictures/.
+# Technique described at:
+#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
+#
+# Arguments: the file name, followed by options given either as a hash ref
+# or as a key/value list; the options are handed on to odconvert.
+#
+# Returns a list of [ filename, contents ] array refs, where filename is the
+# archive member's name with any leading directory part removed.
+#
+# Dies if the converted file cannot be read as a zip archive.  The temporary
+# odt file is removed on both the success and the failure path.
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $zipfile = $self->odconvert($file, 'odt', $opt);
+  my $zip = Archive::Zip->new();
+
+  unless ( $zip->read( $zipfile ) == AZ_OK ) {
+    # don't leave the temporary file behind when bailing out
+    unlink($zipfile) or warn "can't unlink $zipfile: $!";
+    die "error reading $zipfile for images";
+  }
+
+  # Require at least one character after "Pictures/" so the folder entry
+  # itself (and names that merely start with "Pictures") are not picked up,
+  # and drop any nested directory entries.
+  my @members = grep { ! $_->isDirectory }
+                $zip->membersMatching( '^Pictures/.' );
+
+  my @images = map {
+    ( my $filename = $_->fileName ) =~ s/^.*\///;
+    # contents() returns ( $data, $status ) in list context; force scalar
+    # context so each entry is exactly [ filename, contents ].
+    [ $filename, scalar( $zip->contents($_) ) ];
+  }
+  @members;
+
+  unlink($zipfile) or warn "can't unlink $zipfile: $!";
+
+  @images;
+}
+
+# Convert $file to the format named by $suffix (e.g. 'html' or 'odt') using
+# the external DocumentConverter.py program, and return the name of the
+# temporary output file.  The caller is responsible for removing that file.
+#
+# Arguments: the file name, the output suffix (without the dot), followed by
+# options given either as a hash ref or as a key/value list; the options are
+# passed to start_openoffice.
+#
+# Dies, after removing the temporary file, when the converter throws (for
+# example on timeout) or returns a failure status.
+#
+# Stopgap implementation built on DocumentConverter.py;
+# TODO: recode with OpenOffice::UNO
+sub odconvert {
+  my( $self, $file, $suffix ) = ( shift, shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
$self->start_openoffice($opt);
my $program = 'DocumentConverter.py';
my $timeout = 60; #?
use File::Temp qw/ tempfile /;
- my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
#hmm, it gets overwritten so $fh is bunk
my($out, $err) = ( '', '' );
local($SIG{CHLD}) = sub {};
- run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
- or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-
- my $html = slurp($outfile);
-
- $html;
-
+  # A false return from run() means the converter failed; an exception
+  # (e.g. from the timeout) lands in $@.  Both cases are handled once,
+  # below, so the temporary file is unlinked exactly one time.
+  my $ran_ok = eval {
+    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) );
+  };
+  # capture these before unlink/warn below get a chance to disturb them
+  my( $error, $status ) = ( $@, $? >> 8 );
+
+  unless ( $ran_ok ) {
+    unlink($outfile) or warn "can't unlink $outfile: $!\n";
+    chomp $error;
+    die "$program failed: $error\n" if length $error;
+    # include stderr as well; it was captured but never reported before
+    die "$program failed with exit status $status: $out$err\n";
+  }
+
+  $outfile;
}
+
sub start_openoffice {
my( $self ) = ( shift, shift );
my $opt = ref($_[0]) ? shift : { @_ };