package HTML::AutoConvert::OpenOffice;

=head1 NAME

HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert

=head1 PREREQUISITES

OpenOffice v2.3 or later

(currently) Python, Python-UNO and a C<DocumentConverter.py> script that can
be run by that name

(future) Perl OpenOffice::UNO

=head1 SECURITY NOTE

This module starts and leaves an OpenOffice instance running.

The OpenOffice instance binds to and listens to a port on localhost for
commands.  Anything which can talk to this port can instruct OpenOffice to
read or write any file the current user has access to.

By default, port 8100 is used.  You can choose a different port by passing an
option to the new() constructor:

  my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);

=cut

use strict;
use warnings;
use vars qw( %info ); #$slept );
use IPC::Run qw( run timeout io );
use File::Slurp qw( slurp );
use File::Temp qw( tempfile );
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );

# Plugin description: handled file types, relative weight, whether images are
# returned alongside the HTML, and the home page of the backing tool.
%info = (
  'types'          => [qw( doc rtf odt sxw )],
  'weight'         => 10,
  'returns_images' => 1,
  'url'            => 'http://www.openoffice.org/',
);

#$slept = 0;

#sub program { ( 'openoffice', '-headless' ); }

# html_convert FILE [, OPTIONS ]
#
# Converts FILE to HTML.  OPTIONS is either a hash reference or a flat list of
# key/value pairs; it is forwarded to the conversion helpers so that
# 'openoffice_port' is honored.
#
# In scalar context returns the HTML text.  In list context returns the HTML
# text followed by one [ filename, data ] array reference per embedded image.
# Dies if the conversion fails.
sub html_convert {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  my $outfile = $self->odconvert($file, 'html', $opt);
  my $html = slurp($outfile);
  unlink($outfile) or warn "can't unlink $outfile";

  return $html unless wantarray;

  my @images = $self->extract_images($file, $opt);

  ( $html, @images );
}

#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/

# extract_images FILE [, OPTIONS ]
#
# Converts FILE to a temporary ODT (a zip archive) and pulls out every file
# stored below its Pictures/ folder.  Returns a list of [ filename, data ]
# array references, where filename has its leading path removed.  The
# temporary ODT is removed before returning or dying.
sub extract_images {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  my $zipfile = $self->odconvert($file, 'odt', $opt);

  my $zip = Archive::Zip->new();

  unless ( $zip->read( $zipfile ) == AZ_OK ) {
    unlink($zipfile); # don't leave the temporary file behind on failure
    die "error reading $zipfile for images";
  }

  # Require at least one character after "Pictures/" and drop directory
  # entries, so that the folder itself is never reported as an image.
  my @members = grep { ! $_->isDirectory }
                     $zip->membersMatching( '^Pictures/.' );

  my @images = map {
                     ( my $filename = $_->fileName ) =~ s/^.*\///;
                     # scalar: contents() can return (data, status) in list
                     # context, which would add a stray third element.
                     [ $filename, scalar $zip->contents($_) ];
                   }
                 @members;

  unlink($zipfile);

  @images;
}

#half-ass using DocumentConverter.py for now
#need to recode with OpenOffice::UNO

# odconvert FILE, SUFFIX [, OPTIONS ]
#
# Makes sure OpenOffice has been started, then runs DocumentConverter.py to
# convert FILE into a freshly created temporary file whose name ends in
# ".SUFFIX".  Returns the name of that file; the caller is responsible for
# removing it.  Dies (after removing the temporary file) if the converter
# exits non-zero, cannot be run, or exceeds the timeout.
sub odconvert {
  my( $self, $file, $suffix ) = ( shift, shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  $self->start_openoffice($opt);

  my $program = 'DocumentConverter.py';

  my $timeout = 60; #?

  # Only the reserved name is needed; the converter overwrites the file by
  # name, so the handle is closed right away instead of being left open.
  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
  close($fh);

  my($out, $err) = ( '', '' );
  local($SIG{CHLD}) = sub {};
  my $ok = eval {
    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
      or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
    1;
  };

  unless ( $ok ) {
    my $error = $@ || "unknown error\n";
    # Single cleanup point for every failure path.
    unlink($outfile) or warn "$!\n";
    die "$program failed: $error\n";
  }

  $outfile;
}

# start_openoffice [ OPTIONS ]
#
# Launches a headless OpenOffice that accepts UNO connections on the port
# given by the 'openoffice_port' option (default 8100).  OPTIONS is either a
# hash reference or a flat list of key/value pairs.  Dies if the command
# cannot be run or exits with a non-zero status.
sub start_openoffice {
  my $self = shift;
  my $opt = ref($_[0]) ? shift : { @_ };
  my $port = $opt->{'openoffice_port'} || 8100;

  my $cmd = [ 'openoffice',
              '-headless',
              "-accept=socket,port=$port;urp",
              #'-splash-pipe=5',
            ];

  local($SIG{CHLD}) = sub {};
  # A false return from run() means a non-zero exit status, which is in $?
  # rather than $@.
  run $cmd, \undef, '>/dev/null', '2>/dev/null'
    or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n";

  #it isn't ready to run commands right away :(
  #it would be better if we could ping against UNO API somehow until ready...
  #sleep 5 unless $slept++;
}

1;