package HTML::AutoConvert::OpenOffice;
=head1 NAME
HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert
=head1 PREREQUISITES
OpenOffice v2.3 or later
(currently)
Python
Python-UNO
(future)
Perl OpenOffice::UNO
=head1 SECURITY NOTE
This module starts and leaves an OpenOffice instance running.
The OpenOffice instance binds to and listens to a port on localhost for
commands. Anything which can talk to this port can instruct OpenOffice to
read or write any file the current user has access to.
By default, port 8100 is used. You can choose a different port by passing an
option to the new() constructor:
my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);
=cut
use strict;
use vars qw( %info ); #$slept );
use IPC::Run qw( run timeout io );
use File::Slurp qw( slurp );
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
# Plugin metadata stored in the package variable %info: the file
# extensions handled, a numeric weight, a flag saying images are returned
# with the HTML, and the converter's home page.
# NOTE(review): presumably read by HTML::AutoConvert when choosing a
# plugin -- confirm against the caller.
%info = (
    types          => [ 'doc', 'rtf', 'odt', 'sxw' ],
    weight         => 10,
    returns_images => 1,
    url            => 'http://www.openoffice.org/',
);
#$slept = 0;
#sub program { ( 'openoffice', '-headless' ); }
# Convert $file to HTML using OpenOffice.
#
# Arguments: the path of the file to convert, followed by options given
# either as a hash reference or as a flat key/value list (for example
# openoffice_port => 5555).
#
# Returns the HTML text in scalar context.  In list context returns the
# HTML text followed by whatever extract_images() returns for the same
# file.
#
# Dies if the conversion fails (see odconvert).
sub html_convert {
    my( $self, $file ) = ( shift, shift );
    my $opt = ref($_[0]) ? shift : { @_ };

    # Forward the options so a caller-supplied openoffice_port is not
    # silently dropped on the way to the converter.
    my $outfile = $self->odconvert($file, 'html', $opt);

    my $html = slurp($outfile);

    # The temporary output file is no longer needed once it has been read.
    unlink($outfile) or warn "can't unlink $outfile";

    return $html unless wantarray;

    my @images = $self->extract_images($file, $opt);
    return ( $html, @images );
}
#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
# Extract the images embedded in $file.
#
# The file is first converted to ODT (a zip archive); every file stored
# under the archive's Pictures/ directory is then read out.
#
# Arguments: the path of the file, followed by options given either as a
# hash reference or as a flat key/value list.
#
# Returns a list of array references, one per image, each of the form
# [ $filename, $contents ] where $filename has its directory part removed.
#
# Dies if the conversion fails or the resulting archive cannot be read.
# The temporary ODT file is removed in either case.
sub extract_images {
    my( $self, $file ) = ( shift, shift );
    my $opt = ref($_[0]) ? shift : { @_ };

    # Pass the options on so openoffice_port is honoured here as well.
    my $zipfile = $self->odconvert($file, 'odt', $opt);

    my $zip = Archive::Zip->new();
    unless ( $zip->read( $zipfile ) == AZ_OK ) {
        # Do not leave the temporary archive behind on failure.
        unlink($zipfile);
        die "error reading $zipfile for images";
    }

    # Require at least one character after "Pictures/": the previous
    # pattern '^Pictures/*' meant "zero or more slashes" and so also matched
    # the directory entry itself and any member merely starting with
    # "Pictures".  Nested directory entries are skipped explicitly.
    my @members = grep { ! $_->isDirectory }
                  $zip->membersMatching( '^Pictures/.' );

    my @images = map {
        ( my $filename = $_->fileName ) =~ s/^.*\///;
        # Force scalar context: Archive::Zip's contents() is documented to
        # return ( $data, $status ) in list context, which would append a
        # stray status element to each pair.
        [ $filename, scalar $zip->contents($_) ];
    }
    @members;

    unlink($zipfile);

    @images;
}
#half-ass using DocumentConverter.py for now
#need to recode with OpenOffice::UNO
# Convert $file to the format named by $suffix by running the external
# DocumentConverter.py program against a running OpenOffice instance.
#
# Arguments: the input file path, the output suffix (for example 'html' or
# 'odt'), followed by options given either as a hash reference or as a
# flat key/value list.  The options are handed to start_openoffice().
#
# Returns the path of a temporary file holding the converted document;
# the caller is responsible for removing it.
#
# Dies if the converter exits with a non-zero status, cannot be run, or
# exceeds the timeout.  The temporary file is removed before dying.
sub odconvert {
    my( $self, $file, $suffix ) = ( shift, shift, shift );
    my $opt = ref($_[0]) ? shift : { @_ };

    $self->start_openoffice($opt);

    my $program = 'DocumentConverter.py';
    my $timeout = 60; #?

    use File::Temp qw/ tempfile /;
    my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
    # Only the reserved file name is needed: the converter overwrites the
    # file itself, so release our handle straight away.
    close($fh);

    my($out, $err) = ( '', '' );

    local($SIG{CHLD}) = sub {};

    # run() returns false on a non-zero exit status and throws on other
    # failures (including the timeout); funnel both into one handler so the
    # temporary file is cleaned up exactly once.
    my $ok = eval {
        run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
          or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
        1;
    };

    unless ( $ok ) {
        # Save the error before anything else can overwrite $@.
        my $error = $@;
        unlink($outfile) or warn "$!\n";
        die "$program failed: $error\n";
    }

    $outfile;
}
# Launch a headless OpenOffice instance that accepts commands on a local
# socket.
#
# Arguments: options given either as a hash reference or as a flat
# key/value list.  The only option read here is 'openoffice_port', the
# port to listen on; it defaults to 8100.
#
# Returns the true value returned by run().  Dies if the openoffice command
# exits with a non-zero status; failures raised by run() itself propagate
# unchanged.
sub start_openoffice {
    # Shift only the invocant.  Shifting twice here used to throw away the
    # options argument, so the requested port was never honoured.
    my $self = shift;
    my $opt = ref($_[0]) ? shift : { @_ };
    my $port = $opt->{'openoffice_port'} || 8100;

    my $cmd = [ 'openoffice', '-headless',
                "-accept=socket,port=$port;urp",
                #'-splash-pipe=5',
              ];

    local($SIG{CHLD}) = sub {};

    # run() signals a non-zero exit through its false return value and $?,
    # not through $@, so report the exit status.
    run( $cmd, \undef, '>/dev/null', '2>/dev/null' )
      or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n";

    #it isn't ready to run commands right away :(
    #it would be better if we could ping against UNO API somehow until ready...
    #sleep 5 unless $slept++;
}
1;