package HTML::AutoConvert::poppler;
=head1 NAME

HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert

=head1 URL

poppler can be downloaded from http://poppler.freedesktop.org/

=cut
use strict;
use vars qw( %info );
use base 'HTML::AutoConvert::Run';
use File::Temp qw( tempdir );
use File::Slurp qw( slurp );
use IPC::Run qw( run timeout );
# Plugin metadata for this converter.
# NOTE(review): presumably read by HTML::AutoConvert when selecting a
# converter for a file type -- confirm against the caller.
%info = (
  types          => 'pdf',
  weight         => 10,
  returns_images => 1,
  url            => 'http://poppler.freedesktop.org/',
);
# Returns the converter command followed by its fixed argument, as a list:
# pdftohtml invoked with -stdout.
sub program {
  return qw( pdftohtml -stdout );
}
#some false laziness "in spirit" w/OpenOffice.pm
# Convert a file to HTML and, when called in list context, also return
# the images extracted from it.
#
# Arguments: the file name, followed by options given either as a single
# hash reference or as a key/value list.
#
# Scalar context: returns the HTML produced by the parent class's
# html_convert.
# List context: returns that HTML followed by whatever extract_images
# returns for the same file and options.
sub html_convert {
  my( $self, $file, @args ) = @_;

  # A leading reference is taken as the options hash as-is; otherwise the
  # remaining arguments are folded into a new hash.
  my $opt = ref($args[0]) ? $args[0] : { @args };

  my $html = $self->SUPER::html_convert($file, $opt);

  if ( wantarray ) {
    return ( $html, $self->extract_images($file, $opt) );
  }

  return $html;
}
# Extract the images embedded in a file by running pdfimages into a
# private temporary directory, then reading every file it produced.
#
# Arguments: the file name, followed by options given either as a single
# hash reference or as a key/value list (parsed but not consulted here).
#
# Returns a list of [ $filename, $content ] array references, one per
# extracted file, sorted by file name. $filename has its directory and any
# leading '-' removed; $content is the raw bytes of the file.
#
# Dies if pdfimages exits unsuccessfully, if the output directory cannot
# be read, or if slurp fails on one of the files.
sub extract_images {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };   # currently unused

  # The trailing slash makes the directory itself the output prefix handed
  # to pdfimages, so the generated files land inside it.
  my $imgdir = tempdir( CLEANUP=>1 ).'/';

  #some false laziness w/Run::html_convert :(
  my @program = ( 'pdfimages' );
  my $program = $program[0];
  my $timeout = 60; #?

  my( $out, $err ) = ( '', '');
  local($SIG{CHLD}) = sub {};
  run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";

  # Read the directory directly instead of glob("$imgdir*"): glob splits
  # its pattern on whitespace and interprets metacharacters, so a temporary
  # directory whose path contains a space would silently match nothing.
  opendir( my $dh, $imgdir ) or die "can't read directory $imgdir: $!\n";
  my @names = sort grep { !/^\./ } readdir($dh);   # skip dot entries, as '*' did
  closedir($dh);

  map {
    ( my $filename = $_ ) =~ s/^\-?//;
    # Images are binary data: read them raw so no line-ending translation
    # is applied on platforms that would otherwise do so.
    [ $filename, scalar(slurp("$imgdir$_", binmode => ':raw')) ];
  }
  @names;
}
1;