package HTML::AutoConvert::poppler;

=head1 NAME

HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert

=head1 URL

poppler can be downloaded from http://poppler.freedesktop.org/

=cut

use strict;
use warnings;
use base 'HTML::AutoConvert::Run';
use File::Temp qw( tempdir );
use File::Slurp qw( slurp );
use IPC::Run qw( run timeout );

# Plugin metadata, kept as a package variable (%HTML::AutoConvert::poppler::info).
# NOTE(review): presumably read by HTML::AutoConvert when selecting a plugin;
# the consumer is not visible in this file.
our %info = (
  'types'          => 'pdf',
  'weight'         => 10,
  'returns_images' => 1,
  'url'            => 'http://poppler.freedesktop.org/',
);

# Returns the converter command and its fixed arguments as a list.
# NOTE(review): presumably consumed by HTML::AutoConvert::Run::html_convert.
sub program { ( 'pdftohtml', '-stdout' ) }

# html_convert( $file, \%opt | %opt )
#
# Converts $file via the parent class.  In scalar context returns just the
# HTML; in list context returns the HTML followed by the image entries
# produced by extract_images().
#
# some false laziness "in spirit" w/OpenOffice.pm
sub html_convert {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  my $html = $self->SUPER::html_convert($file, $opt);
  return $html unless wantarray;

  my @images = $self->extract_images($file, $opt);

  return ( $html, @images );
}

# extract_images( $file, \%opt | %opt )
#
# Runs pdfimages on $file, writing into a fresh temporary directory, and
# returns a list of [ $filename, $contents ] array refs, one per extracted
# file, sorted by filename.  Dies if pdfimages exits unsuccessfully or the
# output directory cannot be read; an IPC::Run timeout exception propagates
# to the caller.
sub extract_images {
  my( $self, $file ) = ( shift, shift );
  # Options are accepted in the same forms as html_convert but are not
  # currently used here.
  my $opt = ref($_[0]) ? shift : { @_ };

  # Trailing slash: pdfimages treats this as the "image root" prefix, so the
  # files land inside the directory with names that start with a dash.
  my $imgdir = tempdir( CLEANUP=>1 ).'/';

  #some false laziness w/Run::html_convert :(
  my @program = ( 'pdfimages' );
  my $program = $program[0];

  my $timeout = 60; #?

  my( $out, $err ) = ( '', '');
  # NOTE(review): presumably installed so that a caller's SIGCHLD disposition
  # cannot interfere with collecting the child's exit status -- confirm.
  local($SIG{CHLD}) = sub {};
  run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";

  # List the directory with readdir rather than glob(): glob splits its
  # pattern on whitespace and interprets metacharacters, so a temp path
  # containing either would silently yield the wrong file list.
  opendir( my $dh, $imgdir )
    or die "can't read image directory $imgdir: $!\n";
  my @entries = sort grep { !/^\./ } readdir($dh);
  closedir($dh);

  return map {
    my $path = $imgdir . $_;
    ( my $filename = $_ ) =~ s/^-//;   # drop the dash left by the empty image root
    # Images are binary data: read them raw so no newline translation occurs.
    [ $filename, scalar( slurp( $path, binmode => ':raw' ) ) ];
  } @entries;
}

1;