use strict;
use vars qw( %info );
use base 'HTML::AutoConvert::Run';
+use File::Temp qw( tempdir );
+use File::Slurp qw( slurp );
+use IPC::Run qw( run timeout );
%info = (
- 'types' => 'pdf',
- 'weight' => 10,
- 'url' => 'http://poppler.freedesktop.org/',
+ 'types' => 'pdf',
+ 'weight' => 10,
+ 'returns_images' => 1, # presumably advertises that html_convert can also hand back images in list context -- confirm against the code that reads %info
+ 'url' => 'http://poppler.freedesktop.org/',
);
sub program { ( 'pdftohtml', '-stdout' ) } # returns the command name plus its fixed arguments; presumably executed by the base class's html_convert -- confirm
-#false laziness w/OpenOffice.pm
-#sub html_convert {
-# my( $self, $file ) = ( shift, shift );
-# my $opt = ref($_[0]) ? shift : { @_ };
-#
-# my $program = 'pdftohtml';
-#
-# my $timeout = 60; #?
-#
-# my($out, $err) = ( '', '' );
-# local($SIG{CHLD}) = sub {};
-# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
-# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-#
-# ( my $outfile = $file ) =~ s/\.pdf$/.html/i
-# or die "poppler.pm called with non-PDF file?!";
-#
-# my $html = slurp($outfile);
-#
-# $html;
-#
-#}
+#some false laziness "in spirit" w/OpenOffice.pm
+sub html_convert { # convert $file to HTML via the parent class; in list context also return the images extracted from it
+ my( $self, $file ) = ( shift, shift );
+ my $opt = ref($_[0]) ? shift : { @_ }; # options: a single reference, or a flat key/value list gathered into a hashref
+
+ my $html = $self->SUPER::html_convert($file, $opt);
+ return $html unless wantarray; # scalar or void context: HTML only, image extraction is skipped
+
+ my @images = $self->extract_images($file, $opt);
+
+ ( $html, @images); # list context: HTML first, then one [ filename, content ] pair per image
+}
+
+sub extract_images { # run pdfimages on $file and return a list of [ filename, content ] pairs, one per file it wrote
+ my( $self, $file ) = ( shift, shift );
+ my $opt = ref($_[0]) ? shift : { @_ }; # accepted as a reference or a flat key/value list, but not used below
+
+ my $imgdir = tempdir( CLEANUP=>1 ).'/'; # scratch directory (CLEANUP asks File::Temp to remove it); trailing slash so it can serve as a path prefix
+
+ #some false laziness w/Run::html_convert :(
+ my @program = ( 'pdfimages' );
+ my $program = $program[0]; # bare command name, used only in the error message
+
+ my $timeout = 60; #? seconds -- handed to IPC::Run's timeout() below
+
+ my( $out, $err ) = ( '', '');
+ local($SIG{CHLD}) = sub {}; # no-op CHLD handler for the duration of this call; presumably keeps a caller's handler away from the child -- confirm
+ run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) ) # $imgdir is passed as the output prefix; stdout and stderr are captured
+ or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+ map { # builds the return value: one pair per file found in the scratch directory
+ ( my $filename = $_ ) =~ s/^.*\/\-?//; # strip the directory part and one leading '-', if any
+ [ $filename, scalar(slurp($_)) ]; # scalar() makes slurp return the whole file as one string
+ }
+ glob("$imgdir*"); # NOTE(review): glob splits its pattern on whitespace, so a temp path containing spaces would break this
+
+}
1;