1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
package HTML::AutoConvert::poppler;
=head1 NAME

HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert

=head1 URL

poppler can be downloaded from http://poppler.freedesktop.org/

=cut
use strict;
use vars qw( %info );
use base 'HTML::AutoConvert::Run';
use File::Temp qw( tempdir );
use File::Slurp qw( slurp );
use IPC::Run qw( run timeout );
# Plugin metadata for this converter: the file type handled, a weight, a
# flag saying that images are returned, and the upstream project URL.
# NOTE(review): presumably read by HTML::AutoConvert when choosing a
# plugin -- confirm against the caller.
%info = (
    url            => 'http://poppler.freedesktop.org/',
    returns_images => 1,
    weight         => 10,
    types          => 'pdf',
);
# Returns the conversion command and its fixed argument: pdftohtml with
# -stdout.  A list literal is returned (not an array) so that a call in
# scalar context still yields the last element.
# NOTE(review): presumably consumed by HTML::AutoConvert::Run -- confirm.
sub program {
    return qw( pdftohtml -stdout );
}
#some false laziness "in spirit" w/OpenOffice.pm
# Convert $file to HTML by delegating to the parent class's html_convert.
#
# Options may be given either as a hash reference or as a key/value list;
# both forms are normalised to a hash reference before being passed on.
#
# In scalar context only the HTML is returned and no images are extracted.
# In list context the HTML is followed by whatever extract_images returns
# for the same file.
sub html_convert {
    my $self = shift;
    my $file = shift;

    my $opt;
    if ( ref( $_[0] ) ) {
        $opt = shift;
    }
    else {
        $opt = { @_ };
    }

    my $html = $self->SUPER::html_convert( $file, $opt );

    if ( wantarray ) {
        my @images = $self->extract_images( $file, $opt );
        return ( $html, @images );
    }

    return $html;
}
# Extract the images contained in $file by running pdfimages into a fresh
# temporary directory and reading back every file it wrote.
#
# Arguments: the invocant and the path of the file to process.  Any further
# arguments (an options hash reference or key/value list, as html_convert
# passes) are accepted and ignored.
#
# Returns a list with one [ $filename, $content ] array reference per
# output file, ordered by file name.  $filename is the bare file name with
# the leading "-" removed; $content is the file's raw bytes.  In scalar
# context the number of images is returned.
#
# Dies if pdfimages exits unsuccessfully (the message carries the exit
# status and stderr) or if the output directory cannot be read.  An expired
# timeout is reported by IPC::Run as an exception.
sub extract_images {
    my ( $self, $file ) = @_;

    # CLEANUP => 1 asks File::Temp to remove the directory at program exit.
    # The trailing slash makes pdfimages write its files inside the
    # directory, named "-NNN.ext".
    my $imgdir = tempdir( CLEANUP => 1 ) . '/';

    # Original note: some false laziness w/Run::html_convert.
    my @program = ('pdfimages');
    my $program = $program[0];
    my $timeout = 60;    # seconds; the original marks this value as a guess

    my ( $out, $err ) = ( '', '' );

    # No-op CHLD handler for the duration of the call.
    # NOTE(review): presumably guards against an inherited handler that
    # would interfere with IPC::Run collecting the child's status -- confirm.
    local $SIG{CHLD} = sub { };

    run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
        or die "$program failed with exit status " . ( $? >> 8 ) . ": $err\n";

    # List the directory with readdir rather than glob(): glob splits its
    # pattern on whitespace and expands metacharacters, so a temporary path
    # containing either would list the wrong files.  Dotfiles are skipped
    # and the names sorted to match what glob('*') produced.
    opendir( my $dh, $imgdir ) or die "can't read $imgdir: $!\n";
    my @names = sort( grep { !/^\./ } readdir($dh) );
    closedir($dh);

    return map {
        ( my $filename = $_ ) =~ s/^\-?//;

        # Images are binary data: read them in raw mode explicitly.
        [ $filename, scalar( slurp( "$imgdir$_", binmode => ':raw' ) ) ];
    } @names;
}
1;
|