summaryrefslogtreecommitdiff
path: root/lib/HTML/AutoConvert/poppler.pm
blob: a75a54fe9fe2df6f22f05256f208b15cc319e0dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
package HTML::AutoConvert::poppler;

=head1 NAME

HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert

=head1 URL

poppler can be downloaded from L<http://poppler.freedesktop.org/>.

=cut

use strict;
use vars qw( %info );
use base 'HTML::AutoConvert::Run';
use File::Temp qw( tempdir );
use File::Slurp qw( slurp );
use IPC::Run qw( run timeout );

#Plugin metadata, stored in the package variable %info.
#NOTE(review): the meaning of each key is defined by the HTML::AutoConvert
#framework, which is not visible in this file; the per-key remarks below are
#presumed from the key names -- confirm against the framework.
%info = (
  types          => 'pdf', #presumably the input type(s) this plugin handles
  weight         => 10,    #presumably a relative ranking among plugins
  returns_images => 1,     #html_convert also returns images in list context
  url            => 'http://poppler.freedesktop.org/',
);

#Command line (program name followed by its fixed arguments) for the HTML
#conversion itself.
#NOTE(review): presumably consumed by the HTML::AutoConvert::Run base class,
#which is not visible here -- confirm.
sub program {
  return qw( pdftohtml -stdout );
}

#some false laziness "in spirit" w/OpenOffice.pm
#Convert a PDF file to HTML.
#
#Arguments: the path of the file, optionally followed by either an options
#hashref or a flat key/value list (which is collected into a hashref).
#
#The conversion itself is delegated to the parent class.  In scalar context
#only the HTML is returned; in list context the HTML is followed by whatever
#extract_images returns for the same file and options.
sub html_convert {
  my $self = shift;
  my $file = shift;

  #accept options as a hashref or as a plain list of pairs
  my $opt;
  if ( ref($_[0]) ) {
    $opt = shift;
  } else {
    $opt = { @_ };
  }

  my $html = $self->SUPER::html_convert($file, $opt);

  #images are only extracted when the caller can actually receive them
  if ( wantarray ) {
    return ( $html, $self->extract_images($file, $opt) );
  }

  return $html;
}

#Extract the images embedded in a PDF by running pdfimages into a temporary
#directory and reading back every file it produced.
#
#Arguments: the path of the PDF file.  Any further arguments (an options
#hashref or key/value list, as passed along by html_convert) are accepted
#but not consulted here.
#
#Returns a list of [ filename, content ] array references, one per extracted
#file, ordered by filename; the filename has its directory and any leading
#"-" removed.  Dies if pdfimages does not exit successfully or if the output
#directory cannot be read; the timeout and file-read helpers raise their own
#errors.
sub extract_images {
  my( $self, $file ) = ( shift, shift );

  #CLEANUP asks File::Temp to remove the directory when the program exits
  my $imgdir  = tempdir( CLEANUP=>1 );

  #pdfimages is given this string as its output prefix; the trailing slash
  #makes the generated files land inside the temporary directory
  my $imgroot = "$imgdir/";

  #some false laziness w/Run::html_convert :(
  my @program = ( 'pdfimages' );
  my $program = $program[0];

  my $timeout = 60; #seconds (arbitrary)

  my( $out, $err ) = ( '', '');
  #NOTE(review): presumably keeps a SIGCHLD handler installed by the caller
  #from interfering with IPC::Run's own child handling -- confirm
  local($SIG{CHLD}) = sub {};
  run( [ @program, $file, $imgroot ], \undef, \$out, \$err, timeout($timeout) )
    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";

  #List the directory with readdir rather than glob(): glob() splits its
  #pattern on whitespace and interprets metacharacters, so a temporary
  #directory whose path contains either would yield the wrong file list.
  #Dot entries are skipped, matching what a "*" glob pattern returned.
  opendir( my $dh, $imgdir ) or die "can't read directory $imgdir: $!\n";
  my @entries = grep { !/^\./ } readdir($dh);
  closedir($dh);

  #image data is binary, so read it raw to avoid any newline translation
  return map {
           ( my $filename = $_ ) =~ s/^-//;
           [ $filename, scalar( slurp( "$imgroot$_", binmode => ':raw' ) ) ];
         }
         sort @entries;

}

1;