add image handling and stop leaking temporary files ourselves (Archive::Zip might still be leaking some)

author: ivan <ivan> 2008-08-12 04:02:02 +0000
committer: ivan <ivan> 2008-08-12 04:02:02 +0000
commit: 9fbeda1dc776c602ce14d3874368d4620c079b60 (patch)
tree: 1c4eeceb1a881caa99dd2e2d600c0c637bb87df2
parent: cba80d78f46ea7541c37efd54262ab1c0dff67e9 (diff)
10 files changed, 237 insertions, 50 deletions
diff --git a/MANIFEST b/MANIFEST
index 173ec7a..36a7492 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -20,6 +20,9 @@ t/04-doc-OpenOffice.t
 t/14-rtf-OpenOffice.t
 t/15-rtf-unrtf.t
 t/26-pdf-poppler.t
+t/34-doc_images-OpenOffice.t
+t/46-pdf_images-poppler.t
+t/attitude.pdf
 t/DiaryofaKillerCat.doc
+t/HeatherElko.doc
 t/VEGAN_RECIPES.rtf
-t/attitude.pdf
diff --git a/Makefile.PL b/Makefile.PL
index cd7c009..18af759 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -12,9 +12,10 @@ WriteMakefile(
     'INSTALLSCRIPT'     => '/usr/local/bin',
     'INSTALLSITEBIN'    => '/usr/local/bin',
     PREREQ_PM => {
-        'Test::More'  => 0,
-        'IPC::Run'    => 0,
-        'File::Slurp' => 0,
+        'Test::More'   => 0,
+        'IPC::Run'     => 0,
+        'File::Slurp'  => 0,
+        'Archive::Zip' => 0,
     },
     dist                => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
     clean               => { FILES => 'HTML-AutoConvert-*' },
diff --git a/TODO b/TODO
index 0058ab2..2969e53 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,13 @@
-- DOC: images
-- PDF: images
-- RTF: images
+- add the ability to suppress starting our own OO and connect to one running
+  elsewhere
+
+- OpenOffice.pm: image converter seems to be leaving behind images in /tmp...
+                 Archive::Zip?
+
+- auto-convert non-web images to jpg/gif/png?
+
+- wvWare/other backends besides OO and poppler: handle images?
+
+- OpenOffice.pm: poll via UNO to determine readiness rather than sleep (or not)
 
-- OpenOffice.pm: poll via UNO to determine readiness rather than sleep
 - OpenOffice.pm: convert DocumentConverter.py to Perl using OpenOffice::UNO
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm
index 7df3b82..bdbc5fd 100644
--- a/lib/HTML/AutoConvert.pm
+++ b/lib/HTML/AutoConvert.pm
@@ -23,6 +23,8 @@ our $VERSION = '0.01';
     #or to turn on debugging
     my $converter = HTML::AutoConvert->new('debug'=>1);
 
+    my $html = $converter->html_convert( $file );
+    # OR 
     my( $html, @images ) = $converter->html_convert( $file );
 
     #turn on or off debugging later
@@ -58,7 +60,22 @@ sub new {
 
 =head2 html_convert FILENAME
 
-Convert the given filename to HTML.  The HTML output is returned as a scalar.
+Convert the given filename to HTML.
+
+In a scalar context, simply returns the HTML output as a scalar.
+
+    my $html = $converter->html_convert( $file );
+
+In a list context, returns a list consisting of the HTML output as a scalar,
+followed by references for each image extracted, if any.  Each image reference
+is a list reference consisting of two elements: the first is the filename and
+the second is the image itself.
+
+    my( $html, @images ) = $converter->html_convert( $file );
+    foreach my $image ( @images ) {
+      my( $filename, $data ) = @$image;
+      #...
+    }
 
 =cut
 
@@ -72,10 +89,21 @@ sub html_convert {
     or die "no registered handlers for filetype ". $self->filetype( $file );
 
   my( $converted, $html, $errors ) = ( 0, '', '' );
+  my @imgs = ();
   foreach my $handler ( @handlers ) {
 
     my $module = 'HTML::AutoConvert::'. $handler->{'module'};
-    my $tmp_html = eval { $module->html_convert( $self->{'file'} ) };
+
+    my $tmp_html = '';
+    my @tmp_imgs = ();
+    if ( $handler->{'returns_images'} && wantarray ) {
+      ( $tmp_html, @tmp_imgs ) =
+        eval { $module->html_convert( $self->{'file'} ) };
+    } else {
+      $tmp_html =
+        eval { $module->html_convert( $self->{'file'} ) };
+    }
+
     if ( $@ ) {
        my $tmp_err = "conversion with $module failed: $@\n";
        warn $tmp_err if $self->{'debug'};
@@ -85,12 +113,17 @@ sub html_convert {
 
     $converted = 1;
     $html = $tmp_html;
+    @imgs = @tmp_imgs;
     last;
   }
 
   die "couldn't convert $file:\n$errors" unless $converted;
 
-  $html;
+  if ( wantarray ) {
+    ( $html, @imgs );
+  } else {
+    $html;
+  }
 
 }
 
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm
index e09a9e4..7b35595 100644
--- a/lib/HTML/AutoConvert/OpenOffice.pm
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -34,24 +34,66 @@ use strict;
 use vars qw( %info ); #$slept );
 use IPC::Run qw( run timeout io );
 use File::Slurp qw( slurp );
+use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
 
 %info = (
-  'types'   => [qw( doc rtf odt sxw )],
-  'weight'  => 80,
-  'url'     => 'http://wvware.sourceforge.net/',
+  'types'          => [qw( doc rtf odt sxw )],
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://www.openoffice.org/',
 );
 
 #$slept = 0;
 
 #sub program { ( 'openoffice', '-headless' ); }
 
-#half-ass using DocumentConverter.py for now
-#need to recode with OpenOffice::UNO
-
 sub html_convert {
   my( $self, $file ) = ( shift, shift );
   my $opt = ref($_[0]) ? shift : { @_ };
 
+  my $outfile = $self->odconvert($file, 'html');
+  my $html = slurp($outfile);
+  unlink($outfile) or warn "can't unlink $outfile";
+
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images );
+
+}
+
+#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $zipfile = $self->odconvert($file, 'odt');
+  my $zip = Archive::Zip->new();
+
+  unless ( $zip->read( $zipfile ) == AZ_OK ) {
+    die "error reading $zipfile for images";
+  }
+
+  my @members = $zip->membersMatching( '^Pictures/*' );
+
+  my @images = map {
+                     ( my $filename = $_->fileName ) =~ s/^.*\///;
+                     [ $filename, $zip->contents($_) ];
+                   }
+                   @members;
+
+  unlink($zipfile);
+
+  @images;
+}
+
+#half-ass using DocumentConverter.py for now
+#need to recode with OpenOffice::UNO
+sub odconvert {
+  my( $self, $file, $suffix ) = ( shift, shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
   $self->start_openoffice($opt);
 
   my $program = 'DocumentConverter.py';
@@ -59,20 +101,27 @@ sub html_convert {
   my $timeout = 60; #?
 
   use File::Temp qw/ tempfile /;
-  my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
   #hmm, it gets overwritten so $fh is bunk
 
   my($out, $err) = ( '', '' );
   local($SIG{CHLD}) = sub {};
-  run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
-    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-
-  my $html = slurp($outfile);
-
-  $html;
-
+  eval {
+    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+      or do {
+              unlink($outfile) or warn "$!\n";
+              die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+            };
+  };
+  if ( $@ ) {
+    unlink($outfile) or warn "$!\n";
+    die "$program failed: $@\n";
+  }
+
+  $outfile;
 }
 
+
 sub start_openoffice {
   my( $self ) = ( shift, shift );
   my $opt = ref($_[0]) ? shift : { @_ };
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm
index cca5b0d..a75a54f 100644
--- a/lib/HTML/AutoConvert/poppler.pm
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -13,36 +13,55 @@ poppler can be downloaded from http://poppler.freedesktop.org/
 use strict;
 use vars qw( %info );
 use base 'HTML::AutoConvert::Run';
+use File::Temp qw( tempdir );
+use File::Slurp qw( slurp );
+use IPC::Run qw( run timeout );
 
 %info = (
-  'types'   => 'pdf',
-  'weight'  => 10,
-  'url'     => 'http://poppler.freedesktop.org/',
+  'types'          => 'pdf',
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://poppler.freedesktop.org/',
 );
 
 sub program { ( 'pdftohtml', '-stdout' ) }
 
-#false laziness w/OpenOffice.pm
-#sub html_convert {
-#  my( $self, $file ) = ( shift, shift );
-#  my $opt = ref($_[0]) ? shift : { @_ };
-#
-#  my $program = 'pdftohtml';
-#
-#  my $timeout = 60; #?
-#
-#  my($out, $err) = ( '', '' );
-#  local($SIG{CHLD}) = sub {};
-#  run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
-#    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-#
-#  ( my $outfile = $file ) =~ s/\.pdf$/.html/i
-#    or die "poppler.pm called with non-PDF file?!";
-#
-#  my $html = slurp($outfile);
-#
-#  $html;
-#
-#}
+#some false laziness "in spirit" w/OpenOffice.pm
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $html = $self->SUPER::html_convert($file, $opt);
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images);
+}
+
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $imgdir = tempdir( CLEANUP=>1 ).'/';
+
+  #some false laziness w/Run::html_convert :(
+  my @program = ( 'pdfimages' );
+  my $program = $program[0];
+
+  my $timeout = 60; #?
+
+  my( $out, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  map {
+        ( my $filename = $_ ) =~ s/^.*\/\-?//;
+        [ $filename, scalar(slurp($_)) ];
+      }
+      glob("$imgdir*");
+
+}
 
 1;
diff --git a/t/01-doc.t b/t/01-doc.t
index cbb8286..a2a788d 100644
--- a/t/01-doc.t
+++ b/t/01-doc.t
@@ -1,5 +1,7 @@
 #!perl
 
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
 use Test::More tests => 2;
 
 use HTML::AutoConvert;
diff --git a/t/34-doc_images-OpenOffice.t b/t/34-doc_images-OpenOffice.t
new file mode 100644
index 0000000..ba32b2d
--- /dev/null
+++ b/t/34-doc_images-OpenOffice.t
@@ -0,0 +1,35 @@
+#!perl
+
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
+use Test::More tests => 5;
+
+use HTML::AutoConvert;
+
+my $c = new HTML::AutoConvert;
+
+my $force = 'OpenOffice';
+#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'doc'} };
+delete($c->{'handlers'}{'doc'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/HeatherElko.doc');
+
+ok( scalar(@images) == 2, 'got two images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '10000000000000C80000009688B0FEF3.png', '1st image name');
+ok( length($images[0]->[1]) == 8704, '1st image size');
+
+is( $images[1]->[0], '100000000000009D0000009F54B4BCB3.png', '2nd image name');
+ok( length($images[1]->[1]) == 2125, '2nd image size');
+
+
diff --git a/t/46-pdf_images-poppler.t b/t/46-pdf_images-poppler.t
new file mode 100644
index 0000000..9bc1fc1
--- /dev/null
+++ b/t/46-pdf_images-poppler.t
@@ -0,0 +1,38 @@
+#!perl
+
+use Test::More tests => 9;
+
+use HTML::AutoConvert;
+
+my $c = new HTML::AutoConvert;
+
+my $force = 'poppler';
+#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'pdf'} };
+delete($c->{'handlers'}{'pdf'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/attitude.pdf');
+
+ok( scalar(@images) == 21, 'got 21 images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '000.ppm', '1st image name');
+ok( length($images[0]->[1]) == 25949, '1st image size');
+
+is( $images[1]->[0], '001.ppm', '2nd image name');
+ok( length($images[1]->[1]) == 43664, '2nd image size');
+
+is( $images[2]->[0], '002.ppm', '3rd image name');
+ok( length($images[2]->[1]) == 46833, '3rd image size');
+
+is( $images[9]->[0], '009.ppm', '10th image name');
+ok( length($images[9]->[1]) == 46374, '10th image size');
+
diff --git a/t/HeatherElko.doc b/t/HeatherElko.doc
new file mode 100644
index 0000000..4af5af7
--- /dev/null
+++ b/t/HeatherElko.doc
author	ivan <ivan>	2008-08-12 04:02:02 +0000
committer	ivan <ivan>	2008-08-12 04:02:02 +0000
commit	9fbeda1dc776c602ce14d3874368d4620c079b60 (patch)
tree	1c4eeceb1a881caa99dd2e2d600c0c637bb87df2
parent	cba80d78f46ea7541c37efd54262ab1c0dff67e9 (diff)