add image handling and stop leaking temporary files ourselves (Archive::Zip might still be leaking some)

author ivan <ivan>

Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)

committer ivan <ivan>

Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)
author ivan <ivan>
Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)
committer ivan <ivan>
Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)
diff --git a/MANIFEST b/MANIFEST

index 173ec7a..36a7492 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -20,6 +20,9 @@ t/04-doc-OpenOffice.t
  t/14-rtf-OpenOffice.t
  t/15-rtf-unrtf.t
  t/26-pdf-poppler.t
+t/34-doc_images-OpenOffice.t
+t/46-pdf_images-poppler.t
+t/attitude.pdf
  t/DiaryofaKillerCat.doc
+t/HeatherElko.doc
  t/VEGAN_RECIPES.rtf
-t/attitude.pdf
diff --git a/Makefile.PL b/Makefile.PL

index cd7c009..18af759 100644 (file)
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -12,9 +12,10 @@ WriteMakefile(
      'INSTALLSCRIPT'     => '/usr/local/bin',
      'INSTALLSITEBIN'    => '/usr/local/bin',
      PREREQ_PM => {
-        'Test::More'  => 0,
-        'IPC::Run'    => 0,
-        'File::Slurp' => 0,
+        'Test::More'   => 0,
+        'IPC::Run'     => 0,
+        'File::Slurp'  => 0,
+        'Archive::Zip' => 0,
      },
      dist                => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
      clean               => { FILES => 'HTML-AutoConvert-*' },
diff --git a/TODO b/TODO

index 0058ab2..2969e53 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,6 +1,13 @@
-- DOC: images
-- PDF: images
-- RTF: images
+- add the ability to suppress starting our own OO and connect to one running
+  elsewhere
+
+- OpenOffice.pm: image converter seems to be leaving behind images in /tmp...
+                 Archive::Zip?
+
+- auto-convert non-web images to jpg/gif/png?
+
+- wvWare/other backends besides OO and poppler: handle images?
+
+- OpenOffice.pm: poll via UNO to determine readiness rather than sleep (or not)
  
-- OpenOffice.pm: poll via UNO to determine readiness rather than sleep
  - OpenOffice.pm: convert DocumentConverter.py to Perl using OpenOffice::UNO
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm

index 7df3b82..bdbc5fd 100644 (file)
--- a/lib/HTML/AutoConvert.pm
+++ b/lib/HTML/AutoConvert.pm
@@ -23,6 +23,8 @@ our $VERSION = '0.01';
      #or to turn on debugging
      my $converter = HTML::AutoConvert->new('debug'=>1);
  
+    my $html = $converter->html_convert( $file );
+    # OR 
      my( $html, @images ) = $converter->html_convert( $file );
  
      #turn on or off debugging later
@@ -58,7 +60,22 @@ sub new {
  
  =head2 html_convert FILENAME
  
-Convert the given filename to HTML.  The HTML output is returned as a scalar.
+Convert the given filename to HTML.
+
+In a scalar context, simply returns the HTML output as a scalar.
+
+    my $html = $converter->html_convert( $file );
+
+In a list context, returns a list consisting of the HTML output as a scalar,
+followed by references for each image extracted, if any.  Each image reference
+is a list reference consisting of two elements: the first is the filename and
+the second is the image itself.
+
+    my( $html, @images ) = $converter->html_convert( $file );
+    foreach my $image ( @images ) {
+      my( $filename, $data ) = @$image;
+      #...
+    }
  
  =cut
  
@@ -72,10 +89,21 @@ sub html_convert {
      or die "no registered handlers for filetype ". $self->filetype( $file );
  
    my( $converted, $html, $errors ) = ( 0, '', '' );
+  my @imgs = ();
    foreach my $handler ( @handlers ) {
  
      my $module = 'HTML::AutoConvert::'. $handler->{'module'};
-    my $tmp_html = eval { $module->html_convert( $self->{'file'} ) };
+
+    my $tmp_html = '';
+    my @tmp_imgs = ();
+    if ( $handler->{'returns_images'} && wantarray ) {
+      ( $tmp_html, @tmp_imgs ) =
+        eval { $module->html_convert( $self->{'file'} ) };
+    } else {
+      $tmp_html =
+        eval { $module->html_convert( $self->{'file'} ) };
+    }
+
      if ( $@ ) {
         my $tmp_err = "conversion with $module failed: $@\n";
         warn $tmp_err if $self->{'debug'};
@@ -85,12 +113,17 @@ sub html_convert {
  
      $converted = 1;
      $html = $tmp_html;
+    @imgs = @tmp_imgs;
      last;
    }
  
    die "couldn't convert $file:\n$errors" unless $converted;
  
-  $html;
+  if ( wantarray ) {
+    ( $html, @imgs );
+  } else {
+    $html;
+  }
  
  }
  
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm

index e09a9e4..7b35595 100644 (file)
--- a/lib/HTML/AutoConvert/OpenOffice.pm
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -34,24 +34,66 @@ use strict;
  use vars qw( %info ); #$slept );
  use IPC::Run qw( run timeout io );
  use File::Slurp qw( slurp );
+use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
  
  %info = (
-  'types'   => [qw( doc rtf odt sxw )],
-  'weight'  => 80,
-  'url'     => 'http://wvware.sourceforge.net/',
+  'types'          => [qw( doc rtf odt sxw )],
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://www.openoffice.org/',
  );
  
  #$slept = 0;
  
  #sub program { ( 'openoffice', '-headless' ); }
  
-#half-ass using DocumentConverter.py for now
-#need to recode with OpenOffice::UNO
-
  sub html_convert {
    my( $self, $file ) = ( shift, shift );
    my $opt = ref($_[0]) ? shift : { @_ };
  
+  my $outfile = $self->odconvert($file, 'html');
+  my $html = slurp($outfile);
+  unlink($outfile) or warn "can't unlink $outfile";
+
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images );
+
+}
+
+#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $zipfile = $self->odconvert($file, 'odt');
+  my $zip = Archive::Zip->new();
+
+  unless ( $zip->read( $zipfile ) == AZ_OK ) {
+    die "error reading $zipfile for images";
+  }
+
+  my @members = $zip->membersMatching( '^Pictures/*' );
+
+  my @images = map {
+                     ( my $filename = $_->fileName ) =~ s/^.*\///;
+                     [ $filename, $zip->contents($_) ];
+                   }
+                   @members;
+
+  unlink($zipfile);
+
+  @images;
+}
+
+#half-ass using DocumentConverter.py for now
+#need to recode with OpenOffice::UNO
+sub odconvert {
+  my( $self, $file, $suffix ) = ( shift, shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
    $self->start_openoffice($opt);
  
    my $program = 'DocumentConverter.py';
@@ -59,20 +101,27 @@ sub html_convert {
    my $timeout = 60; #?
  
    use File::Temp qw/ tempfile /;
-  my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
    #hmm, it gets overwritten so $fh is bunk
  
    my($out, $err) = ( '', '' );
    local($SIG{CHLD}) = sub {};
-  run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
-    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-
-  my $html = slurp($outfile);
-
-  $html;
-
+  eval {
+    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+      or do {
+              unlink($outfile) or warn "$!\n";
+              die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+            };
+  };
+  if ( $@ ) {
+    unlink($outfile) or warn "$!\n";
+    die "$program failed: $@\n";
+  }
+
+  $outfile;
  }
  
+
  sub start_openoffice {
    my( $self ) = ( shift, shift );
    my $opt = ref($_[0]) ? shift : { @_ };
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm

index cca5b0d..a75a54f 100644 (file)
--- a/lib/HTML/AutoConvert/poppler.pm
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -13,36 +13,55 @@ poppler can be downloaded from http://poppler.freedesktop.org/
  use strict;
  use vars qw( %info );
  use base 'HTML::AutoConvert::Run';
+use File::Temp qw( tempdir );
+use File::Slurp qw( slurp );
+use IPC::Run qw( run timeout );
  
  %info = (
-  'types'   => 'pdf',
-  'weight'  => 10,
-  'url'     => 'http://poppler.freedesktop.org/',
+  'types'          => 'pdf',
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://poppler.freedesktop.org/',
  );
  
  sub program { ( 'pdftohtml', '-stdout' ) }
  
-#false laziness w/OpenOffice.pm
-#sub html_convert {
-#  my( $self, $file ) = ( shift, shift );
-#  my $opt = ref($_[0]) ? shift : { @_ };
-#
-#  my $program = 'pdftohtml';
-#
-#  my $timeout = 60; #?
-#
-#  my($out, $err) = ( '', '' );
-#  local($SIG{CHLD}) = sub {};
-#  run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
-#    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-#
-#  ( my $outfile = $file ) =~ s/\.pdf$/.html/i
-#    or die "poppler.pm called with non-PDF file?!";
-#
-#  my $html = slurp($outfile);
-#
-#  $html;
-#
-#}
+#some false laziness "in spirit" w/OpenOffice.pm
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $html = $self->SUPER::html_convert($file, $opt);
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images);
+}
+
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $imgdir = tempdir( CLEANUP=>1 ).'/';
+
+  #some false laziness w/Run::html_convert :(
+  my @program = ( 'pdfimages' );
+  my $program = $program[0];
+
+  my $timeout = 60; #?
+
+  my( $out, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  map {
+        ( my $filename = $_ ) =~ s/^.*\/\-?//;
+        [ $filename, scalar(slurp($_)) ];
+      }
+      glob("$imgdir*");
+
+}
  
  1;
diff --git a/t/01-doc.t b/t/01-doc.t

index cbb8286..a2a788d 100644 (file)
--- a/t/01-doc.t
+++ b/t/01-doc.t
@@ -1,5 +1,7 @@
  #!perl
  
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
  use Test::More tests => 2;
  
  use HTML::AutoConvert;
diff --git a/t/34-doc_images-OpenOffice.t b/t/34-doc_images-OpenOffice.t

new file mode 100644 (file)

index 0000000..ba32b2d
--- /dev/null
+++ b/t/34-doc_images-OpenOffice.t
@@ -0,0 +1,35 @@
+#!perl
+
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
+use Test::More tests => 5;
+
+use HTML::AutoConvert;
+
+my $c = new HTML::AutoConvert;
+
+my $force = 'OpenOffice';
+#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'doc'} };
+delete($c->{'handlers'}{'doc'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/HeatherElko.doc');
+
+ok( scalar(@images) == 2, 'got two images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '10000000000000C80000009688B0FEF3.png', '1st image name');
+ok( length($images[0]->[1]) == 8704, '1st image size');
+
+is( $images[1]->[0], '100000000000009D0000009F54B4BCB3.png', '2nd image name');
+ok( length($images[1]->[1]) == 2125, '2nd image size');
+
+
diff --git a/t/46-pdf_images-poppler.t b/t/46-pdf_images-poppler.t

new file mode 100644 (file)

index 0000000..9bc1fc1
--- /dev/null
+++ b/t/46-pdf_images-poppler.t
@@ -0,0 +1,38 @@
+#!perl
+
+use Test::More tests => 9;
+
+use HTML::AutoConvert;
+
+my $c = new HTML::AutoConvert;
+
+my $force = 'poppler';
+#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'pdf'} };
+delete($c->{'handlers'}{'pdf'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/attitude.pdf');
+
+ok( scalar(@images) == 21, 'got 21 images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '000.ppm', '1st image name');
+ok( length($images[0]->[1]) == 25949, '1st image size');
+
+is( $images[1]->[0], '001.ppm', '2nd image name');
+ok( length($images[1]->[1]) == 43664, '1st image size');
+
+is( $images[2]->[0], '002.ppm', '3rd image name');
+ok( length($images[2]->[1]) == 46833, '1st image size');
+
+is( $images[9]->[0], '009.ppm', '10th image name');
+ok( length($images[9]->[1]) == 46374, '10th image size');
+
diff --git a/t/HeatherElko.doc b/t/HeatherElko.doc

new file mode 100644 (file)

index 0000000..4af5af7

Binary files /dev/null and b/t/HeatherElko.doc differ
author	ivan <ivan>
	Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)
committer	ivan <ivan>
	Tue, 12 Aug 2008 04:02:02 +0000 (04:02 +0000)
MANIFEST		patch \| blob \| history
Makefile.PL		patch \| blob \| history
TODO		patch \| blob \| history
lib/HTML/AutoConvert.pm		patch \| blob \| history
lib/HTML/AutoConvert/OpenOffice.pm		patch \| blob \| history
lib/HTML/AutoConvert/poppler.pm		patch \| blob \| history
t/01-doc.t		patch \| blob \| history
t/34-doc_images-OpenOffice.t	[new file with mode: 0644]	patch \| blob
t/46-pdf_images-poppler.t	[new file with mode: 0644]	patch \| blob
t/HeatherElko.doc	[new file with mode: 0644]	patch \| blob