3 files changed, 143 insertions, 42 deletions
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm
index 7df3b82..bdbc5fd 100644
--- a/lib/HTML/AutoConvert.pm
+++ b/lib/HTML/AutoConvert.pm
@@ -23,6 +23,8 @@ our $VERSION = '0.01';
     #or to turn on debugging
     my $converter = HTML::AutoConvert->new('debug'=>1);
 
+    my $html = $converter->html_convert( $file );
+    # or, in list context, also collect any extracted images:
     my( $html, @images ) = $converter->html_convert( $file );
 
     #turn on or off debugging later
@@ -58,7 +60,22 @@ sub new {
 
 =head2 html_convert FILENAME
 
-Convert the given filename to HTML.  The HTML output is returned as a scalar.
+Convert the given filename to HTML.
+
+In a scalar context, simply returns the HTML output as a scalar.
+
+    my $html = $converter->html_convert( $file );
+
+In a list context, returns the HTML output as a scalar, followed by one array
+reference for each image extracted, if any.  Each array reference holds two
+elements: the first is the image's filename (without any directory part) and
+the second is the image data itself.
+
+    my( $html, @images ) = $converter->html_convert( $file );
+    foreach my $image ( @images ) {
+      my( $filename, $data ) = @$image;
+      #...
+    }
 
 =cut
 
@@ -72,10 +89,21 @@ sub html_convert {
     or die "no registered handlers for filetype ". $self->filetype( $file );
 
   my( $converted, $html, $errors ) = ( 0, '', '' );
+  my @imgs = ();
   foreach my $handler ( @handlers ) {
 
     my $module = 'HTML::AutoConvert::'. $handler->{'module'};
-    my $tmp_html = eval { $module->html_convert( $self->{'file'} ) };
+
+    my $tmp_html = '';
+    my @tmp_imgs = ();
+    if ( $handler->{'returns_images'} && wantarray ) {
+      ( $tmp_html, @tmp_imgs ) =
+        eval { $module->html_convert( $self->{'file'} ) };
+    } else {
+      $tmp_html =
+        eval { $module->html_convert( $self->{'file'} ) };
+    }
+
     if ( $@ ) {
        my $tmp_err = "conversion with $module failed: $@\n";
        warn $tmp_err if $self->{'debug'};
@@ -85,12 +113,17 @@ sub html_convert {
 
     $converted = 1;
     $html = $tmp_html;
+    @imgs = @tmp_imgs;
     last;
   }
 
   die "couldn't convert $file:\n$errors" unless $converted;
 
-  $html;
+  if ( wantarray ) {
+    ( $html, @imgs );
+  } else {
+    $html;
+  }
 
 }
 
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm
index e09a9e4..7b35595 100644
--- a/lib/HTML/AutoConvert/OpenOffice.pm
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -34,24 +34,66 @@ use strict;
 use vars qw( %info ); #$slept );
 use IPC::Run qw( run timeout io );
 use File::Slurp qw( slurp );
+use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
 
 %info = (
-  'types'   => [qw( doc rtf odt sxw )],
-  'weight'  => 80,
-  'url'     => 'http://wvware.sourceforge.net/',
+  'types'          => [qw( doc rtf odt sxw )],
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://www.openoffice.org/',
 );
 
 #$slept = 0;
 
 #sub program { ( 'openoffice', '-headless' ); }
 
-#half-ass using DocumentConverter.py for now
-#need to recode with OpenOffice::UNO
-
 sub html_convert {
   my( $self, $file ) = ( shift, shift );
   my $opt = ref($_[0]) ? shift : { @_ };
 
+  my $outfile = $self->odconvert($file, 'html');
+  my $html = slurp($outfile);
+  unlink($outfile) or warn "can't unlink $outfile";
+
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images );
+
+}
+
+#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $zipfile = $self->odconvert($file, 'odt');
+  my $zip = Archive::Zip->new();
+
+  unless ( $zip->read( $zipfile ) == AZ_OK ) {
+    die "error reading $zipfile for images";
+  }
+
+  my @members = $zip->membersMatching( '^Pictures/*' );
+
+  my @images = map {
+                     ( my $filename = $_->fileName ) =~ s/^.*\///;
+                     [ $filename, $zip->contents($_) ];
+                   }
+                   @members;
+
+  unlink($zipfile);
+
+  @images;
+}
+
+#stopgap: shells out to DocumentConverter.py for now
+#TODO: recode with OpenOffice::UNO
+sub odconvert {
+  my( $self, $file, $suffix ) = ( shift, shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
   $self->start_openoffice($opt);
 
   my $program = 'DocumentConverter.py';
@@ -59,20 +101,27 @@ sub html_convert {
   my $timeout = 60; #?
 
   use File::Temp qw/ tempfile /;
-  my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
   #hmm, it gets overwritten so $fh is bunk
 
   my($out, $err) = ( '', '' );
   local($SIG{CHLD}) = sub {};
-  run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
-    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-
-  my $html = slurp($outfile);
-
-  $html;
-
+  eval {
+    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+      or do {
+              unlink($outfile) or warn "$!\n";
+              die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+            };
+  };
+  if ( $@ ) {
+    unlink($outfile) or warn "$!\n";
+    die "$program failed: $@\n";
+  }
+
+  $outfile;
 }
 
+
 sub start_openoffice {
   my( $self ) = ( shift, shift );
   my $opt = ref($_[0]) ? shift : { @_ };
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm
index cca5b0d..a75a54f 100644
--- a/lib/HTML/AutoConvert/poppler.pm
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -13,36 +13,55 @@ poppler can be downloaded from http://poppler.freedesktop.org/
 use strict;
 use vars qw( %info );
 use base 'HTML::AutoConvert::Run';
+use File::Temp qw( tempdir );
+use File::Slurp qw( slurp );
+use IPC::Run qw( run timeout );
 
 %info = (
-  'types'   => 'pdf',
-  'weight'  => 10,
-  'url'     => 'http://poppler.freedesktop.org/',
+  'types'          => 'pdf',
+  'weight'         => 10,
+  'returns_images' => 1,
+  'url'            => 'http://poppler.freedesktop.org/',
 );
 
 sub program { ( 'pdftohtml', '-stdout' ) }
 
-#false laziness w/OpenOffice.pm
-#sub html_convert {
-#  my( $self, $file ) = ( shift, shift );
-#  my $opt = ref($_[0]) ? shift : { @_ };
-#
-#  my $program = 'pdftohtml';
-#
-#  my $timeout = 60; #?
-#
-#  my($out, $err) = ( '', '' );
-#  local($SIG{CHLD}) = sub {};
-#  run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
-#    or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-#
-#  ( my $outfile = $file ) =~ s/\.pdf$/.html/i
-#    or die "poppler.pm called with non-PDF file?!";
-#
-#  my $html = slurp($outfile);
-#
-#  $html;
-#
-#}
+#some false laziness "in spirit" w/OpenOffice.pm
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $html = $self->SUPER::html_convert($file, $opt);
+  return $html unless wantarray;
+
+  my @images = $self->extract_images($file, $opt);
+
+  ( $html, @images);
+}
+
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $imgdir = tempdir( CLEANUP=>1 ).'/';
+
+  #some false laziness w/Run::html_convert :(
+  my @program = ( 'pdfimages' );
+  my $program = $program[0];
+
+  my $timeout = 60; #?
+
+  my( $out, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file, $imgdir ], \undef, \$out, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  map {
+        ( my $filename = $_ ) =~ s/^.*\/\-?//;
+        [ $filename, scalar(slurp($_)) ];
+      }
+      glob("$imgdir*");
+
+}
 
 1;