t/14-rtf-OpenOffice.t
t/15-rtf-unrtf.t
t/26-pdf-poppler.t
+t/34-doc_images-OpenOffice.t
+t/46-pdf_images-poppler.t
+t/attitude.pdf
t/DiaryofaKillerCat.doc
+t/HeatherElko.doc
t/VEGAN_RECIPES.rtf
-t/attitude.pdf
'INSTALLSCRIPT' => '/usr/local/bin',
'INSTALLSITEBIN' => '/usr/local/bin',
PREREQ_PM => {
- 'Test::More' => 0,
- 'IPC::Run' => 0,
- 'File::Slurp' => 0,
+ 'Test::More' => 0,
+ 'IPC::Run' => 0,
+ 'File::Slurp' => 0,
+ 'Archive::Zip' => 0,
},
dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
clean => { FILES => 'HTML-AutoConvert-*' },
-- DOC: images
-- PDF: images
-- RTF: images
+- add the ability to suppress starting our own OO and connect to one running
+ elsewhere
+
+- OpenOffice.pm: image converter seems to be leaving behind images in /tmp...
+ Archive::Zip?
+
+- auto-convert non-web images to jpg/gif/png?
+
+- wvWare/other backends besides OO and poppler: handle images?
+
+- OpenOffice.pm: poll via UNO to determine readiness rather than sleep (or not)
-- OpenOffice.pm: poll via UNO to determine readiness rather than sleep
- OpenOffice.pm: convert DocumentConverter.py to Perl using OpenOffice::UNO
#or to turn on debugging
my $converter = HTML::AutoConvert->new('debug'=>1);
+ my $html = $converter->html_convert( $file );
+ # OR
my( $html, @images ) = $converter->html_convert( $file );
#turn on or off debugging later
=head2 html_convert FILENAME
-Convert the given filename to HTML. The HTML output is returned as a scalar.
+Convert the given filename to HTML.
+
+In a scalar context, simply returns the HTML output as a scalar.
+
+ my $html = $converter->html_convert( $file );
+
+In a list context, returns a list consisting of the HTML output as a scalar,
+followed by references for each image extracted, if any. Each image reference
+is a list reference consisting of two elements: the first is the filename and
+the second is the image itself.
+
+ my( $html, @images ) = $converter->html_convert( $file );
+ foreach my $image ( @images ) {
+ my( $filename, $data ) = @$image;
+ #...
+ }
=cut
or die "no registered handlers for filetype ". $self->filetype( $file );
my( $converted, $html, $errors ) = ( 0, '', '' );
+ my @imgs = ();
foreach my $handler ( @handlers ) {
my $module = 'HTML::AutoConvert::'. $handler->{'module'};
- my $tmp_html = eval { $module->html_convert( $self->{'file'} ) };
+
+ my $tmp_html = '';
+ my @tmp_imgs = ();
+ if ( $handler->{'returns_images'} && wantarray ) {
+ ( $tmp_html, @tmp_imgs ) =
+ eval { $module->html_convert( $self->{'file'} ) };
+ } else {
+ $tmp_html =
+ eval { $module->html_convert( $self->{'file'} ) };
+ }
+
if ( $@ ) {
my $tmp_err = "conversion with $module failed: $@\n";
warn $tmp_err if $self->{'debug'};
$converted = 1;
$html = $tmp_html;
+ @imgs = @tmp_imgs;
last;
}
die "couldn't convert $file:\n$errors" unless $converted;
- $html;
+ if ( wantarray ) {
+ ( $html, @imgs );
+ } else {
+ $html;
+ }
}
use vars qw( %info ); #$slept );
use IPC::Run qw( run timeout io );
use File::Slurp qw( slurp );
+use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
%info = (
- 'types' => [qw( doc rtf odt sxw )],
- 'weight' => 80,
- 'url' => 'http://wvware.sourceforge.net/',
+ 'types' => [qw( doc rtf odt sxw )],
+ 'weight' => 10,
+ 'returns_images' => 1,
+ 'url' => 'http://www.openoffice.org/',
);
#$slept = 0;
#sub program { ( 'openoffice', '-headless' ); }
-#half-ass using DocumentConverter.py for now
-#need to recode with OpenOffice::UNO
-
sub html_convert {
my( $self, $file ) = ( shift, shift );
my $opt = ref($_[0]) ? shift : { @_ };
+ my $outfile = $self->odconvert($file, 'html');
+ my $html = slurp($outfile);
+ unlink($outfile) or warn "can't unlink $outfile";
+
+ return $html unless wantarray;
+
+ my @images = $self->extract_images($file, $opt);
+
+ ( $html, @images );
+
+}
+
+#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
+#
+#Extract the images embedded in $file: convert it to ODT (a zip archive) with
+#odconvert and read every member stored under Pictures/.
+#
+#Returns a list of array references, each [ $filename, $image_data ], where
+#$filename has any leading path removed.
+#
+#Dies if the converted file cannot be read as a zip archive.  The temporary
+#ODT file is removed in either case.
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $zipfile = $self->odconvert($file, 'odt');
+  my $zip = Archive::Zip->new();
+
+  unless ( $zip->read( $zipfile ) == AZ_OK ) {
+    #don't leave the temporary ODT behind on failure
+    unlink($zipfile) or warn "can't unlink $zipfile: $!";
+    die "error reading $zipfile for images";
+  }
+
+  #require at least one character after the slash so that the Pictures/
+  #directory entry itself (or a member such as "PicturesFoo") is never
+  #mistaken for an image
+  my @members = grep { ! $_->isDirectory }
+                  $zip->membersMatching( '^Pictures/.' );
+
+  my @images = map {
+    ( my $filename = $_->fileName ) =~ s/^.*\///;
+    #force scalar context so each entry is exactly [ name, data ]; in list
+    #context contents() may also hand back a status code
+    [ $filename, scalar( $zip->contents($_) ) ];
+  }
+  @members;
+
+  unlink($zipfile) or warn "can't unlink $zipfile: $!";
+
+  @images;
+}
+
+#half-ass using DocumentConverter.py for now
+#need to recode with OpenOffice::UNO
+#
+#Convert $file with DocumentConverter.py into a new temporary file whose name
+#ends in ".$suffix" (callers in this file pass 'html' and 'odt').
+#
+#Returns the name of the temporary output file; the caller is responsible for
+#removing it.  Dies, after removing the temporary file, if the converter exits
+#non-zero or run() throws (for example on timeout).
+sub odconvert {
+ my( $self, $file, $suffix ) = ( shift, shift, shift );
+ my $opt = ref($_[0]) ? shift : { @_ };
+
$self->start_openoffice($opt);
my $program = 'DocumentConverter.py';
my $timeout = 60; #?
use File::Temp qw/ tempfile /;
- my($fh, $outfile) = tempfile(SUFFIX => '.html');
+ my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
#hmm, it gets overwritten so $fh is bunk
my($out, $err) = ( '', '' );
local($SIG{CHLD}) = sub {};
- run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
- or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-
- my $html = slurp($outfile);
-
- $html;
-
+ eval {
+   run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+     or die "exit status ". ( $? >> 8 ). ": $out\n";
+ };
+ if ( my $error = $@ ) {
+   #single cleanup point for both a non-zero exit and an exception from
+   #run(); unlinking in both places made the second unlink fail and warn
+   unlink($outfile) or warn "can't unlink $outfile: $!\n";
+   die "$program failed: $error";
+ }
+
+ $outfile;
}
+
sub start_openoffice {
my( $self ) = ( shift, shift );
my $opt = ref($_[0]) ? shift : { @_ };
use strict;
use vars qw( %info );
use base 'HTML::AutoConvert::Run';
+use File::Temp qw( tempdir );
+use File::Slurp qw( slurp );
+use IPC::Run qw( run timeout );
%info = (
- 'types' => 'pdf',
- 'weight' => 10,
- 'url' => 'http://poppler.freedesktop.org/',
+ 'types' => 'pdf',
+ 'weight' => 10,
+ 'returns_images' => 1,
+ 'url' => 'http://poppler.freedesktop.org/',
);
sub program { ( 'pdftohtml', '-stdout' ) }
-#false laziness w/OpenOffice.pm
-#sub html_convert {
-# my( $self, $file ) = ( shift, shift );
-# my $opt = ref($_[0]) ? shift : { @_ };
-#
-# my $program = 'pdftohtml';
-#
-# my $timeout = 60; #?
-#
-# my($out, $err) = ( '', '' );
-# local($SIG{CHLD}) = sub {};
-# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
-# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
-#
-# ( my $outfile = $file ) =~ s/\.pdf$/.html/i
-# or die "poppler.pm called with non-PDF file?!";
-#
-# my $html = slurp($outfile);
-#
-# $html;
-#
-#}
+#some false laziness "in spirit" w/OpenOffice.pm
+#
+#Convert $file to HTML with the parent class's html_convert.  Scalar (or
+#void) context gets only the HTML; list context gets the HTML followed by the
+#list returned by extract_images.
+sub html_convert {
+  my $self = shift;
+  my $file = shift;
+  my $opt  = ref($_[0]) ? shift : { @_ };
+
+  my $html = $self->SUPER::html_convert($file, $opt);
+
+  if ( wantarray ) {
+    my @images = $self->extract_images($file, $opt);
+    return ( $html, @images );
+  }
+
+  $html;
+}
+
+#Extract the images from the PDF $file by running pdfimages into a temporary
+#directory.
+#
+#Returns a list of array references, each [ $filename, $image_data ], in
+#sorted filename order; the leading "-" that pdfimages puts in front of the
+#image number (the image root is just the directory) is stripped from
+#$filename.
+#
+#Dies if pdfimages exits non-zero or the temporary directory can't be read.
+sub extract_images {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my $imgdir = tempdir( CLEANUP=>1 );
+
+  #some false laziness w/Run::html_convert :(
+  my @program = ( 'pdfimages' );
+  my $program = $program[0];
+
+  my $timeout = 60; #?
+
+  my( $out, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file, "$imgdir/" ], \undef, \$out, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  #read the directory directly rather than glob()ing an interpolated path,
+  #which would be mis-parsed if the temp dir name held whitespace or glob
+  #metacharacters
+  opendir( my $dh, $imgdir ) or die "can't read $imgdir: $!\n";
+  my @names = sort grep { !/^\./ && -f "$imgdir/$_" } readdir($dh);
+  closedir($dh);
+
+  my @images = ();
+  foreach my $name ( @names ) {
+    my $path = "$imgdir/$name";
+    ( my $filename = $name ) =~ s/^\-?//;
+    push @images, [ $filename, scalar(slurp($path)) ];
+
+    #remove each image once it is in memory; CLEANUP=>1 alone only fires at
+    #program exit, so a long-running process would pile up files in /tmp
+    unlink($path) or warn "can't unlink $path: $!\n";
+  }
+
+  rmdir($imgdir); #best effort, CLEANUP=>1 remains the fallback
+
+  @images;
+
+}
1;
#!perl
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
use Test::More tests => 2;
use HTML::AutoConvert;
--- /dev/null
+#!perl
+
+#Check that html_convert, called in list context on a Word document, also
+#returns the embedded images when only the OpenOffice handler is available.
+
+BEGIN { chomp($pwd=`pwd`); $ENV{PATH} .= ":$pwd/bin"; };
+
+use Test::More tests => 5;
+
+use HTML::AutoConvert;
+
+my $c = HTML::AutoConvert->new;
+
+#force the OpenOffice handler by deleting every other handler for 'doc'
+my $force = 'OpenOffice';
+#$c->{'handlers'}{'doc'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'doc'} };
+delete($c->{'handlers'}{'doc'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/HeatherElko.doc');
+
+#is() rather than ok( x == y ) so a failure reports the value actually seen
+is( scalar(@images), 2, 'got two images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '10000000000000C80000009688B0FEF3.png', '1st image name');
+is( length($images[0]->[1]), 8704, '1st image size');
+
+is( $images[1]->[0], '100000000000009D0000009F54B4BCB3.png', '2nd image name');
+is( length($images[1]->[1]), 2125, '2nd image size');
+
+
--- /dev/null
+#!perl
+
+#Check that html_convert, called in list context on a PDF, also returns the
+#extracted images when only the poppler handler is available.
+
+use Test::More tests => 9;
+
+use HTML::AutoConvert;
+
+my $c = HTML::AutoConvert->new;
+
+#force the poppler handler by deleting every other handler for 'pdf'
+my $force = 'poppler';
+#$c->{'handlers'}{'pdf'}{$force}{'weight'} = -1;
+my @del = grep { $_ ne $force } keys %{ $c->{'handlers'}{'pdf'} };
+delete($c->{'handlers'}{'pdf'}{$_}) foreach @del;
+
+my( $html, @images ) = $c->html_convert('t/attitude.pdf');
+
+#is() rather than ok( x == y ) so a failure reports the value actually seen
+is( scalar(@images), 21, 'got 21 images' );
+
+#save em off
+#foreach my $image (@images) {
+#  my( $file, $data) = @$image;
+#  open(FILE, ">t/$file") or die $!;
+#  print FILE $data;
+#  close FILE or die $!;
+#}
+
+#check the names & lengths at least
+is( $images[0]->[0], '000.ppm', '1st image name');
+is( length($images[0]->[1]), 25949, '1st image size');
+
+is( $images[1]->[0], '001.ppm', '2nd image name');
+is( length($images[1]->[1]), 43664, '2nd image size');
+
+is( $images[2]->[0], '002.ppm', '3rd image name');
+is( length($images[2]->[1]), 46833, '3rd image size');
+
+is( $images[9]->[0], '009.ppm', '10th image name');
+is( length($images[9]->[1]), 46374, '10th image size');
+