summaryrefslogtreecommitdiff
path: root/lib/HTML/AutoConvert
diff options
context:
space:
mode:
Diffstat (limited to 'lib/HTML/AutoConvert')
-rw-r--r--lib/HTML/AutoConvert/.OpenOffice.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.Run.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.antiword.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.poppler.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.unrtf.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.wvWare.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/OpenOffice.pm96
-rw-r--r--lib/HTML/AutoConvert/Run.pm30
-rw-r--r--lib/HTML/AutoConvert/antiword.pm31
-rw-r--r--lib/HTML/AutoConvert/poppler.pm48
-rw-r--r--lib/HTML/AutoConvert/unrtf.pm25
-rw-r--r--lib/HTML/AutoConvert/wvWare.pm25
12 files changed, 255 insertions, 0 deletions
diff --git a/lib/HTML/AutoConvert/.OpenOffice.pm.swp b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
new file mode 100644
index 0000000..750ad68
--- /dev/null
+++ b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.Run.pm.swp b/lib/HTML/AutoConvert/.Run.pm.swp
new file mode 100644
index 0000000..8a90a9a
--- /dev/null
+++ b/lib/HTML/AutoConvert/.Run.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.antiword.pm.swp b/lib/HTML/AutoConvert/.antiword.pm.swp
new file mode 100644
index 0000000..79e92ef
--- /dev/null
+++ b/lib/HTML/AutoConvert/.antiword.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.poppler.pm.swp b/lib/HTML/AutoConvert/.poppler.pm.swp
new file mode 100644
index 0000000..2c011bf
--- /dev/null
+++ b/lib/HTML/AutoConvert/.poppler.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.unrtf.pm.swp b/lib/HTML/AutoConvert/.unrtf.pm.swp
new file mode 100644
index 0000000..9bc779c
--- /dev/null
+++ b/lib/HTML/AutoConvert/.unrtf.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.wvWare.pm.swp b/lib/HTML/AutoConvert/.wvWare.pm.swp
new file mode 100644
index 0000000..39e4317
--- /dev/null
+++ b/lib/HTML/AutoConvert/.wvWare.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm
new file mode 100644
index 0000000..e09a9e4
--- /dev/null
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -0,0 +1,96 @@
+package HTML::AutoConvert::OpenOffice;
+
+=head1 NAME
+
+HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert
+
+=head1 PREREQUISITES
+
+OpenOffice v2.3 or later
+
+(currently)
+Python
+Python-UNO
+
+(future)
+Perl OpenOffice::UNO
+
+=head1 SECURITY NOTE
+
+This module starts and leaves an OpenOffice instance running.
+
+The OpenOffice instance binds to and listens to a port on localhost for
+commands. Anything which can talk to this port can instruct OpenOffice to
+read or write any file the current user has access to.
+
+By default, port 8100 is used. You can choose a different port by passing an
+option to the new() constructor:
+
+ my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);
+
+=cut
+
+use strict;
+use vars qw( %info ); #$slept );
+use IPC::Run qw( run timeout io );
+use File::Slurp qw( slurp );
+
+%info = ( #plugin metadata
+ 'types' => [qw( doc rtf odt sxw )],
+ 'weight' => 80,
+ 'url' => 'http://www.openoffice.org/', #this slot held the wvWare project's URL
+);
+
+#$slept = 0;
+
+#sub program { ( 'openoffice', '-headless' ); }
+
+# Convert $file to HTML via DocumentConverter.py (stopgap until this is
+# recoded with OpenOffice::UNO) and return the HTML text; dies on failure.
+# Options (hashref or key/value list) are handed to start_openoffice().
+sub html_convert {
+ my( $self, $file ) = ( shift, shift );
+ my $opt = ref($_[0]) ? shift : { @_ };
+
+ $self->start_openoffice($opt);
+
+ my $program = 'DocumentConverter.py';
+
+ my $timeout = 60; #?
+
+ use File::Temp qw/ tempfile /;
+ my($fh, $outfile) = tempfile(SUFFIX => '.html', UNLINK => 1); #exit-time safety net
+ close $fh; #the converter rewrites the file by name, so our handle is useless
+
+ my($out, $err) = ( '', '' );
+ local($SIG{CHLD}) = sub {};
+ run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+ or die "$program failed with exit status ". ( $? >> 8 ). ": $out$err\n";
+
+ my $html = slurp($outfile);
+ unlink $outfile; #don't let temp files pile up in a long-running process
+
+ $html;
+}
+
+sub start_openoffice {
+ #launch headless OpenOffice on the 'openoffice_port' option's port (default 8100)
+ my $self = shift; #only $self: a second shift here used to swallow the options
+ my $opt = ref($_[0]) ? shift : { @_ };
+ my $port = $opt->{'openoffice_port'} || 8100;
+ #NOTE(review): html_convert's DocumentConverter.py must use the same port - confirm
+ my $cmd = [ 'openoffice', '-headless',
+ "-accept=socket,port=$port;urp",
+ #'-splash-pipe=5',
+ ];
+
+ local($SIG{CHLD}) = sub {};
+ run $cmd, \undef, '>/dev/null', '2>/dev/null'
+ or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n"; #$@ is not set here
+
+ #it isn't ready to run commands right away :(
+ #it would be better if we could ping against UNO API somehow until ready...
+ #sleep 5 unless $slept++;
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/Run.pm b/lib/HTML/AutoConvert/Run.pm
new file mode 100644
index 0000000..2eada89
--- /dev/null
+++ b/lib/HTML/AutoConvert/Run.pm
@@ -0,0 +1,30 @@
+package HTML::AutoConvert::Run;
+
+=head1 NAME
+
+HTML::AutoConvert::Run - Base class for HTML::AutoConvert plugins that run an external program
+
+=cut
+
+use strict;
+use IPC::Run qw( run timeout );
+
+sub html_convert {
+ #run the subclass's program() on $file; return its captured standard output
+ my $self = shift;
+ my $file = shift;
+ my $opt = ref($_[0]) ? shift : { @_ }; #options are accepted but not used here
+
+ my @argv = $self->program;
+ my $name = $argv[0]; #for the error message
+ my $limit = 60; #?
+
+ my $stdout = '';
+ my $stderr = '';
+ local($SIG{CHLD}) = sub {};
+ run( [ @argv, $file ], \undef, \$stdout, \$stderr, timeout($limit) )
+ or die "$name failed with exit status ". ( $? >> 8 ). ": $stderr\n";
+ return $stdout;
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/antiword.pm b/lib/HTML/AutoConvert/antiword.pm
new file mode 100644
index 0000000..4622a79
--- /dev/null
+++ b/lib/HTML/AutoConvert/antiword.pm
@@ -0,0 +1,31 @@
+package HTML::AutoConvert::antiword;
+
+=head1 NAME
+
+HTML::AutoConvert::antiword - antiword plugin for HTML::AutoConvert
+
+=head1 URL
+
+Antiword can be downloaded from http://www.winfield.demon.nl/index.html
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = ( #plugin metadata
+ types => 'doc',
+ weight => 90,
+ url => 'http://www.winfield.demon.nl/index.html',
+);
+
+sub program { return ( 'antiword' ); } #command run by HTML::AutoConvert::Run::html_convert
+
+sub html_convert { #antiword's output goes inside <PRE>; & < > are escaped so it stays literal text
+ my $self = shift;
+ ( my $text = $self->SUPER::html_convert(@_) ) =~ s/([&<>])/'&#'.ord($1).';'/ge;
+ "<HTML><HEAD></HEAD><BODY><PRE>\n$text\n</PRE></BODY></HTML>";
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm
new file mode 100644
index 0000000..cca5b0d
--- /dev/null
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -0,0 +1,48 @@
+package HTML::AutoConvert::poppler;
+
+=head1 NAME
+
+HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert
+
+=head1 URL
+
+poppler can be downloaded from http://poppler.freedesktop.org/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = ( #plugin metadata
+ types => 'pdf',
+ weight => 10,
+ url => 'http://poppler.freedesktop.org/',
+);
+
+sub program { return ( 'pdftohtml', '-stdout' ); } #command + fixed args for Run's html_convert
+
+#false laziness w/OpenOffice.pm
+#sub html_convert {
+# my( $self, $file ) = ( shift, shift );
+# my $opt = ref($_[0]) ? shift : { @_ };
+#
+# my $program = 'pdftohtml';
+#
+# my $timeout = 60; #?
+#
+# my($out, $err) = ( '', '' );
+# local($SIG{CHLD}) = sub {};
+# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
+# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+#
+# ( my $outfile = $file ) =~ s/\.pdf$/.html/i
+# or die "poppler.pm called with non-PDF file?!";
+#
+# my $html = slurp($outfile);
+#
+# $html;
+#
+#}
+
+1;
diff --git a/lib/HTML/AutoConvert/unrtf.pm b/lib/HTML/AutoConvert/unrtf.pm
new file mode 100644
index 0000000..034440e
--- /dev/null
+++ b/lib/HTML/AutoConvert/unrtf.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::unrtf;
+
+=head1 NAME
+
+HTML::AutoConvert::unrtf - unrtf plugin for HTML::AutoConvert
+
+=head1 URL
+
+unrtf can be downloaded from ftp://ftp.gnu.org/pub/gnu/unrtf/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = ( #plugin metadata
+ types => 'rtf',
+ weight => 90,
+ url => 'ftp://ftp.gnu.org/pub/gnu/unrtf/',
+);
+
+sub program { return ( 'unrtf' ); } #command run by HTML::AutoConvert::Run::html_convert
+
+1;
diff --git a/lib/HTML/AutoConvert/wvWare.pm b/lib/HTML/AutoConvert/wvWare.pm
new file mode 100644
index 0000000..26a57cd
--- /dev/null
+++ b/lib/HTML/AutoConvert/wvWare.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::wvWare;
+
+=head1 NAME
+
+HTML::AutoConvert::wvWare - wvWare plugin for HTML::AutoConvert
+
+=head1 URL
+
+wvWare can be downloaded from http://wvware.sourceforge.net/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+%info = ( #plugin metadata
+ types => 'doc',
+ weight => 80,
+ url => 'http://wvware.sourceforge.net/',
+);
+
+sub program { return ( 'wvWare' ); } #command run by HTML::AutoConvert::Run::html_convert
+
+1;