summaryrefslogtreecommitdiff
path: root/lib/HTML
diff options
context:
space:
mode:
Diffstat (limited to 'lib/HTML')
-rw-r--r--lib/HTML/.AutoConvert.pm.swpbin0 -> 20480 bytes
-rw-r--r--lib/HTML/AutoConvert.pm249
-rw-r--r--lib/HTML/AutoConvert/.OpenOffice.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.Run.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.antiword.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.poppler.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.unrtf.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/.wvWare.pm.swpbin0 -> 12288 bytes
-rw-r--r--lib/HTML/AutoConvert/OpenOffice.pm96
-rw-r--r--lib/HTML/AutoConvert/Run.pm30
-rw-r--r--lib/HTML/AutoConvert/antiword.pm31
-rw-r--r--lib/HTML/AutoConvert/poppler.pm48
-rw-r--r--lib/HTML/AutoConvert/unrtf.pm25
-rw-r--r--lib/HTML/AutoConvert/wvWare.pm25
14 files changed, 504 insertions, 0 deletions
diff --git a/lib/HTML/.AutoConvert.pm.swp b/lib/HTML/.AutoConvert.pm.swp
new file mode 100644
index 0000000..f6927d2
--- /dev/null
+++ b/lib/HTML/.AutoConvert.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert.pm b/lib/HTML/AutoConvert.pm
new file mode 100644
index 0000000..7df3b82
--- /dev/null
+++ b/lib/HTML/AutoConvert.pm
@@ -0,0 +1,249 @@
+package HTML::AutoConvert;
+
+use warnings;
+use strict;
+
+=head1 NAME
+
+HTML::AutoConvert - Best-effort conversion of arbitrary files to HTML.
+
+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
+=head1 SYNOPSIS
+
+ use HTML::AutoConvert;
+
+ my $converter = HTML::AutoConvert->new();
+ #or to turn on debugging
+ my $converter = HTML::AutoConvert->new('debug'=>1);
+
+ my $html = $converter->html_convert( $file );
+
+ #turn on or off debugging later
+ $converter->debug(1);
+
+=head1 DESCRIPTION
+
+Convert arbitrary file types to HTML.
+
+#=head1 EXPORT
+#
+#doc on also using html_convert functional interface
+
+=head1 FUNCTIONS
+
+=head2 new
+
+=cut
+
+# Constructor.  Options may be given as a hash reference or as a flat
+# key/value list (for example 'debug' => 1); they become the fields of the
+# new object.  Plugin discovery (find_handlers) runs before the object is
+# returned.
+sub new {
+  my $proto = shift;
+  my $class = ref($proto) || $proto;
+
+  my $opts = ref($_[0]) ? shift : { @_ };
+
+  # Bless a shallow copy instead of the caller's own hash reference, so a
+  # hashref handed to new() is neither blessed behind the caller's back nor
+  # filled with the 'handlers' and 'file' fields that are stored on the
+  # object later.
+  my $self = bless { %$opts }, $class;
+
+  $self->find_handlers;
+
+  return $self;
+}
+
+=head2 html_convert FILENAME
+
+Convert the given filename to HTML. The HTML output is returned as a scalar.
+
+=cut
+
+# Convert $file to HTML by trying each handler registered for its file type,
+# in the order returned by handlers(), until one of them succeeds.
+#
+# Options may follow the filename as a hash reference or a key/value list.
+# They are merged over the options stored on the object by new() and passed
+# to every plugin, so settings such as 'openoffice_port' reach the plugin's
+# html_convert method.
+#
+# Returns the HTML produced by the first handler that does not die.
+# Dies when no handler is registered for the file type, or when every
+# handler fails (the message then lists each handler's error).
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  $self->{'file'} = $file;
+
+  my @handlers = $self->handlers
+    or die "no registered handlers for filetype ". $self->filetype( $file );
+
+  # Object-level options first, per-call options override them.  'handlers'
+  # and 'file' are the object's own bookkeeping, not options, so drop them.
+  my %handler_opt = ( %$self, %$opt );
+  delete @handler_opt{ 'handlers', 'file' };
+
+  my( $converted, $html, $errors ) = ( 0, '', '' );
+  foreach my $handler ( @handlers ) {
+
+    my $module = 'HTML::AutoConvert::'. $handler->{'module'};
+
+    # Each plugin gets its own copy of the options so that one plugin cannot
+    # alter what the next one sees.
+    my $tmp_html =
+      eval { $module->html_convert( $self->{'file'}, { %handler_opt } ) };
+    if ( $@ ) {
+      my $tmp_err = "conversion with $module failed: $@\n";
+      warn $tmp_err if $self->{'debug'};
+      $errors .= $tmp_err;
+      next;
+    }
+
+    $converted = 1;
+    $html = $tmp_html;
+    last;
+  }
+
+  die "couldn't convert $file:\n$errors" unless $converted;
+
+  $html;
+
+}
+
+=head2 debug
+
+Get or set the debugging level
+
+=cut
+
+# Accessor for the 'debug' field: with an argument, store it first; in
+# either case return the current value.
+sub debug {
+  my( $self, @new_level ) = @_;
+
+  if ( @new_level ) {
+    $self->{'debug'} = $new_level[0];
+  }
+
+  return $self->{'debug'};
+}
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 find_handlers
+
+Search for installed HTML::AutoConvert::* plugins.
+
+=cut
+
+# Scan every directory in @INC for HTML/AutoConvert/*.pm plugin files, load
+# each one and index the plugins by the lower-cased file types listed in the
+# plugin's package-level %info hash.
+#
+# A plugin is skipped when its filename is not a plain identifier, when it
+# fails to load, when its %info hash is missing or empty, or when %info has a
+# true 'disabled' entry.  A plugin that fails to load produces a warning
+# rather than aborting discovery, so one broken plugin cannot make new() fail.
+#
+# Stores the result in $self->{'handlers'}, shaped as
+#   { type => { module => { 'module' => module, %info } } }
+# and returns that hash reference.
+sub find_handlers {
+  my $self = shift;
+
+  my %types;
+  foreach my $INC ( @INC ) {
+    warn "globbing $INC/HTML/AutoConvert/*.pm\n" if $self->{'debug'};
+    foreach my $file ( glob("$INC/HTML/AutoConvert/*.pm") ) {
+      warn "attempting to load handler info from $file\n" if $self->{'debug'};
+      $file =~ /\/(\w+)\.pm$/ or do {
+        warn "unrecognized file in $INC/HTML/AutoConvert/: $file\n";
+        next;
+      };
+      my $mod = $1;
+
+      # $mod matched \w+ above, so it is safe to interpolate into the string
+      # eval.  The eval loads the plugin and yields a reference to its %info.
+      my $info = eval "use HTML::AutoConvert::$mod; ".
+                      "\\%HTML::AutoConvert::$mod\::info;";
+      if ( $@ ) {
+        warn "error using HTML::AutoConvert::$mod (skipping): $@\n";
+        next;
+      }
+      unless ( keys %$info ) {
+        warn "no %info hash in HTML::AutoConvert::$mod, skipping\n" if $self->{'debug'};
+        next;
+      }
+      warn "got handler info from HTML::AutoConvert::$mod: $info\n" if $self->{'debug'};
+      if ( exists($info->{'disabled'}) && $info->{'disabled'} ) {
+        warn "skipping disabled handler HTML::AutoConvert::$mod" if $self->{'debug'};
+        next;
+      }
+
+      # 'types' may be a single type name or an array reference of names.
+      my $types = $info->{'types'};
+      $types = [ $types ] unless ref($types);
+
+      foreach my $type ( @$types ) {
+        $types{lc($type)}->{$mod} = { 'module' => $mod, %$info };
+      }
+
+    }
+  }
+
+  $self->{'handlers'} = \%types;
+
+}
+
+=head2 handlers
+
+Return the available handlers for the current file.
+
+=cut
+
+# Return the list of handler records registered for the current file's type,
+# lowest 'weight' first.  Handlers with equal weight are ordered by module
+# name so that the order is the same on every run instead of depending on
+# hash ordering; a handler without a weight is treated as weight 0.
+# Returns an empty list when nothing is registered for the type.
+sub handlers {
+  my( $self ) = @_;
+
+  my $type = $self->filetype;
+  my $for_type = $self->{'handlers'}{ lc($type) } || {};
+
+  return
+    sort {    ( $a->{'weight'} || 0 ) <=> ( $b->{'weight'} || 0 )
+           or $a->{'module'} cmp $b->{'module'}
+         }
+      values %$for_type;
+}
+
+=head2 filetype
+
+Determine the type of the current file.
+
+=cut
+
+# Determine a file's type from its extension (File::MMagic or similar could
+# be used instead).
+#
+# With an argument, that filename is examined; without one, the object's
+# current file ($self->{'file'}) is used.  Returns the lower-cased extension,
+# which must be three or four word characters at the end of the name.
+# Dies when no filename is available or it has no such extension.
+sub filetype {
+  my $self = shift;
+
+  my $file = @_ ? shift : $self->{'file'};
+  defined $file or die "can't determine file type: no file given";
+
+  $file =~ /\.(\w{3,4})$/ or die "can't parse $file for extension";
+  return lc($1);
+}
+
+=head1 AUTHOR
+
+Ivan Kohler, C<< <ivan-html-autoconvert at 420.am> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to C<bug-html-autoconvert at rt.cpan.org>, or through
+the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-AutoConvert>. I will be notified, and then you'll
+automatically be notified of progress on your bug as I make changes.
+
+=head1 SUPPORT
+
+You can find documentation for this module with the perldoc command.
+
+ perldoc HTML::AutoConvert
+
+You can also look for information at:
+
+=over 4
+
+=item * RT: CPAN's request tracker
+
+L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-AutoConvert>
+
+=item * AnnoCPAN: Annotated CPAN documentation
+
+L<http://annocpan.org/dist/HTML-AutoConvert>
+
+=item * CPAN Ratings
+
+L<http://cpanratings.perl.org/d/HTML-AutoConvert>
+
+=item * Search CPAN
+
+L<http://search.cpan.org/dist/HTML-AutoConvert>
+
+=back
+
+
+=head1 ACKNOWLEDGEMENTS
+
+
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2008 Freeside Internet Services, Inc.
+All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of HTML::AutoConvert
+
diff --git a/lib/HTML/AutoConvert/.OpenOffice.pm.swp b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
new file mode 100644
index 0000000..750ad68
--- /dev/null
+++ b/lib/HTML/AutoConvert/.OpenOffice.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.Run.pm.swp b/lib/HTML/AutoConvert/.Run.pm.swp
new file mode 100644
index 0000000..8a90a9a
--- /dev/null
+++ b/lib/HTML/AutoConvert/.Run.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.antiword.pm.swp b/lib/HTML/AutoConvert/.antiword.pm.swp
new file mode 100644
index 0000000..79e92ef
--- /dev/null
+++ b/lib/HTML/AutoConvert/.antiword.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.poppler.pm.swp b/lib/HTML/AutoConvert/.poppler.pm.swp
new file mode 100644
index 0000000..2c011bf
--- /dev/null
+++ b/lib/HTML/AutoConvert/.poppler.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.unrtf.pm.swp b/lib/HTML/AutoConvert/.unrtf.pm.swp
new file mode 100644
index 0000000..9bc779c
--- /dev/null
+++ b/lib/HTML/AutoConvert/.unrtf.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/.wvWare.pm.swp b/lib/HTML/AutoConvert/.wvWare.pm.swp
new file mode 100644
index 0000000..39e4317
--- /dev/null
+++ b/lib/HTML/AutoConvert/.wvWare.pm.swp
Binary files differ
diff --git a/lib/HTML/AutoConvert/OpenOffice.pm b/lib/HTML/AutoConvert/OpenOffice.pm
new file mode 100644
index 0000000..e09a9e4
--- /dev/null
+++ b/lib/HTML/AutoConvert/OpenOffice.pm
@@ -0,0 +1,96 @@
+package HTML::AutoConvert::OpenOffice;
+
+=head1 NAME
+
+HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert
+
+=head1 PREREQUISITES
+
+OpenOffice v2.3 or later
+
+(currently)
+Python
+Python-UNO
+
+(future)
+Perl OpenOffice::UNO
+
+=head1 SECURITY NOTE
+
+This module starts and leaves an OpenOffice instance running.
+
+The OpenOffice instance binds to and listens to a port on localhost for
+commands. Anything which can talk to this port can instruct OpenOffice to
+read or write any file the current user has access to.
+
+By default, port 8100 is used. You can choose a different port by passing an
+option to the new() constructor:
+
+ my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);
+
+=cut
+
+use strict;
+use vars qw( %info ); #$slept );
+use IPC::Run qw( run timeout io );
+use File::Slurp qw( slurp );
+
+# Plugin registration data read by HTML::AutoConvert when it searches for
+# plugins: the file extensions handled, the sort weight among handlers for
+# the same type, and where the underlying software can be obtained.
+%info = (
+  'types'  => [qw( doc rtf odt sxw )],
+  'weight' => 80,
+  'url'    => 'http://www.openoffice.org/',
+);
+
+#$slept = 0;
+
+#sub program { ( 'openoffice', '-headless' ); }
+
+#half-ass using DocumentConverter.py for now
+#need to recode with OpenOffice::UNO
+
+# Convert $file to HTML by running DocumentConverter.py against an
+# OpenOffice instance (started first via start_openoffice).
+#
+# Options may follow the filename as a hash reference or key/value list and
+# are forwarded to start_openoffice.
+#
+# Returns the HTML as a scalar.  Dies if the converter exits unsuccessfully,
+# times out, or its output cannot be read.  The temporary output file is
+# removed in every case.
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  $self->start_openoffice($opt);
+
+  my $program = 'DocumentConverter.py';
+
+  my $timeout = 60; #?
+
+  use File::Temp qw/ tempfile /;
+  my($fh, $outfile) = tempfile(SUFFIX => '.html');
+  # The converter is given the file by name, so our own handle is of no
+  # further use; close it rather than leaking the descriptor.
+  close $fh;
+
+  my($out, $err) = ( '', '' );
+  my $html = eval {
+    local($SIG{CHLD}) = sub {};
+    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
+      or die "$program failed with exit status ". ( $? >> 8 ). ": $out$err\n";
+    scalar slurp($outfile);
+  };
+  my $error = $@;
+
+  # Remove the temporary file whether or not the conversion worked, then
+  # re-raise any failure.
+  unlink $outfile;
+  die $error if $error;
+
+  $html;
+
+}
+
+# Launch a headless OpenOffice that accepts UNO connections on a socket.
+#
+# Options (hash reference or key/value list):
+#   openoffice_port - port to listen on; defaults to 8100
+#
+# Dies if the openoffice command does not exit successfully.
+sub start_openoffice {
+  # Take only the invocant here: shifting a second value would throw away the
+  # options argument, and the requested port with it.
+  my $self = shift;
+  my $opt = ref($_[0]) ? shift : { @_ };
+  my $port = $opt->{'openoffice_port'} || 8100;
+
+  my $cmd = [ 'openoffice', '-headless',
+                            "-accept=socket,port=$port;urp",
+                            #'-splash-pipe=5',
+            ];
+
+  local($SIG{CHLD}) = sub {};
+  # run() reports failure through its return value and $?, not $@.
+  run( $cmd, \undef, '>/dev/null', '2>/dev/null' )
+    or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n";
+
+  #it isn't ready to run commands right away :(
+  #it would be better if we could ping against UNO API somehow until ready...
+  #sleep 5 unless $slept++;
+
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/Run.pm b/lib/HTML/AutoConvert/Run.pm
new file mode 100644
index 0000000..2eada89
--- /dev/null
+++ b/lib/HTML/AutoConvert/Run.pm
@@ -0,0 +1,30 @@
+package HTML::AutoConvert::Run;
+
+=head1 NAME
+
+HTML::AutoConvert::Run - Base class for HTML::AutoConvert plugins that run an external program
+
+=cut
+
+use strict;
+use IPC::Run qw( run timeout );
+
+# Run the subclass's external program on $file and return whatever it writes
+# to standard output.
+#
+# The command comes from the subclass's program() method; $file is appended
+# as the last argument.
+#
+# Options (hash reference or key/value list after the filename):
+#   timeout - seconds to allow the program to run; defaults to 60
+#
+# Dies, including the program's standard error in the message, if the program
+# exits unsuccessfully.
+sub html_convert {
+  my( $self, $file ) = ( shift, shift );
+  my $opt = ref($_[0]) ? shift : { @_ };
+
+  my @program = $self->program;
+  my $program = $program[0];
+
+  my $timeout = $opt->{'timeout'} || 60;
+
+  my( $html, $err ) = ( '', '');
+  local($SIG{CHLD}) = sub {};
+  run( [ @program, $file ], \undef, \$html, \$err, timeout($timeout) )
+    or die "$program failed with exit status ". ( $? >> 8 ). ": $err\n";
+
+  $html;
+
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/antiword.pm b/lib/HTML/AutoConvert/antiword.pm
new file mode 100644
index 0000000..4622a79
--- /dev/null
+++ b/lib/HTML/AutoConvert/antiword.pm
@@ -0,0 +1,31 @@
+package HTML::AutoConvert::antiword;
+
+=head1 NAME
+
+HTML::AutoConvert::antiword - antiword plugin for HTML::AutoConvert
+
+=head1 URL
+
+Antiword can be downloaded from http://www.winfield.demon.nl/index.html
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+# Registration data for this plugin: handled file type, sort weight and the
+# download location of antiword.
+%info = (
+  types  => 'doc',
+  weight => 90,
+  url    => 'http://www.winfield.demon.nl/index.html',
+);
+
+# Command used to run the converter.
+sub program {
+  return ( 'antiword' );
+}
+
+# Run the base-class conversion and wrap the resulting text in a minimal HTML
+# document inside a <PRE> element.
+#
+# The text is HTML-escaped first, so that '&', '<' and '>' in the document
+# show up as those characters instead of being read as markup.
+sub html_convert {
+  my $self = shift;
+  my $text = $self->SUPER::html_convert(@_);
+
+  # '&' must be escaped first, or the entities produced for '<' and '>' would
+  # themselves be escaped again.
+  $text =~ s/&/&amp;/g;
+  $text =~ s/</&lt;/g;
+  $text =~ s/>/&gt;/g;
+
+  "<HTML><HEAD></HEAD><BODY><PRE>\n$text\n</PRE></BODY></HTML>";
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/poppler.pm b/lib/HTML/AutoConvert/poppler.pm
new file mode 100644
index 0000000..cca5b0d
--- /dev/null
+++ b/lib/HTML/AutoConvert/poppler.pm
@@ -0,0 +1,48 @@
+package HTML::AutoConvert::poppler;
+
+=head1 NAME
+
+HTML::AutoConvert::poppler - poppler (pdftohtml) plugin for HTML::AutoConvert
+
+=head1 URL
+
+poppler can be downloaded from http://poppler.freedesktop.org/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+# Registration data for this plugin: handled file type, sort weight and the
+# download location of poppler.
+%info = (
+  types  => 'pdf',
+  weight => 10,
+  url    => 'http://poppler.freedesktop.org/',
+);
+
+# Command (program name followed by its fixed arguments) used to run the
+# converter.
+sub program {
+  return ( 'pdftohtml', '-stdout' );
+}
+
+#false laziness w/OpenOffice.pm
+#sub html_convert {
+# my( $self, $file ) = ( shift, shift );
+# my $opt = ref($_[0]) ? shift : { @_ };
+#
+# my $program = 'pdftohtml';
+#
+# my $timeout = 60; #?
+#
+# my($out, $err) = ( '', '' );
+# local($SIG{CHLD}) = sub {};
+# run( [ $program, $file ], \undef, \$out, \$err, timeout($timeout) )
+# or die "$program failed with exit status ". ( $? >> 8 ). ": $out\n";
+#
+# ( my $outfile = $file ) =~ s/\.pdf$/.html/i
+# or die "poppler.pm called with non-PDF file?!";
+#
+# my $html = slurp($outfile);
+#
+# $html;
+#
+#}
+
+1;
diff --git a/lib/HTML/AutoConvert/unrtf.pm b/lib/HTML/AutoConvert/unrtf.pm
new file mode 100644
index 0000000..034440e
--- /dev/null
+++ b/lib/HTML/AutoConvert/unrtf.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::unrtf;
+
+=head1 NAME
+
+HTML::AutoConvert::unrtf - unrtf plugin for HTML::AutoConvert
+
+=head1 URL
+
+unrtf can be downloaded from ftp://ftp.gnu.org/pub/gnu/unrtf/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+# Registration data for this plugin: handled file type, sort weight and the
+# download location of unrtf.
+%info = (
+  types  => 'rtf',
+  weight => 90,
+  url    => 'ftp://ftp.gnu.org/pub/gnu/unrtf/',
+);
+
+# Command used to run the converter.
+sub program {
+  return ( 'unrtf' );
+}
+
+1;
diff --git a/lib/HTML/AutoConvert/wvWare.pm b/lib/HTML/AutoConvert/wvWare.pm
new file mode 100644
index 0000000..26a57cd
--- /dev/null
+++ b/lib/HTML/AutoConvert/wvWare.pm
@@ -0,0 +1,25 @@
+package HTML::AutoConvert::wvWare;
+
+=head1 NAME
+
+HTML::AutoConvert::wvWare - wvWare plugin for HTML::AutoConvert
+
+=head1 URL
+
+wvWare can be downloaded from http://wvware.sourceforge.net/
+
+=cut
+
+use strict;
+use vars qw( %info );
+use base 'HTML::AutoConvert::Run';
+
+# Registration data for this plugin: handled file type, sort weight and the
+# download location of wvWare.
+%info = (
+  types  => 'doc',
+  weight => 80,
+  url    => 'http://wvware.sourceforge.net/',
+);
+
+# Command used to run the converter.
+sub program {
+  return ( 'wvWare' );
+}
+
+1;