summaryrefslogtreecommitdiff
path: root/lib/HTML/AutoConvert/OpenOffice.pm
blob: 7b35595b1f360a783ad69451a176970a75b1e267 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
package HTML::AutoConvert::OpenOffice;

=head1 NAME

HTML::AutoConvert::OpenOffice - OpenOffice plugin for HTML::AutoConvert

=head1 PREREQUISITES

OpenOffice v2.3 or later

(currently)
Python
Python-UNO

(future)
Perl OpenOffice::UNO

=head1 SECURITY NOTE

This module starts and leaves an OpenOffice instance running.

The OpenOffice instance binds to and listens to a port on localhost for
commands.  Anything which can talk to this port can instruct OpenOffice to
read or write any file the current user has access to.

By default, port 8100 is used.  You can choose a different port by passing an
option to the new() constructor:

  my $converter = HTML::AutoConvert->new('openoffice_port'=>5555);

=cut

use strict;
use vars qw( %info ); #$slept );
use IPC::Run qw( run timeout io );
use File::Slurp qw( slurp );
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );

# Plugin metadata, published through the package variable %info.
#   types          - file extensions this plugin declares it handles
#   weight         - numeric weight (presumably used by HTML::AutoConvert to
#                    rank plugins -- TODO confirm against the caller)
#   returns_images - true: html_convert can also return extracted images
#   url            - home page of the underlying converter
%info = (
  url            => 'http://www.openoffice.org/',
  returns_images => 1,
  weight         => 10,
  types          => [ 'doc', 'rtf', 'odt', 'sxw' ],
);

#$slept = 0;

#sub program { ( 'openoffice', '-headless' ); }

# Convert a document to HTML through OpenOffice.
#
# Arguments:
#   $file - path of the document to convert
#   options, either as a hash reference or as a flat key/value list
#   (for example openoffice_port); they are forwarded to odconvert and
#   extract_images
#
# Returns:
#   scalar context - the HTML text
#   list context   - the HTML text followed by whatever extract_images
#                    returns for the same file
#
# Dies if odconvert, slurp or extract_images dies.
sub html_convert {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  # Forward the options: without this, settings such as openoffice_port
  # never reach start_openoffice on the HTML conversion path.
  my $outfile = $self->odconvert($file, 'html', $opt);
  my $html = slurp($outfile);
  unlink($outfile) or warn "can't unlink $outfile";

  # Image extraction needs a second conversion, so skip it when the caller
  # only asked for the HTML.
  return $html unless wantarray;

  my @images = $self->extract_images($file, $opt);

  ( $html, @images );

}

#http://cdriga.kfacts.com/open-source-world/tutorial-extract-original-images-from-ms-word-doc-using-openofficeorg/2007/11/04/
# Extract the embedded images of a document.
#
# The document is converted to ODT (a zip archive) with odconvert, and every
# file stored under the archive's Pictures/ directory is read out.
#
# Arguments:
#   $file - path of the document to convert
#   options, either as a hash reference or as a flat key/value list; they
#   are forwarded to odconvert
#
# Returns a list of array references, one per image: [ $filename, $data ],
# where $filename has its directory part stripped.
#
# Dies if the conversion fails or the resulting archive cannot be read; the
# temporary archive is removed in both the success and the read-failure case.
sub extract_images {
  my( $self, $file ) = ( shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  # Forward the options so openoffice_port is honoured here as well.
  my $zipfile = $self->odconvert($file, 'odt', $opt);
  my $zip = Archive::Zip->new();

  unless ( $zip->read( $zipfile ) == AZ_OK ) {
    # Do not leave the temporary ODT behind when it turns out unreadable.
    unlink($zipfile);
    die "error reading $zipfile for images";
  }

  # Anchor on the directory separator (the previous pattern '^Pictures/*'
  # matched any name merely starting with "Pictures") and skip directory
  # entries, which would otherwise yield an image with an empty filename.
  my @members = grep { ! $_->isDirectory }
                     $zip->membersMatching( '^Pictures/' );

  my @images = map {
                     ( my $filename = $_->fileName ) =~ s/^.*\///;
                     # Force scalar context: in list context contents() may
                     # return an extra status value, which would end up as a
                     # third element of the pair.
                     [ $filename, scalar( $zip->contents($_) ) ];
                   }
                   @members;

  unlink($zipfile) or warn "can't unlink $zipfile: $!\n";

  @images;
}

#half-ass using DocumentConverter.py for now
#need to recode with OpenOffice::UNO
# Convert $file to the format named by $suffix using DocumentConverter.py.
#
# Arguments:
#   $file   - path of the input document
#   $suffix - output format, used as the extension of the temporary output
#             file (the callers in this file pass 'html' and 'odt')
#   options, either as a hash reference or as a flat key/value list; they
#   are passed to start_openoffice
#
# Returns the path of the temporary output file.  The caller is responsible
# for removing it.
#
# Dies with "DocumentConverter.py failed: ..." if the converter exits
# non-zero, cannot be run, or exceeds the timeout; the temporary file is
# removed before dying.
sub odconvert {
  my( $self, $file, $suffix ) = ( shift, shift, shift );
  my $opt = ref($_[0]) ? shift : { @_ };

  $self->start_openoffice($opt);

  my $program = 'DocumentConverter.py';

  my $timeout = 60; #?

  use File::Temp qw/ tempfile /;
  my($fh, $outfile) = tempfile(SUFFIX => ".$suffix");
  # Only the reserved name is needed: the converter writes the file itself,
  # so release the handle instead of keeping it open until scope exit.
  close($fh);

  my($out, $err) = ( '', '' );
  # NOTE(review): presumably this keeps a caller-installed SIGCHLD handler
  # from interfering while IPC::Run waits for the child -- confirm.
  local($SIG{CHLD}) = sub {};
  my $ok = eval {
    run( [ $program, $file, $outfile ], \undef, \$out, \$err, timeout($timeout) )
      or die "exit status ". ( $? >> 8 ). ": $out$err\n";
    1;
  };
  unless ( $ok ) {
    chomp( my $error = $@ || 'unknown error' );
    # Clean up in exactly one place so a failed run does not trigger a
    # second unlink (and a spurious warning) on an already removed file.
    if ( -e $outfile ) {
      unlink($outfile) or warn "can't unlink $outfile: $!\n";
    }
    die "$program failed: $error\n";
  }

  $outfile;
}


# Launch a headless OpenOffice instance that accepts UNO connections on a
# local socket.
#
# Arguments:
#   options, either as a hash reference or as a flat key/value list:
#     openoffice_port - port to listen on (default 8100)
#
# Dies if the openoffice command cannot be run or exits with a non-zero
# status.
#
# NOTE(review): odconvert invokes DocumentConverter.py with only the input
# and output paths, so when a non-default port is used the converter script
# must be set up for that same port -- confirm.
sub start_openoffice {
  # Take only the invocant off the argument list.  The previous
  # "( shift, shift )" also discarded the first real argument, so the
  # options (and with them openoffice_port) were always lost.
  my $self = shift;
  my $opt = ref($_[0]) ? shift : { @_ };
  my $port = $opt->{'openoffice_port'} || 8100;

  my $cmd = [ 'openoffice', '-headless',
                            "-accept=socket,port=$port;urp",
                            #'-splash-pipe=5',
            ];

  local($SIG{CHLD}) = sub {};
  # run() returns false on a non-zero exit status and reports it in $?;
  # $@ is not set in that case, so report the exit status instead.
  run $cmd, \undef, '>/dev/null', '2>/dev/null'
    or die "can't launch openoffice: exit status ". ( $? >> 8 ). "\n";

  #it isn't ready to run commands right away :(
  #it would be better if we could ping against UNO API somehow until ready...
  #sleep 5 unless $slept++;

}

1;