2 # BEGIN BPS TAGGED BLOCK {{{
6 # This software is Copyright (c) 1996-2015 Best Practical Solutions, LLC
7 # <sales@bestpractical.com>
9 # (Except where explicitly superseded by other copyright notices)
14 # This work is made available to you under the terms of Version 2 of
15 # the GNU General Public License. A copy of that license should have
16 # been provided with this software, but in any event can be snarfed
19 # This work is distributed in the hope that it will be useful, but
20 # WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 # General Public License for more details.
24 # You should have received a copy of the GNU General Public License
25 # along with this program; if not, write to the Free Software
26 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 # 02110-1301 or visit their web page on the internet at
28 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
31 # CONTRIBUTION SUBMISSION POLICY:
33 # (The following paragraph is not intended to limit the rights granted
34 # to you to modify and distribute this software under the terms of
35 # the GNU General Public License and is only of importance to you if
36 # you choose to contribute your changes and enhancements to the
37 # community by submitting them to Best Practical Solutions, LLC.)
39 # By intentionally submitting any modifications, corrections or
40 # derivatives to this work, or any other work intended for use with
41 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
42 # you are the copyright holder for those contributions and you grant
43 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
44 # royalty-free, perpetual, license to use, copy, create derivative
45 # works based on those contributions, and sublicense and distribute
46 # those contributions and any derivatives thereof.
48 # END BPS TAGGED BLOCK }}}
# Library search path setup. Only part of this section is visible in this
# listing; the enclosing loop over @libs and the branch separators are not shown.
53 # fix lib paths, some may be relative
# NOTE(review): the @...@ tokens look like build-time placeholders that are
# substituted before the script is installed -- confirm against the build.
56 my @libs = ("@RT_LIB_PATH@", "@LOCAL_LIB_PATH@");
# Only non-absolute entries need to be anchored somewhere.
60 unless ( File::Spec->file_name_is_absolute($lib) ) {
# Prefer the directory portion of this script's own path when it is absolute.
62 if ( File::Spec->file_name_is_absolute(__FILE__) ) {
63 $bin_path = ( File::Spec->splitpath(__FILE__) )[1];
# Otherwise use FindBin's idea of the script directory (presumably the
# else-branch of the test above; the branch keyword is not visible here).
68 $bin_path = $FindBin::Bin;
# The relative entry is resolved against the parent of the script directory.
71 $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
82 use RT::Interface::CLI ();
# Command-line option handling. The option list starts with the switches
# common to every database and is extended according to the configured
# DatabaseType before Getopt::Long parses the command line into %OPT.
89 my @OPT_LIST = qw(help|h! debug! quiet);
91 my $db_type = RT->Config->Get('DatabaseType');
# Pg: integer --limit and negatable --all.
92 if ( $db_type eq 'Pg' ) {
98 push @OPT_LIST, 'limit=i', 'all!';
# mysql: same as Pg plus the negatable --xmlpipe2 switch.
100 elsif ( $db_type eq 'mysql' ) {
107 push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
# Oracle: a string-valued --memory option.
109 elsif ( $db_type eq 'Oracle' ) {
114 push @OPT_LIST, qw(memory=s);
117 use Getopt::Long qw(GetOptions);
118 GetOptions( \%OPT, @OPT_LIST );
# --help shows the NAME and DESCRIPTION sections plus the section named
# after the upper-cased database type.
120 if ( $OPT{'help'} ) {
121 RT::Interface::CLI->ShowHelp(
122 Sections => 'NAME|DESCRIPTION|'. uc($db_type),
# Single-instance guard: try to take an exclusive, non-blocking lock on the
# script's DATA handle. Failing to get it means another copy holds the lock.
127 if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
# Two ways of reporting follow: an info-level log entry, or a line on STDERR.
# NOTE(review): the condition choosing between them is not visible in this
# listing -- presumably the --quiet option; confirm.
129 RT::Logger->info("$0 is already running; aborting silently, as requested");
133 print STDERR "$0 is already running\n";
# Load the FullTextSearch configuration (an empty hash when it is unset) and
# check its 'Enable' flag first; the message text that follows belongs to the
# disabled case.
138 my $fts_config = RT->Config->Get('FullTextSearch') || {};
139 unless ( $fts_config->{'Enable'} ) {
142 Full text search is disabled in your RT configuration. Run
143 @RT_SBIN_PATH_R@/rt-setup-fulltext-index to configure and enable it.
# Second check: the 'Indexed' flag. The message text below belongs to the
# case where search is enabled but not indexed.
148 unless ( $fts_config->{'Indexed'} ) {
151 Full text search is enabled in your RT configuration, but not with any
152 full-text database indexing -- hence this tool is not required. Read
153 the documentation for %FullTextSearch in your RT_Config for more details.
# Database-specific short cuts taken before the generic indexing loop.
# Oracle: run ctx_ddl.sync_index on the configured index, falling back to
# the name 'rt_fts_index' when IndexName is unset.
159 if ( $db_type eq 'Oracle' ) {
160 my $index = $fts_config->{'IndexName'} || 'rt_fts_index';
# Both placeholders are bound: the index name and the --memory option value
# (undef when the option was not given).
161 $RT::Handle->dbh->do(
162 "begin ctx_ddl.sync_index(?, ?); end;", undef,
163 $index, $OPT{'memory'}
# mysql: the text that follows is shown when --xmlpipe2 was not given.
166 } elsif ( $db_type eq 'mysql' ) {
167 unless ($OPT{'xmlpipe2'}) {
170 Updates to the external Sphinx index are done via running the sphinx
# Main indexing loop: handle plain-text and HTML attachments in turn.
180 my @types = qw(text html);
181 foreach my $type ( @types ) {
# attachments($type) supplies the collection to walk. The VALUE line below
# passes last_indexed($type) to a restriction whose other arguments are not
# visible in this listing.
183 my $attachments = attachments($type);
187 VALUE => last_indexed($type)
# Walk ids in ascending order, one page of --limit rows (default 100) at a time.
189 $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
190 $attachments->RowsPerPage( $OPT{'limit'} || 100 );
# NOTE(review): the lexical $a shadows Perl's special sort variable $a
# within this loop.
193 while ( my $a = $attachments->Next ) {
# A true value from filter() means the attachment is skipped.
194 next if filter( $type, $a );
195 debug("Found attachment #". $a->id );
# A false value from extract() also skips the attachment.
196 my $txt = extract($type, $a) or next;
198 process( $type, $a, $txt );
199 debug("Processed attachment #". $a->id );
# finalize() runs only when $found is true; $found is assigned on a line
# that is not visible in this listing.
201 finalize( $type, $attachments ) if $found;
# With --all, a page that came back full (Count equals the page size) sends
# control back to the REDO label, which is not visible in this listing.
203 goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
# Body of the generic attachment-collection builder (its sub header is not
# visible in this listing). It creates an RT::Attachments collection as the
# system user, joins it to Transactions and from there to a further table,
# then hands the collection to a type-specific routine via goto_specific().
208 my $res = RT::Attachments->new( RT->SystemUser );
# First join: Attachments.TransactionId to the Transactions table.
209 my $txn_alias = $res->Join(
211 FIELD1 => 'TransactionId',
212 TABLE2 => 'Transactions',
# FIELD/VALUE pair naming ObjectType = 'RT::Ticket'; the call these
# arguments belong to is not visible here.
217 FIELD => 'ObjectType',
218 VALUE => 'RT::Ticket',
# Second join: from the transaction alias through its ObjectId column
# (the target table is not visible here).
220 my $ticket_alias = $res->Join(
221 ALIAS1 => $txn_alias,
222 FIELD1 => 'ObjectId',
227 ALIAS => $ticket_alias,
233 # On newer DBIx::SearchBuilder's, indicate that making the query DISTINCT
234 # is unnecessary because the joins won't produce duplicates. This
235 # drastically improves performance when fetching attachments.
236 $res->{joins_are_distinct} = 1;
# Delegate to the type-specific variant; the error text is used when no
# such variant exists.
238 return goto_specific(
240 error => "Don't know how to find $type attachments",
# Tails of the generic wrapper subs (their headers and most arguments are not
# visible in this listing). Each one forwards to goto_specific(), which reads
# the 'suffix', 'error' and 'arguments' keys. Wrappers that pass an 'error'
# message croak with it when no specific implementation is defined; for the
# others goto_specific() returns undef instead.
# Wrapper for finding the last indexed attachment (per its error text).
247 return goto_specific(
249 error => "Don't know how to find last indexed $type attachment for $db_type DB",
# No 'error' argument is visible for this wrapper.
256 return goto_specific(
# Wrapper for converting an attachment into text (per its error text).
264 return goto_specific(
266 error => "No way to convert $type attachment into text",
# Wrapper for the per-database processor (per its error text).
# NOTE(review): "processer" in the message below is a misspelling of
# "processor"; left untouched here because it is runtime text.
272 return goto_specific(
274 error => "No processer for $db_type DB",
# No 'error' argument is visible for the remaining two wrappers.
280 return goto_specific(
287 return goto_specific(
# mysql variant of last_indexed: looks up the 'LastIndexedAttachments'
# attribute on $RT::System and returns the value stored under $type,
# or 0 when the attribute, the key, or a true value is missing.
# ($type is assigned on a line that is not visible in this listing.)
294 sub last_indexed_mysql {
296 my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
297 return 0 unless $attr;
# NOTE(review): the value is read straight out of $attr as a hash rather
# than through an accessor -- confirm this matches what FirstAttribute returns.
298 return 0 unless exists $attr->{ $type };
299 return $attr->{ $type } || 0;
# Body of the mysql processor (its sub header is not visible in this
# listing). Arguments: the attachment type, the attachment object and a
# reference to the extracted text. It appends one <sphinx:document> element
# to the document returned by sphinx_template().
303 my ($type, $attachment, $text) = (@_);
305 my $doc = sphinx_template();
# The element carries the attachment id as an attribute and the
# dereferenced text as a <content> child.
307 my $element = $doc->createElement('sphinx:document');
308 $element->setAttribute( id => $attachment->id );
309 $element->appendTextChild( content => $$text );
311 $doc->documentElement->appendChild( $element );
# Builds the XML skeleton for Sphinx: a UTF-8 XML 1.0 document whose root is
# <sphinx:docset>, containing a <sphinx:schema> with one <sphinx:field>
# named "content".
# NOTE(review): $doc is assigned without `my` here, so it is declared on a
# line not visible in this listing -- presumably so the document is built
# once and reused across calls; confirm.
315 sub sphinx_template {
319 $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
320 my $root = $doc->createElement('sphinx:docset');
321 $doc->setDocumentElement( $root );
323 my $schema = $doc->createElement('sphinx:schema');
324 $root->appendChild( $schema );
# One <sphinx:field> per listed name; currently only "content".
325 foreach ( qw(content) ) {
326 my $field = $doc->createElement('sphinx:field');
327 $field->setAttribute( name => $_ );
328 $schema->appendChild( $field );
# Body of the mysql finalizer (its sub header is not visible in this
# listing). Receives the attachment type and the collection, and writes the
# document returned by sphinx_template() to STDOUT; the second toFH argument
# is 1.
335 my ($type, $attachments) = @_;
336 sphinx_template()->toFH(*STDOUT, 1);
# Pg variant of last_indexed: finds the highest-id attachment of the given
# type that matches the index-column restriction below, returning 0 when
# there is none. ($type and $alias are declared on lines that are not
# visible in this listing, as is the final return.)
345 sub last_indexed_pg {
347 my $attachments = attachments( $type );
# When the index lives in a table other than Attachments, join to it.
349 if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
350 $alias = $attachments->Join(
353 TABLE2 => $fts_config->{'Table'},
# Restriction on the configured index column using the 'IS NOT' operator;
# the compared value is not visible here.
359 FIELD => $fts_config->{'Column'},
360 OPERATOR => 'IS NOT',
# Descending id order with a single-row page makes First the highest match.
363 $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
364 $attachments->RowsPerPage( 1 );
365 my $res = $attachments->First;
366 return 0 unless $res;
# Body of the Pg processor (its sub header and the branch keywords between
# the query assignments are not visible in this listing). Arguments: the
# attachment type, the attachment object and a reference to the extracted
# text. It stores to_tsvector(text) for the attachment in the configured
# table and column.
371 my ($type, $attachment, $text) = (@_);
373 my $dbh = $RT::Handle->dbh;
# Table and column names come from the FullTextSearch configuration and are
# interpolated into the SQL text; the values themselves are bound.
374 my $table = $fts_config->{'Table'};
375 my $column = $fts_config->{'Column'};
# Three query shapes are visible: update an existing row of the index table,
# insert a new row into it, or update the Attachments table directly.
379 if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
380 $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
382 $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
385 $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
# The eval keeps a failing statement from aborting the whole run.
388 my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
# Two error states are downgraded to warnings (SQLSTATE 54000 and 22021,
# both with driver error code 7); anything else is fatal.
390 if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
391 warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
392 } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
393 warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
395 die "error: ". $dbh->errstr;
398 # Insert an empty tsvector, so we count this row as "indexed"
399 # for purposes of knowing where to pick up
400 eval { $dbh->do( $query, undef, "", $attachment->id ) }
401 or die "Failed to insert empty tsvector: " . $dbh->errstr;
# Text variant of the collection builder: restricts the collection to
# attachments whose ContentType is 'text/plain'. ($res is assigned on a line
# that is not visible in this listing.)
405 sub attachments_text {
407 $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
# Body of the plain-text extractor (its sub header and final return are not
# visible in this listing). Reads the attachment's Content and returns undef
# when it is undefined or empty.
412 my $attachment = shift;
413 my $text = $attachment->Content;
414 return undef unless defined $text && length($text);
# HTML variant of the collection builder: restricts the collection to
# attachments whose ContentType is 'text/html'. ($res is assigned on a line
# that is not visible in this listing.)
418 sub attachments_html {
420 $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
# Body of the HTML filter (its sub header and closing lines are not visible
# in this listing). Takes the attachment object and returns 1 -- meaning
# "skip this attachment" to the caller's `next if filter(...)` -- when the
# attachment's parent is a multipart/alternative container.
# Fix: the content type literal was misspelled 'mulitpart/alternative', so
# the comparison could never match and HTML alternatives were never skipped.
425 my $attachment = shift;
426 if ( my $parent = $attachment->ParentObj ) {
427 # skip html parts that are alternatives
428 return 1 if $parent->id
429 && $parent->ContentType eq 'multipart/alternative';
# Body of the HTML extractor (its sub header and final return are not
# visible in this listing). Reads the attachment's Content, returns undef
# when it is undefined or empty, and otherwise decodes HTML entities in
# place; HTML::Entities is loaded on demand.
435 my $attachment = shift;
436 my $text = $attachment->Content;
437 return undef unless defined $text && length($text);
438 # the rich text editor generates html entities for characters
439 # but Pg doesn't index them, so decode to something it can index.
440 require HTML::Entities;
441 HTML::Entities::decode_entities($text);
# Body of the dispatcher (its sub header, the %args assignment and the final
# jump are not visible in this listing). It derives the name of a specialised
# sub from the name of the sub that called it plus a lower-cased suffix,
# e.g. "<caller>_<suffix>".
# caller(1) describes the frame of the sub that invoked this dispatcher;
# element 3 is that sub's fully qualified name.
448 my $func = (caller(1))[3];
450 my $call = $func ."_". lc $args{'suffix'};
# When no such sub is defined: return undef if no 'error' text was
# supplied, otherwise croak with it (Carp is loaded on demand).
451 unless ( defined &$call ) {
452 return undef unless $args{'error'};
453 require Carp; Carp::croak( $args{'error'} );
# Replace this call's argument list with the supplied 'arguments' array,
# presumably in preparation for transferring control to $call.
455 @_ = @{ $args{'arguments'} };
# Print the given message parts followed by a newline when the debug option
# is set in %OPT. Always returns 1.
sub debug {
    if ( $OPT{debug} ) {
        print @_, "\n";
    }
    return 1;
}
# Pass the arguments through _() and log the result at error level on
# $RT::Logger. Always returns 1.
sub error {
    my @message = _(@_);
    $RT::Logger->error(@message);
    return 1;
}
# Pass the arguments through _() and log the result at warn level on
# $RT::Logger. Always returns 1.
sub warning {
    my @message = _(@_);
    $RT::Logger->warn(@message);
    return 1;
}
467 rt-fulltext-indexer - Indexer for full text search
471 This is a helper script to keep full text indexes in sync with data.
472 Read F<docs/full_text_indexing.pod> for complete details on how and when
477 Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
478 Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>