X-Git-Url: http://git.freeside.biz/gitweb/?p=freeside.git;a=blobdiff_plain;f=rt%2Fsbin%2Frt-fulltext-indexer.in;h=53dc3fd2afdd5386ec098360b24a0d5a5aa74c22;hp=7e31cac84d3b5290c28996bd75de47a4abf4ead0;hb=de9d037528895f7151a9aead6724ce2df95f9586;hpb=a6fe07e49e3fc12169e801b1ed6874c3a5bd8500 diff --git a/rt/sbin/rt-fulltext-indexer.in b/rt/sbin/rt-fulltext-indexer.in index 7e31cac84..53dc3fd2a 100644 --- a/rt/sbin/rt-fulltext-indexer.in +++ b/rt/sbin/rt-fulltext-indexer.in @@ -3,7 +3,7 @@ # # COPYRIGHT: # -# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC +# This software is Copyright (c) 1996-2017 Best Practical Solutions, LLC # # # (Except where explicitly superseded by other copyright notices) @@ -48,81 +48,62 @@ # END BPS TAGGED BLOCK }}} use strict; use warnings; -no warnings 'once'; +use 5.010; # fix lib paths, some may be relative -BEGIN { +BEGIN { # BEGIN RT CMD BOILERPLATE require File::Spec; + require Cwd; my @libs = ("@RT_LIB_PATH@", "@LOCAL_LIB_PATH@"); my $bin_path; for my $lib (@libs) { unless ( File::Spec->file_name_is_absolute($lib) ) { - unless ($bin_path) { - if ( File::Spec->file_name_is_absolute(__FILE__) ) { - $bin_path = ( File::Spec->splitpath(__FILE__) )[1]; - } - else { - require FindBin; - no warnings "once"; - $bin_path = $FindBin::Bin; - } - } + $bin_path ||= ( File::Spec->splitpath(Cwd::abs_path(__FILE__)) )[1]; $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib ); } unshift @INC, $lib; } + } -BEGIN { - use RT; - RT::LoadConfig(); - RT::Init(); -}; +use RT -init; use RT::Interface::CLI (); +use HTML::Entities; -my %OPT = ( - help => 0, - debug => 0, -); -my @OPT_LIST = qw(help|h! debug!); +use Getopt::Long qw(GetOptions); +my %OPT = ( memory => '2M', limit => 0 ); +GetOptions( \%OPT, + "help|h!", + "debug!", + "quiet!", -my $db_type = RT->Config->Get('DatabaseType'); -if ( $db_type eq 'Pg' ) { - %OPT = ( - %OPT, - limit => 0, - all => 0, - ); - push @OPT_LIST, 'limit=i', 'all!'; -} -elsif ( $db_type eq 'mysql' ) { - %OPT = ( - %OPT, - limit => 0, - all => 0, - xmlpipe2 => 0, - ); - push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!'; -} -elsif ( $db_type eq 'Oracle' ) { - %OPT = ( - %OPT, - memory => '2M', - ); - push @OPT_LIST, qw(memory=s); -} + "all!", + "limit=i", -use Getopt::Long qw(GetOptions); -GetOptions( \%OPT, @OPT_LIST ); + "memory=s", +); +$OPT{limit} ||= 200; -if ( $OPT{'help'} ) { - RT::Interface::CLI->ShowHelp( - Sections => 'NAME|DESCRIPTION|'. uc($db_type), - ); +RT::Interface::CLI->ShowHelp if $OPT{help}; + +use Fcntl ':flock'; +if ( !flock main::DATA, LOCK_EX | LOCK_NB ) { + if ( $OPT{quiet} ) { + RT::Logger->info("$0 is already running; aborting silently, as requested"); + exit; + } + else { + print STDERR "$0 is already running\n"; + exit 1; + } } -my $fts_config = RT->Config->Get('FullTextSearch') || {}; +my $max_size = RT->Config->Get('MaxFulltextAttachmentSize'); + +my $db_type = RT->Config->Get('DatabaseType'); +my $fts_config = $ENV{RT_FTS_CONFIG} ? JSON::from_json($ENV{RT_FTS_CONFIG}) + : RT->Config->Get('FullTextSearch') || {}; unless ( $fts_config->{'Enable'} ) { print STDERR <{Sphinx} ) { + print STDERR <Limit( - FIELD => 'id', - OPERATOR => '>', - VALUE => last_indexed($type) - ); - $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' ); - $attachments->RowsPerPage( $OPT{'limit'} || 100 ); - - my $found = 0; - while ( my $a = $attachments->Next ) { - next if filter( $type, $a ); - debug("Found attachment #". $a->id ); - my $txt = extract($type, $a) or next; - $found++; - process( $type, $a, $txt ); - debug("Processed attachment #". $a->id ); - } - finalize( $type, $attachments ) if $found; - clean( $type ); - goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100) + exit 1; } -sub attachments { - my $type = shift; - my $res = RT::Attachments->new( RT->SystemUser ); - my $txn_alias = $res->Join( - ALIAS1 => 'main', - FIELD1 => 'TransactionId', - TABLE2 => 'Transactions', - FIELD2 => 'id', - ); - $res->Limit( - ALIAS => $txn_alias, - FIELD => 'ObjectType', - VALUE => 'RT::Ticket', - ); - my $ticket_alias = $res->Join( - ALIAS1 => $txn_alias, - FIELD1 => 'ObjectId', - TABLE2 => 'Tickets', - FIELD2 => 'id', - ); - $res->Limit( - ALIAS => $ticket_alias, - FIELD => 'Status', - OPERATOR => '!=', - VALUE => 'deleted' - ); - - return goto_specific( - suffix => $type, - error => "Don't know how to find $type attachments", - arguments => [$res], - ); +# Skip ACL checks. This saves a large number of unnecessary queries +# (for tickets, ACLs, and users) which are unnecessary, as we are +# running as the system user. +{ + no warnings 'redefine'; + no warnings 'once'; + *RT::Attachment::_Value = \&DBIx::SearchBuilder::Record::_Value; + *RT::Attachments::Next = \&DBIx::SearchBuilder::Next; } -sub last_indexed { - my ($type) = (@_); - return goto_specific( - suffix => $db_type, - error => "Don't know how to find last indexed $type attachment for $db_type DB", - arguments => \@_, - ); +my $LAST; +if ($db_type eq 'mysql') { + process_mysql(); +} elsif ($db_type eq 'Pg') { + process_pg(); } -sub filter { - my $type = shift; - return goto_specific( - suffix => $type, - arguments => \@_, - ); -} +sub attachment_loop { + my $subref = shift; + my $table = $fts_config->{'Table'}; + $LAST //= 0; + + # Fetch in batches of size --limit + { + # Indexes all text/plain and text/html attachments + my $attachments = RT::Attachments->new( RT->SystemUser ); + $attachments->Limit( + FIELD => 'ContentType', + OPERATOR => 'IN', + VALUE => ['text/plain', 'text/html'], + ); + $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST ); + $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' ); + $attachments->RowsPerPage( $OPT{'limit'} ); -sub extract { - my $type = shift; - return goto_specific( - suffix => $type, - error => "No way to convert $type attachment into text", - arguments => \@_, - ); -} + # Call back to the DB-specific part + $subref->($attachments); -sub process { - return goto_specific( - suffix => $db_type, - error => "No processer for $db_type DB", - arguments => \@_, - ); -} + $LAST = $attachments->Last->id if $attachments->Count; -sub finalize { - return goto_specific( - suffix => $db_type, - arguments => \@_, - ); + redo if $OPT{'all'} and $attachments->Count == $OPT{'limit'}; + } } -sub clean { - return goto_specific( - suffix => $db_type, - arguments => \@_, - ); -} +sub process_bulk_insert { + my $dbh = $RT::Handle->dbh; + my ($statement, $error) = @_; + + # Doing large inserts is faster than individual statements, but + # comes at a parsing cost; cache the statement handles (99% of which + # will be the same size) for a notable (2x) speed gain. + my %sthandles; + + $sthandles{1} = + $dbh->prepare($statement->(1)); + + attachment_loop( sub { + my ($attachments) = @_; + my @insert; + my $found = 0; + + while ( my $a = $attachments->Next ) { + debug("Found attachment #". $a->id ); + if ( $max_size and $a->ContentLength > $max_size ){ + debug("Attachment #" . $a->id . " is " . $a->ContentLength . + " bytes which is larger than configured MaxFulltextAttachmentSize " . + " of " . $max_size . ", skipping"); + next; + } -{ -sub last_indexed_mysql { - my $type = shift; - my $attr = $RT::System->FirstAttribute('LastIndexedAttachments'); - return 0 unless $attr; - return 0 unless exists $attr->{ $type }; - return $attr->{ $type } || 0; -} + my $text = $a->Content // ""; + HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html"; + push @insert, $text, $a->id; + $found++; + } + return unless $found; -sub process_mysql { - my ($type, $attachment, $text) = (@_); + # $found should be the limit size on all but the last go-around. + $sthandles{$found} ||= $dbh->prepare($statement->($found)); - my $doc = sphinx_template(); + return if eval { $sthandles{$found}->execute(@insert); }; - my $element = $doc->createElement('sphinx:document'); - $element->setAttribute( id => $attachment->id ); - $element->appendTextChild( content => $$text ); + # We can catch and recover from some errors; re-do row-by-row to + # know which row had which errors + while (@insert) { + my ($content, $id) = splice(@insert,0,2); + next if eval { $sthandles{1}->execute($content, $id); }; + $error->($id, $content); - $doc->documentElement->appendChild( $element ); + # If this was a semi-expected error, insert an empty + # tsvector, so we count this row as "indexed" for + # purposes of knowing where to pick up + eval { $sthandles{1}->execute( "", $id ) } + or die "Failed to insert empty row for attachment $id: " . $dbh->errstr; + } + }); } -my $doc = undef; -sub sphinx_template { - return $doc if $doc; - - require XML::LibXML; - $doc = XML::LibXML::Document->new('1.0', 'UTF-8'); - my $root = $doc->createElement('sphinx:docset'); - $doc->setDocumentElement( $root ); - - my $schema = $doc->createElement('sphinx:schema'); - $root->appendChild( $schema ); - foreach ( qw(content) ) { - my $field = $doc->createElement('sphinx:field'); - $field->setAttribute( name => $_ ); - $schema->appendChild( $field ); - } - - return $doc; -} +sub process_mysql { + my $dbh = $RT::Handle->dbh; + my $table = $fts_config->{'Table'}; -sub finalize_mysql { - my ($type, $attachments) = @_; - sphinx_template()->toFH(*STDOUT, 1); + ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table"); + + my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT"; + + process_bulk_insert( + sub { + my ($n) = @_; + return "$insert INTO $table(Content, id) VALUES " + . join(", ", ("(?,?)") x $n); + }, + sub { + my ($id) = @_; + if ($dbh->err == 1366 and $dbh->state eq "HY000") { + warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ". + "Error: ". $dbh->errstr; + } else { + die "Attachment $id cannot be indexed: " . $dbh->errstr; + } + } + ); } -sub clean_mysql { - $doc = undef; -} +sub process_pg { + if ( $fts_config->{'Table'} ne 'Attachments' ) { + process_pg_insert(); + } else { + process_pg_update(); + } } -sub last_indexed_pg { - my $type = shift; - my $attachments = attachments( $type ); - my $alias = 'main'; - if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) { - $alias = $attachments->Join( - TYPE => 'left', - FIELD1 => 'id', - TABLE2 => $fts_config->{'Table'}, - FIELD2 => 'id', - ); - } - $attachments->Limit( - ALIAS => $alias, - FIELD => $fts_config->{'Column'}, - OPERATOR => 'IS NOT', - VALUE => 'NULL', +sub process_pg_insert { + my $dbh = $RT::Handle->dbh; + my $table = $fts_config->{'Table'}; + my $column = $fts_config->{'Column'}; + ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table"); + + process_bulk_insert( + sub { + my ($n) = @_; + return "INSERT INTO $table($column, id) VALUES " + . join(", ", ("(TO_TSVECTOR(?),?)") x $n); + }, + sub { + my ($id) = @_; + if ( $dbh->err == 7 && $dbh->state eq '54000' ) { + warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ". + "Error: ". $dbh->errstr; + } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) { + warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ". + "Error: ". $dbh->errstr; + } else { + die "Attachment $id cannot be indexed: " . $dbh->errstr; + } + } ); - $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' ); - $attachments->RowsPerPage( 1 ); - my $res = $attachments->First; - return 0 unless $res; - return $res->id; } -sub process_pg { - my ($type, $attachment, $text) = (@_); - +sub process_pg_update { my $dbh = $RT::Handle->dbh; - my $table = $fts_config->{'Table'}; my $column = $fts_config->{'Column'}; - my $query; - if ( $table ) { - if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) { - $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"; - } else { - $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)"; - } - } else { - $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?"; - } + ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL"); - my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) }; - unless ( $status ) { - if ($dbh->errstr =~ /string is too long for tsvector/) { - warn "Attachment @{[$attachment->id]} not indexed, as it contains too many unique words to be indexed"; - } else { - die "error: ". $dbh->errstr; - } - } -} + my $sth = $dbh->prepare("UPDATE Attachments SET $column = TO_TSVECTOR(?) WHERE id = ?"); -sub attachments_text { - my $res = shift; - $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' ); - return $res; -} + attachment_loop( sub { + my ($attachments) = @_; + my @insert; -sub extract_text { - my $attachment = shift; - my $text = $attachment->Content; - return undef unless defined $text && length($text); - return \$text; -} + while ( my $a = $attachments->Next ) { + debug("Found attachment #". $a->id ); -sub attachments_html { - my $res = shift; - $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' ); - return $res; -} + if ( $max_size and $a->ContentLength > $max_size ){ + debug("Attachment #" . $a->id . " is " . $a->ContentLength . + " bytes which is larger than configured MaxFulltextAttachmentSize " . + " of " . $max_size . ", skipping"); + next; + } -sub filter_html { - my $attachment = shift; - if ( my $parent = $attachment->ParentObj ) { -# skip html parts that are alternatives - return 1 if $parent->id - && $parent->ContentType eq 'mulitpart/alternative'; - } - return 0; -} + my $text = $a->Content // ""; + HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html"; -sub extract_html { - my $attachment = shift; - my $text = $attachment->Content; - return undef unless defined $text && length($text); -# TODO: html -> text - return \$text; -} + push @insert, [$text, $a->id]; + } -sub goto_specific { - my %args = (@_); + # Try in one database transaction; if it fails, we roll it back + # and try one statement at a time. + $dbh->begin_work; + my $ok = 1; + for (@insert) { + $ok = eval { $sth->execute( $_->[0], $_->[1] ) }; + last unless $ok; + } + if ($ok) { + $dbh->commit; + return; + } + $dbh->rollback; + + # Things didn't go well. Retry the UPDATE statements one row at + # a time, outside of the transaction. + for (@insert) { + my ($content, $id) = ($_->[0], $_->[1]); + next if eval { $sth->execute( $content, $id ) }; + if ( $dbh->err == 7 && $dbh->state eq '54000' ) { + warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ". + "Error: ". $dbh->errstr; + } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) { + warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ". + "Error: ". $dbh->errstr; + } else { + die "Attachment $id cannot be indexed: " . $dbh->errstr; + } - my $func = (caller(1))[3]; - $func =~ s/.*:://; - my $call = $func ."_". lc $args{'suffix'}; - unless ( defined &$call ) { - return undef unless $args{'error'}; - require Carp; Carp::croak( $args{'error'} ); - } - @_ = @{ $args{'arguments'} }; - goto &$call; + # If this was a semi-expected error, insert an empty + # tsvector, so we count this row as "indexed" for + # purposes of knowing where to pick up + eval { $sth->execute( "", $id ) } + or die "Failed to insert empty row for attachment $id: " . $dbh->errstr; + } + }); } @@ -444,10 +385,6 @@ This is a helper script to keep full text indexes in sync with data. Read F for complete details on how and when to run it. -=head1 AUTHOR - -Ruslan Zakirov Eruz@bestpractical.comE, -Alex Vandiver Ealexmv@bestpractical.comE - =cut +__DATA__