#
# COPYRIGHT:
#
-# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC
+# This software is Copyright (c) 1996-2016 Best Practical Solutions, LLC
# <sales@bestpractical.com>
#
# (Except where explicitly superseded by other copyright notices)
# END BPS TAGGED BLOCK }}}
use strict;
use warnings;
-no warnings 'once';
+use 5.010;
# fix lib paths, some may be relative
-BEGIN {
+BEGIN { # BEGIN RT CMD BOILERPLATE
require File::Spec;
+ require Cwd;
my @libs = ("@RT_LIB_PATH@", "@LOCAL_LIB_PATH@");
my $bin_path;
for my $lib (@libs) {
unless ( File::Spec->file_name_is_absolute($lib) ) {
- unless ($bin_path) {
- if ( File::Spec->file_name_is_absolute(__FILE__) ) {
- $bin_path = ( File::Spec->splitpath(__FILE__) )[1];
- }
- else {
- require FindBin;
- no warnings "once";
- $bin_path = $FindBin::Bin;
- }
- }
+ $bin_path ||= ( File::Spec->splitpath(Cwd::abs_path(__FILE__)) )[1];
$lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
}
unshift @INC, $lib;
}
+
}
-BEGIN {
- use RT;
- RT::LoadConfig();
- RT::Init();
-};
+use RT -init;
use RT::Interface::CLI ();
+use HTML::Entities;
-my %OPT = (
- help => 0,
- debug => 0,
-);
-my @OPT_LIST = qw(help|h! debug!);
+use Getopt::Long qw(GetOptions);
+my %OPT = ( memory => '2M', limit => 0 );
+GetOptions( \%OPT,
+ "help|h!",
+ "debug!",
+ "quiet!",
-my $db_type = RT->Config->Get('DatabaseType');
-if ( $db_type eq 'Pg' ) {
- %OPT = (
- %OPT,
- limit => 0,
- all => 0,
- );
- push @OPT_LIST, 'limit=i', 'all!';
-}
-elsif ( $db_type eq 'mysql' ) {
- %OPT = (
- %OPT,
- limit => 0,
- all => 0,
- xmlpipe2 => 0,
- );
- push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
-}
-elsif ( $db_type eq 'Oracle' ) {
- %OPT = (
- %OPT,
- memory => '2M',
- );
- push @OPT_LIST, qw(memory=s);
-}
+ "all!",
+ "limit=i",
-use Getopt::Long qw(GetOptions);
-GetOptions( \%OPT, @OPT_LIST );
+ "memory=s",
+);
+$OPT{limit} ||= 200;
-if ( $OPT{'help'} ) {
- RT::Interface::CLI->ShowHelp(
- Sections => 'NAME|DESCRIPTION|'. uc($db_type),
- );
+RT::Interface::CLI->ShowHelp if $OPT{help};
+
+use Fcntl ':flock';
+if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
+ if ( $OPT{quiet} ) {
+ RT::Logger->info("$0 is already running; aborting silently, as requested");
+ exit;
+ }
+ else {
+ print STDERR "$0 is already running\n";
+ exit 1;
+ }
}
-my $fts_config = RT->Config->Get('FullTextSearch') || {};
+my $max_size = RT->Config->Get('MaxFulltextAttachmentSize');
+
+my $db_type = RT->Config->Get('DatabaseType');
+my $fts_config = $ENV{RT_FTS_CONFIG} ? JSON::from_json($ENV{RT_FTS_CONFIG})
+ : RT->Config->Get('FullTextSearch') || {};
unless ( $fts_config->{'Enable'} ) {
print STDERR <<EOT;
$index, $OPT{'memory'}
);
exit;
-} elsif ( $db_type eq 'mysql' ) {
- unless ($OPT{'xmlpipe2'}) {
- print STDERR <<EOT;
+} elsif ( $fts_config->{Sphinx} ) {
+ print STDERR <<EOT;
Updates to the external Sphinx index are done via running the sphinx
`indexer` tool:
indexer rt
EOT
- exit 1;
- }
-}
-
-my @types = qw(text html);
-foreach my $type ( @types ) {
- REDO:
- my $attachments = attachments($type);
- $attachments->Limit(
- FIELD => 'id',
- OPERATOR => '>',
- VALUE => last_indexed($type)
- );
- $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
- $attachments->RowsPerPage( $OPT{'limit'} || 100 );
-
- my $found = 0;
- while ( my $a = $attachments->Next ) {
- next if filter( $type, $a );
- debug("Found attachment #". $a->id );
- my $txt = extract($type, $a) or next;
- $found++;
- process( $type, $a, $txt );
- debug("Processed attachment #". $a->id );
- }
- finalize( $type, $attachments ) if $found;
- clean( $type );
- goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
+ exit 1;
}
-sub attachments {
- my $type = shift;
- my $res = RT::Attachments->new( RT->SystemUser );
- my $txn_alias = $res->Join(
- ALIAS1 => 'main',
- FIELD1 => 'TransactionId',
- TABLE2 => 'Transactions',
- FIELD2 => 'id',
- );
- $res->Limit(
- ALIAS => $txn_alias,
- FIELD => 'ObjectType',
- VALUE => 'RT::Ticket',
- );
- my $ticket_alias = $res->Join(
- ALIAS1 => $txn_alias,
- FIELD1 => 'ObjectId',
- TABLE2 => 'Tickets',
- FIELD2 => 'id',
- );
- $res->Limit(
- ALIAS => $ticket_alias,
- FIELD => 'Status',
- OPERATOR => '!=',
- VALUE => 'deleted'
- );
-
- # On newer DBIx::SearchBuilder's, indicate that making the query DISTINCT
- # is unnecessary because the joins won't produce duplicates. This
- # drastically improves performance when fetching attachments.
- $res->{joins_are_distinct} = 1;
-
- return goto_specific(
- suffix => $type,
- error => "Don't know how to find $type attachments",
- arguments => [$res],
- );
+# Skip ACL checks. This avoids a large number of queries (for
+# tickets, ACLs, and users) which are unnecessary, as we are
+# running as the system user.
+{
+ no warnings 'redefine';
+ no warnings 'once';
+ *RT::Attachment::_Value = \&DBIx::SearchBuilder::Record::_Value;
+ *RT::Attachments::Next = \&DBIx::SearchBuilder::Next;
}
-sub last_indexed {
- my ($type) = (@_);
- return goto_specific(
- suffix => $db_type,
- error => "Don't know how to find last indexed $type attachment for $db_type DB",
- arguments => \@_,
- );
+my $LAST;
+if ($db_type eq 'mysql') {
+ process_mysql();
+} elsif ($db_type eq 'Pg') {
+ process_pg();
}
-sub filter {
- my $type = shift;
- return goto_specific(
- suffix => $type,
- arguments => \@_,
- );
-}
+sub attachment_loop {
+ my $subref = shift;
+ my $table = $fts_config->{'Table'};
+ $LAST //= 0;
+
+ # Fetch in batches of size --limit
+ {
+ # Indexes all text/plain and text/html attachments
+ my $attachments = RT::Attachments->new( RT->SystemUser );
+ $attachments->Limit(
+ FIELD => 'ContentType',
+ OPERATOR => 'IN',
+ VALUE => ['text/plain', 'text/html'],
+ );
+ $attachments->Limit( FIELD => 'id', OPERATOR => '>', VALUE => $LAST );
+ $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
+ $attachments->RowsPerPage( $OPT{'limit'} );
-sub extract {
- my $type = shift;
- return goto_specific(
- suffix => $type,
- error => "No way to convert $type attachment into text",
- arguments => \@_,
- );
-}
+ # Call back to the DB-specific part
+ $subref->($attachments);
-sub process {
- return goto_specific(
- suffix => $db_type,
- error => "No processer for $db_type DB",
- arguments => \@_,
- );
-}
+ $LAST = $attachments->Last->id if $attachments->Count;
-sub finalize {
- return goto_specific(
- suffix => $db_type,
- arguments => \@_,
- );
+ redo if $OPT{'all'} and $attachments->Count == $OPT{'limit'};
+ }
}
-sub clean {
- return goto_specific(
- suffix => $db_type,
- arguments => \@_,
- );
-}
+sub process_bulk_insert {
+ my $dbh = $RT::Handle->dbh;
+ my ($statement, $error) = @_;
+
+ # Doing large inserts is faster than individual statements, but
+ # comes at a parsing cost; cache the statement handles (99% of which
+ # will be the same size) for a notable (2x) speed gain.
+ my %sthandles;
+
+ $sthandles{1} =
+ $dbh->prepare($statement->(1));
+
+ attachment_loop( sub {
+ my ($attachments) = @_;
+ my @insert;
+ my $found = 0;
+
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
+ if ( $max_size and $a->ContentLength > $max_size ){
+ debug("Attachment #" . $a->id . " is " . $a->ContentLength .
+ " bytes which is larger than configured MaxFulltextAttachmentSize" .
+ " of " . $max_size . ", skipping");
+ next;
+ }
-{
-sub last_indexed_mysql {
- my $type = shift;
- my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
- return 0 unless $attr;
- return 0 unless exists $attr->{ $type };
- return $attr->{ $type } || 0;
-}
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
+ push @insert, $text, $a->id;
+ $found++;
+ }
+ return unless $found;
-sub process_mysql {
- my ($type, $attachment, $text) = (@_);
+ # $found should be the limit size on all but the last go-around.
+ $sthandles{$found} ||= $dbh->prepare($statement->($found));
- my $doc = sphinx_template();
+ return if eval { $sthandles{$found}->execute(@insert); };
- my $element = $doc->createElement('sphinx:document');
- $element->setAttribute( id => $attachment->id );
- $element->appendTextChild( content => $$text );
+ # We can catch and recover from some errors; retry row-by-row to
+ # learn which row caused which error
+ while (@insert) {
+ my ($content, $id) = splice(@insert,0,2);
+ next if eval { $sthandles{1}->execute($content, $id); };
+ $error->($id, $content);
- $doc->documentElement->appendChild( $element );
+ # If this was a semi-expected error, insert an empty
+ # tsvector, so we count this row as "indexed" for
+ # purposes of knowing where to pick up
+ eval { $sthandles{1}->execute( "", $id ) }
+ or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
+ }
+ });
}
-my $doc = undef;
-sub sphinx_template {
- return $doc if $doc;
-
- require XML::LibXML;
- $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
- my $root = $doc->createElement('sphinx:docset');
- $doc->setDocumentElement( $root );
-
- my $schema = $doc->createElement('sphinx:schema');
- $root->appendChild( $schema );
- foreach ( qw(content) ) {
- my $field = $doc->createElement('sphinx:field');
- $field->setAttribute( name => $_ );
- $schema->appendChild( $field );
- }
-
- return $doc;
-}
+sub process_mysql {
+ my $dbh = $RT::Handle->dbh;
+ my $table = $fts_config->{'Table'};
-sub finalize_mysql {
- my ($type, $attachments) = @_;
- sphinx_template()->toFH(*STDOUT, 1);
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
+ my $insert = $fts_config->{Engine} eq "MyISAM" ? "INSERT DELAYED" : "INSERT";
+
+ process_bulk_insert(
+ sub {
+ my ($n) = @_;
+ return "$insert INTO $table(Content, id) VALUES "
+ . join(", ", ("(?,?)") x $n);
+ },
+ sub {
+ my ($id) = @_;
+ if ($dbh->err == 1366 and $dbh->state eq "HY000") {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
+ }
+ }
+ );
}
-sub clean_mysql {
- $doc = undef;
-}
+sub process_pg {
+ if ( $fts_config->{'Table'} ne 'Attachments' ) {
+ process_pg_insert();
+ } else {
+ process_pg_update();
+ }
}
-sub last_indexed_pg {
- my $type = shift;
- my $attachments = attachments( $type );
- my $alias = 'main';
- if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
- $alias = $attachments->Join(
- TYPE => 'left',
- FIELD1 => 'id',
- TABLE2 => $fts_config->{'Table'},
- FIELD2 => 'id',
- );
- }
- $attachments->Limit(
- ALIAS => $alias,
- FIELD => $fts_config->{'Column'},
- OPERATOR => 'IS NOT',
- VALUE => 'NULL',
+sub process_pg_insert {
+ my $dbh = $RT::Handle->dbh;
+ my $table = $fts_config->{'Table'};
+ my $column = $fts_config->{'Column'};
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM $table");
+
+ process_bulk_insert(
+ sub {
+ my ($n) = @_;
+ return "INSERT INTO $table($column, id) VALUES "
+ . join(", ", ("(TO_TSVECTOR(?),?)") x $n);
+ },
+ sub {
+ my ($id) = @_;
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+ "Error: ". $dbh->errstr;
+ } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
+ }
+ }
);
- $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
- $attachments->RowsPerPage( 1 );
- my $res = $attachments->First;
- return 0 unless $res;
- return $res->id;
}
-sub process_pg {
- my ($type, $attachment, $text) = (@_);
-
+sub process_pg_update {
my $dbh = $RT::Handle->dbh;
- my $table = $fts_config->{'Table'};
my $column = $fts_config->{'Column'};
- my $query;
- if ( $table ) {
- if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
- $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
- } else {
- $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
- }
- } else {
- $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
- }
+ ($LAST) = $dbh->selectrow_array("SELECT MAX(id) FROM Attachments WHERE $column IS NOT NULL");
- my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
- unless ( $status ) {
- if ($dbh->errstr =~ /string is too long for tsvector/) {
- warn "Attachment @{[$attachment->id]} not indexed, as it contains too many unique words to be indexed";
- } else {
- die "error: ". $dbh->errstr;
- }
- }
-}
+ my $sth = $dbh->prepare("UPDATE Attachments SET $column = TO_TSVECTOR(?) WHERE id = ?");
-sub attachments_text {
- my $res = shift;
- $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
- return $res;
-}
+ attachment_loop( sub {
+ my ($attachments) = @_;
+ my @insert;
-sub extract_text {
- my $attachment = shift;
- my $text = $attachment->Content;
- return undef unless defined $text && length($text);
- return \$text;
-}
+ while ( my $a = $attachments->Next ) {
+ debug("Found attachment #". $a->id );
-sub attachments_html {
- my $res = shift;
- $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
- return $res;
-}
+ if ( $max_size and $a->ContentLength > $max_size ){
+ debug("Attachment #" . $a->id . " is " . $a->ContentLength .
+ " bytes which is larger than configured MaxFulltextAttachmentSize" .
+ " of " . $max_size . ", skipping");
+ next;
+ }
-sub filter_html {
- my $attachment = shift;
- if ( my $parent = $attachment->ParentObj ) {
-# skip html parts that are alternatives
- return 1 if $parent->id
- && $parent->ContentType eq 'mulitpart/alternative';
- }
- return 0;
-}
+ my $text = $a->Content // "";
+ HTML::Entities::decode_entities($text) if $a->ContentType eq "text/html";
-sub extract_html {
- my $attachment = shift;
- my $text = $attachment->Content;
- return undef unless defined $text && length($text);
-# TODO: html -> text
- return \$text;
-}
+ push @insert, [$text, $a->id];
+ }
-sub goto_specific {
- my %args = (@_);
+ # Try in one database transaction; if it fails, we roll it back
+ # and try one statement at a time.
+ $dbh->begin_work;
+ my $ok = 1;
+ for (@insert) {
+ $ok = eval { $sth->execute( $_->[0], $_->[1] ) };
+ last unless $ok;
+ }
+ if ($ok) {
+ $dbh->commit;
+ return;
+ }
+ $dbh->rollback;
+
+ # Things didn't go well. Retry the UPDATE statements one row at
+ # a time, outside of the transaction.
+ for (@insert) {
+ my ($content, $id) = ($_->[0], $_->[1]);
+ next if eval { $sth->execute( $content, $id ) };
+ if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains too many unique words. ".
+ "Error: ". $dbh->errstr;
+ } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
+ warn "Attachment $id cannot be indexed. Most probably it contains invalid UTF8 bytes. ".
+ "Error: ". $dbh->errstr;
+ } else {
+ die "Attachment $id cannot be indexed: " . $dbh->errstr;
+ }
- my $func = (caller(1))[3];
- $func =~ s/.*:://;
- my $call = $func ."_". lc $args{'suffix'};
- unless ( defined &$call ) {
- return undef unless $args{'error'};
- require Carp; Carp::croak( $args{'error'} );
- }
- @_ = @{ $args{'arguments'} };
- goto &$call;
+ # If this was a semi-expected error, insert an empty
+ # tsvector, so we count this row as "indexed" for
+ # purposes of knowing where to pick up
+ eval { $sth->execute( "", $id ) }
+ or die "Failed to insert empty row for attachment $id: " . $dbh->errstr;
+ }
+ });
}
Read F<docs/full_text_indexing.pod> for complete details on how and when
to run it.
-=head1 AUTHOR
-
-Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
-Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>
-
=cut
+__DATA__