X-Git-Url: http://git.freeside.biz/gitweb/?a=blobdiff_plain;f=rt%2Fsbin%2Frt-fulltext-indexer;fp=rt%2Fsbin%2Frt-fulltext-indexer;h=cdcc78e157a9e2115170a8bc7040c3f84f3b3638;hb=5b3efac57771fbc37874a3dd39d3df835cdd6133;hp=0000000000000000000000000000000000000000;hpb=008524b8e963831999983769f7fec11f55a72f16;p=freeside.git diff --git a/rt/sbin/rt-fulltext-indexer b/rt/sbin/rt-fulltext-indexer new file mode 100755 index 000000000..cdcc78e15 --- /dev/null +++ b/rt/sbin/rt-fulltext-indexer @@ -0,0 +1,479 @@ +#!/usr/bin/perl +# BEGIN BPS TAGGED BLOCK {{{ +# +# COPYRIGHT: +# +# This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC +# +# +# (Except where explicitly superseded by other copyright notices) +# +# +# LICENSE: +# +# This work is made available to you under the terms of Version 2 of +# the GNU General Public License. A copy of that license should have +# been provided with this software, but in any event can be snarfed +# from www.gnu.org. +# +# This work is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301 or visit their web page on the internet at +# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html. +# +# +# CONTRIBUTION SUBMISSION POLICY: +# +# (The following paragraph is not intended to limit the rights granted +# to you to modify and distribute this software under the terms of +# the GNU General Public License and is only of importance to you if +# you choose to contribute your changes and enhancements to the +# community by submitting them to Best Practical Solutions, LLC.) +# +# By intentionally submitting any modifications, corrections or +# derivatives to this work, or any other work intended for use with +# Request Tracker, to Best Practical Solutions, LLC, you confirm that +# you are the copyright holder for those contributions and you grant +# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable, +# royalty-free, perpetual, license to use, copy, create derivative +# works based on those contributions, and sublicense and distribute +# those contributions and any derivatives thereof. +# +# END BPS TAGGED BLOCK }}} +use strict; +use warnings; +no warnings 'once'; + +# fix lib paths, some may be relative +BEGIN { + require File::Spec; + my @libs = ("/opt/rt3/lib", "/opt/rt3/local/lib"); + my $bin_path; + + for my $lib (@libs) { + unless ( File::Spec->file_name_is_absolute($lib) ) { + unless ($bin_path) { + if ( File::Spec->file_name_is_absolute(__FILE__) ) { + $bin_path = ( File::Spec->splitpath(__FILE__) )[1]; + } + else { + require FindBin; + no warnings "once"; + $bin_path = $FindBin::Bin; + } + } + $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib ); + } + unshift @INC, $lib; + } +} + +BEGIN { + use RT; + RT::LoadConfig(); + RT::Init(); +}; +use RT::Interface::CLI (); + +my %OPT = ( + help => 0, + debug => 0, + quiet => 0, +); +my @OPT_LIST = qw(help|h! debug! quiet); + +my $db_type = RT->Config->Get('DatabaseType'); +if ( $db_type eq 'Pg' ) { + %OPT = ( + %OPT, + limit => 0, + all => 0, + ); + push @OPT_LIST, 'limit=i', 'all!'; +} +elsif ( $db_type eq 'mysql' ) { + %OPT = ( + %OPT, + limit => 0, + all => 0, + xmlpipe2 => 0, + ); + push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!'; +} +elsif ( $db_type eq 'Oracle' ) { + %OPT = ( + %OPT, + memory => '2M', + ); + push @OPT_LIST, qw(memory=s); +} + +use Getopt::Long qw(GetOptions); +GetOptions( \%OPT, @OPT_LIST ); + +if ( $OPT{'help'} ) { + RT::Interface::CLI->ShowHelp( + Sections => 'NAME|DESCRIPTION|'. uc($db_type), + ); +} + +use Fcntl ':flock'; +if ( !flock main::DATA, LOCK_EX | LOCK_NB ) { + if ( $OPT{quiet} ) { + RT::Logger->info("$0 is already running; aborting silently, as requested"); + exit; + } + else { + print STDERR "$0 is already running\n"; + exit 1; + } +} + +my $fts_config = RT->Config->Get('FullTextSearch') || {}; +unless ( $fts_config->{'Enable'} ) { + print STDERR <{'Indexed'} ) { + print STDERR <{'IndexName'} || 'rt_fts_index'; + $RT::Handle->dbh->do( + "begin ctx_ddl.sync_index(?, ?); end;", undef, + $index, $OPT{'memory'} + ); + exit; +} elsif ( $db_type eq 'mysql' ) { + unless ($OPT{'xmlpipe2'}) { + print STDERR <Limit( + FIELD => 'id', + OPERATOR => '>', + VALUE => last_indexed($type) + ); + $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' ); + $attachments->RowsPerPage( $OPT{'limit'} || 100 ); + + my $found = 0; + while ( my $a = $attachments->Next ) { + next if filter( $type, $a ); + debug("Found attachment #". $a->id ); + my $txt = extract($type, $a) or next; + $found++; + process( $type, $a, $txt ); + debug("Processed attachment #". $a->id ); + } + finalize( $type, $attachments ) if $found; + clean( $type ); + goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100) +} + +sub attachments { + my $type = shift; + my $res = RT::Attachments->new( RT->SystemUser ); + my $txn_alias = $res->Join( + ALIAS1 => 'main', + FIELD1 => 'TransactionId', + TABLE2 => 'Transactions', + FIELD2 => 'id', + ); + $res->Limit( + ALIAS => $txn_alias, + FIELD => 'ObjectType', + VALUE => 'RT::Ticket', + ); + my $ticket_alias = $res->Join( + ALIAS1 => $txn_alias, + FIELD1 => 'ObjectId', + TABLE2 => 'Tickets', + FIELD2 => 'id', + ); + $res->Limit( + ALIAS => $ticket_alias, + FIELD => 'Status', + OPERATOR => '!=', + VALUE => 'deleted' + ); + + # On newer DBIx::SearchBuilder's, indicate that making the query DISTINCT + # is unnecessary because the joins won't produce duplicates. This + # drastically improves performance when fetching attachments. + $res->{joins_are_distinct} = 1; + + return goto_specific( + suffix => $type, + error => "Don't know how to find $type attachments", + arguments => [$res], + ); +} + +sub last_indexed { + my ($type) = (@_); + return goto_specific( + suffix => $db_type, + error => "Don't know how to find last indexed $type attachment for $db_type DB", + arguments => \@_, + ); +} + +sub filter { + my $type = shift; + return goto_specific( + suffix => $type, + arguments => \@_, + ); +} + +sub extract { + my $type = shift; + return goto_specific( + suffix => $type, + error => "No way to convert $type attachment into text", + arguments => \@_, + ); +} + +sub process { + return goto_specific( + suffix => $db_type, + error => "No processer for $db_type DB", + arguments => \@_, + ); +} + +sub finalize { + return goto_specific( + suffix => $db_type, + arguments => \@_, + ); +} + +sub clean { + return goto_specific( + suffix => $db_type, + arguments => \@_, + ); +} + +{ +sub last_indexed_mysql { + my $type = shift; + my $attr = $RT::System->FirstAttribute('LastIndexedAttachments'); + return 0 unless $attr; + return 0 unless exists $attr->{ $type }; + return $attr->{ $type } || 0; +} + +sub process_mysql { + my ($type, $attachment, $text) = (@_); + + my $doc = sphinx_template(); + + my $element = $doc->createElement('sphinx:document'); + $element->setAttribute( id => $attachment->id ); + $element->appendTextChild( content => $$text ); + + $doc->documentElement->appendChild( $element ); +} + +my $doc = undef; +sub sphinx_template { + return $doc if $doc; + + require XML::LibXML; + $doc = XML::LibXML::Document->new('1.0', 'UTF-8'); + my $root = $doc->createElement('sphinx:docset'); + $doc->setDocumentElement( $root ); + + my $schema = $doc->createElement('sphinx:schema'); + $root->appendChild( $schema ); + foreach ( qw(content) ) { + my $field = $doc->createElement('sphinx:field'); + $field->setAttribute( name => $_ ); + $schema->appendChild( $field ); + } + + return $doc; +} + +sub finalize_mysql { + my ($type, $attachments) = @_; + sphinx_template()->toFH(*STDOUT, 1); +} + +sub clean_mysql { + $doc = undef; +} + +} + +sub last_indexed_pg { + my $type = shift; + my $attachments = attachments( $type ); + my $alias = 'main'; + if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) { + $alias = $attachments->Join( + TYPE => 'left', + FIELD1 => 'id', + TABLE2 => $fts_config->{'Table'}, + FIELD2 => 'id', + ); + } + $attachments->Limit( + ALIAS => $alias, + FIELD => $fts_config->{'Column'}, + OPERATOR => 'IS NOT', + VALUE => 'NULL', + ); + $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' ); + $attachments->RowsPerPage( 1 ); + my $res = $attachments->First; + return 0 unless $res; + return $res->id; +} + +sub process_pg { + my ($type, $attachment, $text) = (@_); + + my $dbh = $RT::Handle->dbh; + my $table = $fts_config->{'Table'}; + my $column = $fts_config->{'Column'}; + + my $query; + if ( $table ) { + if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) { + $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"; + } else { + $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)"; + } + } else { + $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?"; + } + + my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) }; + unless ( $status ) { + if ( $dbh->err == 7 && $dbh->state eq '54000' ) { + warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr; + } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) { + warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr; + } else { + die "error: ". $dbh->errstr; + } + + # Insert an empty tsvector, so we count this row as "indexed" + # for purposes of knowing where to pick up + eval { $dbh->do( $query, undef, "", $attachment->id ) } + or die "Failed to insert empty tsvector: " . $dbh->errstr; + } +} + +sub attachments_text { + my $res = shift; + $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' ); + return $res; +} + +sub extract_text { + my $attachment = shift; + my $text = $attachment->Content; + return undef unless defined $text && length($text); + return \$text; +} + +sub attachments_html { + my $res = shift; + $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' ); + return $res; +} + +sub filter_html { + my $attachment = shift; + if ( my $parent = $attachment->ParentObj ) { +# skip html parts that are alternatives + return 1 if $parent->id + && $parent->ContentType eq 'mulitpart/alternative'; + } + return 0; +} + +sub extract_html { + my $attachment = shift; + my $text = $attachment->Content; + return undef unless defined $text && length($text); +# TODO: html -> text + return \$text; +} + +sub goto_specific { + my %args = (@_); + + my $func = (caller(1))[3]; + $func =~ s/.*:://; + my $call = $func ."_". lc $args{'suffix'}; + unless ( defined &$call ) { + return undef unless $args{'error'}; + require Carp; Carp::croak( $args{'error'} ); + } + @_ = @{ $args{'arguments'} }; + goto &$call; +} + + +# helper functions +sub debug { print @_, "\n" if $OPT{debug}; 1 } +sub error { $RT::Logger->error(_(@_)); 1 } +sub warning { $RT::Logger->warn(_(@_)); 1 } + +=head1 NAME + +rt-fulltext-indexer - Indexer for full text search + +=head1 DESCRIPTION + +This is a helper script to keep full text indexes in sync with data. +Read F for complete details on how and when +to run it. + +=head1 AUTHOR + +Ruslan Zakirov Eruz@bestpractical.comE, +Alex Vandiver Ealexmv@bestpractical.comE + +=cut + +__DATA__