diff options
author | Ivan Kohler <ivan@freeside.biz> | 2012-04-24 11:35:56 -0700 |
---|---|---|
committer | Ivan Kohler <ivan@freeside.biz> | 2012-04-24 11:35:56 -0700 |
commit | 6587f6ba7d047ddc1686c080090afe7d53365bd4 (patch) | |
tree | ec77342668e8865aca669c9b4736e84e3077b523 /rt/sbin/rt-fulltext-indexer.in | |
parent | 47153aae5c2fc00316654e7277fccd45f72ff611 (diff) |
first pass RT4 merge, RT#13852
Diffstat (limited to 'rt/sbin/rt-fulltext-indexer.in')
-rw-r--r-- | rt/sbin/rt-fulltext-indexer.in | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/rt/sbin/rt-fulltext-indexer.in b/rt/sbin/rt-fulltext-indexer.in new file mode 100644 index 000000000..7e31cac84 --- /dev/null +++ b/rt/sbin/rt-fulltext-indexer.in @@ -0,0 +1,453 @@ +#!@PERL@ +# BEGIN BPS TAGGED BLOCK {{{ +# +# COPYRIGHT: +# +# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC +# <sales@bestpractical.com> +# +# (Except where explicitly superseded by other copyright notices) +# +# +# LICENSE: +# +# This work is made available to you under the terms of Version 2 of +# the GNU General Public License. A copy of that license should have +# been provided with this software, but in any event can be snarfed +# from www.gnu.org. +# +# This work is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301 or visit their web page on the internet at +# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html. +# +# +# CONTRIBUTION SUBMISSION POLICY: +# +# (The following paragraph is not intended to limit the rights granted +# to you to modify and distribute this software under the terms of +# the GNU General Public License and is only of importance to you if +# you choose to contribute your changes and enhancements to the +# community by submitting them to Best Practical Solutions, LLC.) +# +# By intentionally submitting any modifications, corrections or +# derivatives to this work, or any other work intended for use with +# Request Tracker, to Best Practical Solutions, LLC, you confirm that +# you are the copyright holder for those contributions and you grant +# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable, +# royalty-free, perpetual, license to use, copy, create derivative +# works based on those contributions, and sublicense and distribute +# those contributions and any derivatives thereof. +# +# END BPS TAGGED BLOCK }}} +use strict; +use warnings; +no warnings 'once'; + +# fix lib paths, some may be relative +BEGIN { + require File::Spec; + my @libs = ("@RT_LIB_PATH@", "@LOCAL_LIB_PATH@"); + my $bin_path; + + for my $lib (@libs) { + unless ( File::Spec->file_name_is_absolute($lib) ) { + unless ($bin_path) { + if ( File::Spec->file_name_is_absolute(__FILE__) ) { + $bin_path = ( File::Spec->splitpath(__FILE__) )[1]; + } + else { + require FindBin; + no warnings "once"; + $bin_path = $FindBin::Bin; + } + } + $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib ); + } + unshift @INC, $lib; + } +} + +BEGIN { + use RT; + RT::LoadConfig(); + RT::Init(); +}; +use RT::Interface::CLI (); + +my %OPT = ( + help => 0, + debug => 0, +); +my @OPT_LIST = qw(help|h! debug!); + +my $db_type = RT->Config->Get('DatabaseType'); +if ( $db_type eq 'Pg' ) { + %OPT = ( + %OPT, + limit => 0, + all => 0, + ); + push @OPT_LIST, 'limit=i', 'all!'; +} +elsif ( $db_type eq 'mysql' ) { + %OPT = ( + %OPT, + limit => 0, + all => 0, + xmlpipe2 => 0, + ); + push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!'; +} +elsif ( $db_type eq 'Oracle' ) { + %OPT = ( + %OPT, + memory => '2M', + ); + push @OPT_LIST, qw(memory=s); +} + +use Getopt::Long qw(GetOptions); +GetOptions( \%OPT, @OPT_LIST ); + +if ( $OPT{'help'} ) { + RT::Interface::CLI->ShowHelp( + Sections => 'NAME|DESCRIPTION|'. uc($db_type), + ); +} + +my $fts_config = RT->Config->Get('FullTextSearch') || {}; +unless ( $fts_config->{'Enable'} ) { + print STDERR <<EOT; + +Full text search is disabled in your RT configuration. Run +@RT_SBIN_PATH_R@/rt-setup-fulltext-index to configure and enable it. + +EOT + exit 1; +} +unless ( $fts_config->{'Indexed'} ) { + print STDERR <<EOT; + +Full text search is enabled in your RT configuration, but not with any +full-text database indexing -- hence this tool is not required. Read +the documentation for %FullTextSearch in your RT_Config for more details. + +EOT + exit 1; +} + +if ( $db_type eq 'Oracle' ) { + my $index = $fts_config->{'IndexName'} || 'rt_fts_index'; + $RT::Handle->dbh->do( + "begin ctx_ddl.sync_index(?, ?); end;", undef, + $index, $OPT{'memory'} + ); + exit; +} elsif ( $db_type eq 'mysql' ) { + unless ($OPT{'xmlpipe2'}) { + print STDERR <<EOT; + +Updates to the external Sphinx index are done via running the sphinx +`indexer` tool: + + indexer rt + +EOT + exit 1; + } +} + +my @types = qw(text html); +foreach my $type ( @types ) { + REDO: + my $attachments = attachments($type); + $attachments->Limit( + FIELD => 'id', + OPERATOR => '>', + VALUE => last_indexed($type) + ); + $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' ); + $attachments->RowsPerPage( $OPT{'limit'} || 100 ); + + my $found = 0; + while ( my $a = $attachments->Next ) { + next if filter( $type, $a ); + debug("Found attachment #". $a->id ); + my $txt = extract($type, $a) or next; + $found++; + process( $type, $a, $txt ); + debug("Processed attachment #". $a->id ); + } + finalize( $type, $attachments ) if $found; + clean( $type ); + goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100) +} + +sub attachments { + my $type = shift; + my $res = RT::Attachments->new( RT->SystemUser ); + my $txn_alias = $res->Join( + ALIAS1 => 'main', + FIELD1 => 'TransactionId', + TABLE2 => 'Transactions', + FIELD2 => 'id', + ); + $res->Limit( + ALIAS => $txn_alias, + FIELD => 'ObjectType', + VALUE => 'RT::Ticket', + ); + my $ticket_alias = $res->Join( + ALIAS1 => $txn_alias, + FIELD1 => 'ObjectId', + TABLE2 => 'Tickets', + FIELD2 => 'id', + ); + $res->Limit( + ALIAS => $ticket_alias, + FIELD => 'Status', + OPERATOR => '!=', + VALUE => 'deleted' + ); + + return goto_specific( + suffix => $type, + error => "Don't know how to find $type attachments", + arguments => [$res], + ); +} + +sub last_indexed { + my ($type) = (@_); + return goto_specific( + suffix => $db_type, + error => "Don't know how to find last indexed $type attachment for $db_type DB", + arguments => \@_, + ); +} + +sub filter { + my $type = shift; + return goto_specific( + suffix => $type, + arguments => \@_, + ); +} + +sub extract { + my $type = shift; + return goto_specific( + suffix => $type, + error => "No way to convert $type attachment into text", + arguments => \@_, + ); +} + +sub process { + return goto_specific( + suffix => $db_type, + error => "No processer for $db_type DB", + arguments => \@_, + ); +} + +sub finalize { + return goto_specific( + suffix => $db_type, + arguments => \@_, + ); +} + +sub clean { + return goto_specific( + suffix => $db_type, + arguments => \@_, + ); +} + +{ +sub last_indexed_mysql { + my $type = shift; + my $attr = $RT::System->FirstAttribute('LastIndexedAttachments'); + return 0 unless $attr; + return 0 unless exists $attr->{ $type }; + return $attr->{ $type } || 0; +} + +sub process_mysql { + my ($type, $attachment, $text) = (@_); + + my $doc = sphinx_template(); + + my $element = $doc->createElement('sphinx:document'); + $element->setAttribute( id => $attachment->id ); + $element->appendTextChild( content => $$text ); + + $doc->documentElement->appendChild( $element ); +} + +my $doc = undef; +sub sphinx_template { + return $doc if $doc; + + require XML::LibXML; + $doc = XML::LibXML::Document->new('1.0', 'UTF-8'); + my $root = $doc->createElement('sphinx:docset'); + $doc->setDocumentElement( $root ); + + my $schema = $doc->createElement('sphinx:schema'); + $root->appendChild( $schema ); + foreach ( qw(content) ) { + my $field = $doc->createElement('sphinx:field'); + $field->setAttribute( name => $_ ); + $schema->appendChild( $field ); + } + + return $doc; +} + +sub finalize_mysql { + my ($type, $attachments) = @_; + sphinx_template()->toFH(*STDOUT, 1); +} + +sub clean_mysql { + $doc = undef; +} + +} + +sub last_indexed_pg { + my $type = shift; + my $attachments = attachments( $type ); + my $alias = 'main'; + if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) { + $alias = $attachments->Join( + TYPE => 'left', + FIELD1 => 'id', + TABLE2 => $fts_config->{'Table'}, + FIELD2 => 'id', + ); + } + $attachments->Limit( + ALIAS => $alias, + FIELD => $fts_config->{'Column'}, + OPERATOR => 'IS NOT', + VALUE => 'NULL', + ); + $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' ); + $attachments->RowsPerPage( 1 ); + my $res = $attachments->First; + return 0 unless $res; + return $res->id; +} + +sub process_pg { + my ($type, $attachment, $text) = (@_); + + my $dbh = $RT::Handle->dbh; + my $table = $fts_config->{'Table'}; + my $column = $fts_config->{'Column'}; + + my $query; + if ( $table ) { + if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) { + $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"; + } else { + $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)"; + } + } else { + $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?"; + } + + my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) }; + unless ( $status ) { + if ($dbh->errstr =~ /string is too long for tsvector/) { + warn "Attachment @{[$attachment->id]} not indexed, as it contains too many unique words to be indexed"; + } else { + die "error: ". $dbh->errstr; + } + } +} + +sub attachments_text { + my $res = shift; + $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' ); + return $res; +} + +sub extract_text { + my $attachment = shift; + my $text = $attachment->Content; + return undef unless defined $text && length($text); + return \$text; +} + +sub attachments_html { + my $res = shift; + $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' ); + return $res; +} + +sub filter_html { + my $attachment = shift; + if ( my $parent = $attachment->ParentObj ) { +# skip html parts that are alternatives + return 1 if $parent->id + && $parent->ContentType eq 'mulitpart/alternative'; + } + return 0; +} + +sub extract_html { + my $attachment = shift; + my $text = $attachment->Content; + return undef unless defined $text && length($text); +# TODO: html -> text + return \$text; +} + +sub goto_specific { + my %args = (@_); + + my $func = (caller(1))[3]; + $func =~ s/.*:://; + my $call = $func ."_". lc $args{'suffix'}; + unless ( defined &$call ) { + return undef unless $args{'error'}; + require Carp; Carp::croak( $args{'error'} ); + } + @_ = @{ $args{'arguments'} }; + goto &$call; +} + + +# helper functions +sub debug { print @_, "\n" if $OPT{debug}; 1 } +sub error { $RT::Logger->error(_(@_)); 1 } +sub warning { $RT::Logger->warn(_(@_)); 1 } + +=head1 NAME + +rt-fulltext-indexer - Indexer for full text search + +=head1 DESCRIPTION + +This is a helper script to keep full text indexes in sync with data. +Read F<docs/full_text_indexing.pod> for complete details on how and when +to run it. + +=head1 AUTHOR + +Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>, +Alex Vandiver E<lt>alexmv@bestpractical.comE<gt> + +=cut + |