summaryrefslogtreecommitdiff
path: root/rt/sbin/rt-fulltext-indexer.in
diff options
context:
space:
mode:
Diffstat (limited to 'rt/sbin/rt-fulltext-indexer.in')
-rw-r--r--rt/sbin/rt-fulltext-indexer.in453
1 files changed, 453 insertions, 0 deletions
diff --git a/rt/sbin/rt-fulltext-indexer.in b/rt/sbin/rt-fulltext-indexer.in
new file mode 100644
index 0000000..7e31cac
--- /dev/null
+++ b/rt/sbin/rt-fulltext-indexer.in
@@ -0,0 +1,453 @@
+#!@PERL@
+# BEGIN BPS TAGGED BLOCK {{{
+#
+# COPYRIGHT:
+#
+# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC
+# <sales@bestpractical.com>
+#
+# (Except where explicitly superseded by other copyright notices)
+#
+#
+# LICENSE:
+#
+# This work is made available to you under the terms of Version 2 of
+# the GNU General Public License. A copy of that license should have
+# been provided with this software, but in any event can be snarfed
+# from www.gnu.org.
+#
+# This work is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301 or visit their web page on the internet at
+# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
+#
+#
+# CONTRIBUTION SUBMISSION POLICY:
+#
+# (The following paragraph is not intended to limit the rights granted
+# to you to modify and distribute this software under the terms of
+# the GNU General Public License and is only of importance to you if
+# you choose to contribute your changes and enhancements to the
+# community by submitting them to Best Practical Solutions, LLC.)
+#
+# By intentionally submitting any modifications, corrections or
+# derivatives to this work, or any other work intended for use with
+# Request Tracker, to Best Practical Solutions, LLC, you confirm that
+# you are the copyright holder for those contributions and you grant
+# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
+# royalty-free, perpetual, license to use, copy, create derivative
+# works based on those contributions, and sublicense and distribute
+# those contributions and any derivatives thereof.
+#
+# END BPS TAGGED BLOCK }}}
+use strict;
+use warnings;
+no warnings 'once';
+
+# fix lib paths, some may be relative
+BEGIN {
+ require File::Spec;
+ my @libs = ("@RT_LIB_PATH@", "@LOCAL_LIB_PATH@");
+ my $bin_path;
+
+ for my $lib (@libs) {
+ unless ( File::Spec->file_name_is_absolute($lib) ) {
+ unless ($bin_path) {
+ if ( File::Spec->file_name_is_absolute(__FILE__) ) {
+ $bin_path = ( File::Spec->splitpath(__FILE__) )[1];
+ }
+ else {
+ require FindBin;
+ no warnings "once";
+ $bin_path = $FindBin::Bin;
+ }
+ }
+ $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
+ }
+ unshift @INC, $lib;
+ }
+}
+
+BEGIN {
+ use RT;
+ RT::LoadConfig();
+ RT::Init();
+};
+use RT::Interface::CLI ();
+
+my %OPT = (
+ help => 0,
+ debug => 0,
+);
+my @OPT_LIST = qw(help|h! debug!);
+
+my $db_type = RT->Config->Get('DatabaseType');
+if ( $db_type eq 'Pg' ) {
+ %OPT = (
+ %OPT,
+ limit => 0,
+ all => 0,
+ );
+ push @OPT_LIST, 'limit=i', 'all!';
+}
+elsif ( $db_type eq 'mysql' ) {
+ %OPT = (
+ %OPT,
+ limit => 0,
+ all => 0,
+ xmlpipe2 => 0,
+ );
+ push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
+}
+elsif ( $db_type eq 'Oracle' ) {
+ %OPT = (
+ %OPT,
+ memory => '2M',
+ );
+ push @OPT_LIST, qw(memory=s);
+}
+
+use Getopt::Long qw(GetOptions);
+GetOptions( \%OPT, @OPT_LIST );
+
+if ( $OPT{'help'} ) {
+ RT::Interface::CLI->ShowHelp(
+ Sections => 'NAME|DESCRIPTION|'. uc($db_type),
+ );
+}
+
+my $fts_config = RT->Config->Get('FullTextSearch') || {};
+unless ( $fts_config->{'Enable'} ) {
+ print STDERR <<EOT;
+
+Full text search is disabled in your RT configuration. Run
+@RT_SBIN_PATH_R@/rt-setup-fulltext-index to configure and enable it.
+
+EOT
+ exit 1;
+}
+unless ( $fts_config->{'Indexed'} ) {
+ print STDERR <<EOT;
+
+Full text search is enabled in your RT configuration, but not with any
+full-text database indexing -- hence this tool is not required. Read
+the documentation for %FullTextSearch in your RT_Config for more details.
+
+EOT
+ exit 1;
+}
+
+if ( $db_type eq 'Oracle' ) {
+ my $index = $fts_config->{'IndexName'} || 'rt_fts_index';
+ $RT::Handle->dbh->do(
+ "begin ctx_ddl.sync_index(?, ?); end;", undef,
+ $index, $OPT{'memory'}
+ );
+ exit;
+} elsif ( $db_type eq 'mysql' ) {
+ unless ($OPT{'xmlpipe2'}) {
+ print STDERR <<EOT;
+
+Updates to the external Sphinx index are done via running the sphinx
+`indexer` tool:
+
+ indexer rt
+
+EOT
+ exit 1;
+ }
+}
+
+my @types = qw(text html);
+foreach my $type ( @types ) {
+ REDO:
+ my $attachments = attachments($type);
+ $attachments->Limit(
+ FIELD => 'id',
+ OPERATOR => '>',
+ VALUE => last_indexed($type)
+ );
+ $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
+ $attachments->RowsPerPage( $OPT{'limit'} || 100 );
+
+ my $found = 0;
+ while ( my $a = $attachments->Next ) {
+ next if filter( $type, $a );
+ debug("Found attachment #". $a->id );
+ my $txt = extract($type, $a) or next;
+ $found++;
+ process( $type, $a, $txt );
+ debug("Processed attachment #". $a->id );
+ }
+ finalize( $type, $attachments ) if $found;
+ clean( $type );
+ goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
+}
+
+sub attachments {
+ my $type = shift;
+ my $res = RT::Attachments->new( RT->SystemUser );
+ my $txn_alias = $res->Join(
+ ALIAS1 => 'main',
+ FIELD1 => 'TransactionId',
+ TABLE2 => 'Transactions',
+ FIELD2 => 'id',
+ );
+ $res->Limit(
+ ALIAS => $txn_alias,
+ FIELD => 'ObjectType',
+ VALUE => 'RT::Ticket',
+ );
+ my $ticket_alias = $res->Join(
+ ALIAS1 => $txn_alias,
+ FIELD1 => 'ObjectId',
+ TABLE2 => 'Tickets',
+ FIELD2 => 'id',
+ );
+ $res->Limit(
+ ALIAS => $ticket_alias,
+ FIELD => 'Status',
+ OPERATOR => '!=',
+ VALUE => 'deleted'
+ );
+
+ return goto_specific(
+ suffix => $type,
+ error => "Don't know how to find $type attachments",
+ arguments => [$res],
+ );
+}
+
+sub last_indexed {
+ my ($type) = (@_);
+ return goto_specific(
+ suffix => $db_type,
+ error => "Don't know how to find last indexed $type attachment for $db_type DB",
+ arguments => \@_,
+ );
+}
+
+sub filter {
+ my $type = shift;
+ return goto_specific(
+ suffix => $type,
+ arguments => \@_,
+ );
+}
+
+sub extract {
+ my $type = shift;
+ return goto_specific(
+ suffix => $type,
+ error => "No way to convert $type attachment into text",
+ arguments => \@_,
+ );
+}
+
+sub process {
+ return goto_specific(
+ suffix => $db_type,
+ error => "No processer for $db_type DB",
+ arguments => \@_,
+ );
+}
+
+sub finalize {
+ return goto_specific(
+ suffix => $db_type,
+ arguments => \@_,
+ );
+}
+
+sub clean {
+ return goto_specific(
+ suffix => $db_type,
+ arguments => \@_,
+ );
+}
+
+{
+sub last_indexed_mysql {
+ my $type = shift;
+ my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
+ return 0 unless $attr;
+ return 0 unless exists $attr->{ $type };
+ return $attr->{ $type } || 0;
+}
+
+sub process_mysql {
+ my ($type, $attachment, $text) = (@_);
+
+ my $doc = sphinx_template();
+
+ my $element = $doc->createElement('sphinx:document');
+ $element->setAttribute( id => $attachment->id );
+ $element->appendTextChild( content => $$text );
+
+ $doc->documentElement->appendChild( $element );
+}
+
+my $doc = undef;
+sub sphinx_template {
+ return $doc if $doc;
+
+ require XML::LibXML;
+ $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
+ my $root = $doc->createElement('sphinx:docset');
+ $doc->setDocumentElement( $root );
+
+ my $schema = $doc->createElement('sphinx:schema');
+ $root->appendChild( $schema );
+ foreach ( qw(content) ) {
+ my $field = $doc->createElement('sphinx:field');
+ $field->setAttribute( name => $_ );
+ $schema->appendChild( $field );
+ }
+
+ return $doc;
+}
+
+sub finalize_mysql {
+ my ($type, $attachments) = @_;
+ sphinx_template()->toFH(*STDOUT, 1);
+}
+
+sub clean_mysql {
+ $doc = undef;
+}
+
+}
+
+sub last_indexed_pg {
+ my $type = shift;
+ my $attachments = attachments( $type );
+ my $alias = 'main';
+ if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
+ $alias = $attachments->Join(
+ TYPE => 'left',
+ FIELD1 => 'id',
+ TABLE2 => $fts_config->{'Table'},
+ FIELD2 => 'id',
+ );
+ }
+ $attachments->Limit(
+ ALIAS => $alias,
+ FIELD => $fts_config->{'Column'},
+ OPERATOR => 'IS NOT',
+ VALUE => 'NULL',
+ );
+ $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
+ $attachments->RowsPerPage( 1 );
+ my $res = $attachments->First;
+ return 0 unless $res;
+ return $res->id;
+}
+
+sub process_pg {
+ my ($type, $attachment, $text) = (@_);
+
+ my $dbh = $RT::Handle->dbh;
+ my $table = $fts_config->{'Table'};
+ my $column = $fts_config->{'Column'};
+
+ my $query;
+ if ( $table ) {
+ if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
+ $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
+ } else {
+ $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
+ }
+ } else {
+ $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
+ }
+
+ my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
+ unless ( $status ) {
+ if ($dbh->errstr =~ /string is too long for tsvector/) {
+ warn "Attachment @{[$attachment->id]} not indexed, as it contains too many unique words to be indexed";
+ } else {
+ die "error: ". $dbh->errstr;
+ }
+ }
+}
+
+sub attachments_text {
+ my $res = shift;
+ $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
+ return $res;
+}
+
+sub extract_text {
+ my $attachment = shift;
+ my $text = $attachment->Content;
+ return undef unless defined $text && length($text);
+ return \$text;
+}
+
+sub attachments_html {
+ my $res = shift;
+ $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
+ return $res;
+}
+
+sub filter_html {
+ my $attachment = shift;
+ if ( my $parent = $attachment->ParentObj ) {
+# skip html parts that are alternatives
+ return 1 if $parent->id
+ && $parent->ContentType eq 'mulitpart/alternative';
+ }
+ return 0;
+}
+
+sub extract_html {
+ my $attachment = shift;
+ my $text = $attachment->Content;
+ return undef unless defined $text && length($text);
+# TODO: html -> text
+ return \$text;
+}
+
+sub goto_specific {
+ my %args = (@_);
+
+ my $func = (caller(1))[3];
+ $func =~ s/.*:://;
+ my $call = $func ."_". lc $args{'suffix'};
+ unless ( defined &$call ) {
+ return undef unless $args{'error'};
+ require Carp; Carp::croak( $args{'error'} );
+ }
+ @_ = @{ $args{'arguments'} };
+ goto &$call;
+}
+
+
+# helper functions
+sub debug { print @_, "\n" if $OPT{debug}; 1 }
+sub error { $RT::Logger->error(_(@_)); 1 }
+sub warning { $RT::Logger->warn(_(@_)); 1 }
+
+=head1 NAME
+
+rt-fulltext-indexer - Indexer for full text search
+
+=head1 DESCRIPTION
+
+This is a helper script to keep full text indexes in sync with data.
+Read F<docs/full_text_indexing.pod> for complete details on how and when
+to run it.
+
+=head1 AUTHOR
+
+Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
+Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>
+
+=cut
+