X-Git-Url: http://git.freeside.biz/gitweb/?a=blobdiff_plain;f=rt%2Fsbin%2Frt-fulltext-indexer;fp=rt%2Fsbin%2Frt-fulltext-indexer;h=0000000000000000000000000000000000000000;hb=b2101517513739ce744ecffad184a8a358c2d6c7;hp=cdcc78e157a9e2115170a8bc7040c3f84f3b3638;hpb=5b3efac57771fbc37874a3dd39d3df835cdd6133;p=freeside.git diff --git a/rt/sbin/rt-fulltext-indexer b/rt/sbin/rt-fulltext-indexer deleted file mode 100755 index cdcc78e15..000000000 --- a/rt/sbin/rt-fulltext-indexer +++ /dev/null @@ -1,479 +0,0 @@ -#!/usr/bin/perl -# BEGIN BPS TAGGED BLOCK {{{ -# -# COPYRIGHT: -# -# This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC -# -# -# (Except where explicitly superseded by other copyright notices) -# -# -# LICENSE: -# -# This work is made available to you under the terms of Version 2 of -# the GNU General Public License. A copy of that license should have -# been provided with this software, but in any event can be snarfed -# from www.gnu.org. -# -# This work is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301 or visit their web page on the internet at -# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html. -# -# -# CONTRIBUTION SUBMISSION POLICY: -# -# (The following paragraph is not intended to limit the rights granted -# to you to modify and distribute this software under the terms of -# the GNU General Public License and is only of importance to you if -# you choose to contribute your changes and enhancements to the -# community by submitting them to Best Practical Solutions, LLC.) -# -# By intentionally submitting any modifications, corrections or -# derivatives to this work, or any other work intended for use with -# Request Tracker, to Best Practical Solutions, LLC, you confirm that -# you are the copyright holder for those contributions and you grant -# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable, -# royalty-free, perpetual, license to use, copy, create derivative -# works based on those contributions, and sublicense and distribute -# those contributions and any derivatives thereof. -# -# END BPS TAGGED BLOCK }}} -use strict; -use warnings; -no warnings 'once'; - -# fix lib paths, some may be relative -BEGIN { - require File::Spec; - my @libs = ("/opt/rt3/lib", "/opt/rt3/local/lib"); - my $bin_path; - - for my $lib (@libs) { - unless ( File::Spec->file_name_is_absolute($lib) ) { - unless ($bin_path) { - if ( File::Spec->file_name_is_absolute(__FILE__) ) { - $bin_path = ( File::Spec->splitpath(__FILE__) )[1]; - } - else { - require FindBin; - no warnings "once"; - $bin_path = $FindBin::Bin; - } - } - $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib ); - } - unshift @INC, $lib; - } -} - -BEGIN { - use RT; - RT::LoadConfig(); - RT::Init(); -}; -use RT::Interface::CLI (); - -my %OPT = ( - help => 0, - debug => 0, - quiet => 0, -); -my @OPT_LIST = qw(help|h! debug! quiet); - -my $db_type = RT->Config->Get('DatabaseType'); -if ( $db_type eq 'Pg' ) { - %OPT = ( - %OPT, - limit => 0, - all => 0, - ); - push @OPT_LIST, 'limit=i', 'all!'; -} -elsif ( $db_type eq 'mysql' ) { - %OPT = ( - %OPT, - limit => 0, - all => 0, - xmlpipe2 => 0, - ); - push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!'; -} -elsif ( $db_type eq 'Oracle' ) { - %OPT = ( - %OPT, - memory => '2M', - ); - push @OPT_LIST, qw(memory=s); -} - -use Getopt::Long qw(GetOptions); -GetOptions( \%OPT, @OPT_LIST ); - -if ( $OPT{'help'} ) { - RT::Interface::CLI->ShowHelp( - Sections => 'NAME|DESCRIPTION|'. uc($db_type), - ); -} - -use Fcntl ':flock'; -if ( !flock main::DATA, LOCK_EX | LOCK_NB ) { - if ( $OPT{quiet} ) { - RT::Logger->info("$0 is already running; aborting silently, as requested"); - exit; - } - else { - print STDERR "$0 is already running\n"; - exit 1; - } -} - -my $fts_config = RT->Config->Get('FullTextSearch') || {}; -unless ( $fts_config->{'Enable'} ) { - print STDERR <{'Indexed'} ) { - print STDERR <{'IndexName'} || 'rt_fts_index'; - $RT::Handle->dbh->do( - "begin ctx_ddl.sync_index(?, ?); end;", undef, - $index, $OPT{'memory'} - ); - exit; -} elsif ( $db_type eq 'mysql' ) { - unless ($OPT{'xmlpipe2'}) { - print STDERR <Limit( - FIELD => 'id', - OPERATOR => '>', - VALUE => last_indexed($type) - ); - $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' ); - $attachments->RowsPerPage( $OPT{'limit'} || 100 ); - - my $found = 0; - while ( my $a = $attachments->Next ) { - next if filter( $type, $a ); - debug("Found attachment #". $a->id ); - my $txt = extract($type, $a) or next; - $found++; - process( $type, $a, $txt ); - debug("Processed attachment #". $a->id ); - } - finalize( $type, $attachments ) if $found; - clean( $type ); - goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100) -} - -sub attachments { - my $type = shift; - my $res = RT::Attachments->new( RT->SystemUser ); - my $txn_alias = $res->Join( - ALIAS1 => 'main', - FIELD1 => 'TransactionId', - TABLE2 => 'Transactions', - FIELD2 => 'id', - ); - $res->Limit( - ALIAS => $txn_alias, - FIELD => 'ObjectType', - VALUE => 'RT::Ticket', - ); - my $ticket_alias = $res->Join( - ALIAS1 => $txn_alias, - FIELD1 => 'ObjectId', - TABLE2 => 'Tickets', - FIELD2 => 'id', - ); - $res->Limit( - ALIAS => $ticket_alias, - FIELD => 'Status', - OPERATOR => '!=', - VALUE => 'deleted' - ); - - # On newer DBIx::SearchBuilder's, indicate that making the query DISTINCT - # is unnecessary because the joins won't produce duplicates. This - # drastically improves performance when fetching attachments. - $res->{joins_are_distinct} = 1; - - return goto_specific( - suffix => $type, - error => "Don't know how to find $type attachments", - arguments => [$res], - ); -} - -sub last_indexed { - my ($type) = (@_); - return goto_specific( - suffix => $db_type, - error => "Don't know how to find last indexed $type attachment for $db_type DB", - arguments => \@_, - ); -} - -sub filter { - my $type = shift; - return goto_specific( - suffix => $type, - arguments => \@_, - ); -} - -sub extract { - my $type = shift; - return goto_specific( - suffix => $type, - error => "No way to convert $type attachment into text", - arguments => \@_, - ); -} - -sub process { - return goto_specific( - suffix => $db_type, - error => "No processer for $db_type DB", - arguments => \@_, - ); -} - -sub finalize { - return goto_specific( - suffix => $db_type, - arguments => \@_, - ); -} - -sub clean { - return goto_specific( - suffix => $db_type, - arguments => \@_, - ); -} - -{ -sub last_indexed_mysql { - my $type = shift; - my $attr = $RT::System->FirstAttribute('LastIndexedAttachments'); - return 0 unless $attr; - return 0 unless exists $attr->{ $type }; - return $attr->{ $type } || 0; -} - -sub process_mysql { - my ($type, $attachment, $text) = (@_); - - my $doc = sphinx_template(); - - my $element = $doc->createElement('sphinx:document'); - $element->setAttribute( id => $attachment->id ); - $element->appendTextChild( content => $$text ); - - $doc->documentElement->appendChild( $element ); -} - -my $doc = undef; -sub sphinx_template { - return $doc if $doc; - - require XML::LibXML; - $doc = XML::LibXML::Document->new('1.0', 'UTF-8'); - my $root = $doc->createElement('sphinx:docset'); - $doc->setDocumentElement( $root ); - - my $schema = $doc->createElement('sphinx:schema'); - $root->appendChild( $schema ); - foreach ( qw(content) ) { - my $field = $doc->createElement('sphinx:field'); - $field->setAttribute( name => $_ ); - $schema->appendChild( $field ); - } - - return $doc; -} - -sub finalize_mysql { - my ($type, $attachments) = @_; - sphinx_template()->toFH(*STDOUT, 1); -} - -sub clean_mysql { - $doc = undef; -} - -} - -sub last_indexed_pg { - my $type = shift; - my $attachments = attachments( $type ); - my $alias = 'main'; - if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) { - $alias = $attachments->Join( - TYPE => 'left', - FIELD1 => 'id', - TABLE2 => $fts_config->{'Table'}, - FIELD2 => 'id', - ); - } - $attachments->Limit( - ALIAS => $alias, - FIELD => $fts_config->{'Column'}, - OPERATOR => 'IS NOT', - VALUE => 'NULL', - ); - $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' ); - $attachments->RowsPerPage( 1 ); - my $res = $attachments->First; - return 0 unless $res; - return $res->id; -} - -sub process_pg { - my ($type, $attachment, $text) = (@_); - - my $dbh = $RT::Handle->dbh; - my $table = $fts_config->{'Table'}; - my $column = $fts_config->{'Column'}; - - my $query; - if ( $table ) { - if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) { - $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?"; - } else { - $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)"; - } - } else { - $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?"; - } - - my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) }; - unless ( $status ) { - if ( $dbh->err == 7 && $dbh->state eq '54000' ) { - warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr; - } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) { - warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr; - } else { - die "error: ". $dbh->errstr; - } - - # Insert an empty tsvector, so we count this row as "indexed" - # for purposes of knowing where to pick up - eval { $dbh->do( $query, undef, "", $attachment->id ) } - or die "Failed to insert empty tsvector: " . $dbh->errstr; - } -} - -sub attachments_text { - my $res = shift; - $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' ); - return $res; -} - -sub extract_text { - my $attachment = shift; - my $text = $attachment->Content; - return undef unless defined $text && length($text); - return \$text; -} - -sub attachments_html { - my $res = shift; - $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' ); - return $res; -} - -sub filter_html { - my $attachment = shift; - if ( my $parent = $attachment->ParentObj ) { -# skip html parts that are alternatives - return 1 if $parent->id - && $parent->ContentType eq 'mulitpart/alternative'; - } - return 0; -} - -sub extract_html { - my $attachment = shift; - my $text = $attachment->Content; - return undef unless defined $text && length($text); -# TODO: html -> text - return \$text; -} - -sub goto_specific { - my %args = (@_); - - my $func = (caller(1))[3]; - $func =~ s/.*:://; - my $call = $func ."_". lc $args{'suffix'}; - unless ( defined &$call ) { - return undef unless $args{'error'}; - require Carp; Carp::croak( $args{'error'} ); - } - @_ = @{ $args{'arguments'} }; - goto &$call; -} - - -# helper functions -sub debug { print @_, "\n" if $OPT{debug}; 1 } -sub error { $RT::Logger->error(_(@_)); 1 } -sub warning { $RT::Logger->warn(_(@_)); 1 } - -=head1 NAME - -rt-fulltext-indexer - Indexer for full text search - -=head1 DESCRIPTION - -This is a helper script to keep full text indexes in sync with data. -Read F for complete details on how and when -to run it. - -=head1 AUTHOR - -Ruslan Zakirov Eruz@bestpractical.comE, -Alex Vandiver Ealexmv@bestpractical.comE - -=cut - -__DATA__