rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2011 Best Practical Solutions, LLC
   6 #                                          <sales@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48
  49 =head1 NAME
  50
  51 RT::I18N - a base class for localization of RT
  52
  53 =cut
  54
  55 package RT::I18N;
  56
  57 use strict;
  58 use warnings;
  59
  60 use Locale::Maketext 1.04;
  61 use Locale::Maketext::Lexicon 0.25;
  62 use base ('Locale::Maketext::Fuzzy');
  63
  64 use Encode;
  65 use MIME::Entity;
  66 use MIME::Head;
  67
  68 # I decree that this project's first language is English.
  69
  70 our %Lexicon = (
  71    'TEST_STRING' => 'Concrete Mixer',
  72
  73     '__Content-Type' => 'text/plain; charset=utf-8',
  74
  75   '_AUTO' => 1,
  76   # That means that lookup failures can't happen -- if we get as far
  77   #  as looking for something in this lexicon, and we don't find it,
  78   #  then automagically set $Lexicon{$key} = $key, before possibly
  79   #  compiling it.
  80
  81   # The exception is keys that start with "_" -- they aren't auto-makeable.
  82
  83 );
  84 # End of lexicon.
  85
  86 =head2 Init
  87
  88 Initializes the lexicons used for localization.
  89
  90
  91 =cut
  92
  93 sub Init {
  94     require File::Glob;
  95
  96     my @lang = RT->Config->Get('LexiconLanguages');
  97     @lang = ('*') unless @lang;
  98
  99     # load default functions
 100     require substr(__FILE__, 0, -3) . '/i_default.pm';
 101
 102     # Load language-specific functions
 103     foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
 104         unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
 105             warn("$file is tainted. not loading");
 106             next;
 107         }
 108         $file = $1;
 109
 110         my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
 111         next unless grep $_ eq '*' || $_ eq $lang, @lang;
 112         require $file;
 113     }
 114
 115     my %import;
 116     foreach my $l ( @lang ) {
 117         $import{$l} = [
 118             Gettext => (substr(__FILE__, 0, -3) . "/$l.po"),
 119             Gettext => "$RT::LocalLexiconPath/*/$l.po",
 120             Gettext => "$RT::LocalLexiconPath/$l.po",
 121         ];
 122         push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
 123     }
 124
 125     # Acquire all .po files and iterate them into lexicons
 126     Locale::Maketext::Lexicon->import({ _decode => 1, %import });
 127
 128     return 1;
 129 }
 130
 131 sub LoadLexicons {
 132
 133     no strict 'refs';
 134     foreach my $k (keys %{RT::I18N::} ) {
 135         next if $k eq 'main::';
 136         next unless index($k, '::', -2) >= 0;
 137         next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
 138
 139         my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
 140         # run fetch to force load
 141         my $tmp = $lex->{'foo'};
 142         # XXX: untie may fail with "untie attempted
 143         # while 1 inner references still exist"
 144         # TODO: untie that has to lower fetch impact
 145         # untie %$lex if tied %$lex;
 146     }
 147 }
 148
 149 =head2 encoding
 150
 151 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 152 If it can't find anything, it returns 'ISO-8859-1'
 153
 154
 155
 156 =cut
 157
 158
 159 sub encoding { 'utf-8' }
 160
 161 # {{{ SetMIMEEntityToUTF8
 162
 163 =head2 SetMIMEEntityToUTF8 $entity
 164
 165 An utility function which will try to convert entity body into utf8.
 166 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 167
 168 =cut
 169
 170 sub SetMIMEEntityToUTF8 {
 171     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 172 }
 173
 174 # }}}
 175
 176 # {{{ IsTextualContentType
 177
 178 =head2 IsTextualContentType $type
 179
 180 An utility function that determines whether $type is I<textual>, meaning
 181 that it can sensibly be converted to Unicode text.
 182
 183 Currently, it returns true iff $type matches this regular expression
 184 (case-insensitively):
 185
 186     ^(?:text/(?:plain|html)|message/rfc822)\b
 187
 188 # }}}
 189
 190 =cut
 191
 192 sub IsTextualContentType {
 193     my $type = shift;
 194     ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
 195 }
 196
 197 # {{{ SetMIMEEntityToEncoding
 198
=head2 SetMIMEEntityToEncoding $entity, $encoding, $preserve_words

A utility function which tries to convert an entity's body into the
specified charset encoding (encoded as octets, *not* unicode-strings).
It first recurses into all the parts of $entity, then handles $entity
itself:

=over 4

=item *

The header is converted with C<SetMIMEHeadToEncoding>.

=item *

Unless $preserve_words is true, MIME-word encoded attachment file names
(C<content-type.name> and C<content-disposition.filename>) are decoded.
Note that they are always decoded to UTF-8, whatever $encoding is.

=item *

The charset the entity arrived in is recorded in the
C<X-RT-Original-Encoding> header, if the entity declares a charset or has
a textual content type.

=item *

If the content type is textual (see C<IsTextualContentType>) and the
entity's charset differs from $encoding, the body is converted and
replaced, and the C<Content-Type> charset is updated.

=back

For the body, the following methods are tried in order:

=over 4

=item 1

convert the body from its own charset to $encoding,

=item 2

interpret the body as iso-8859-1 and convert it to $encoding,

=item 3

forcibly convert it from its own charset to $encoding, without strict
checking.

=back

This function doesn't return anything meaningful.

=cut

sub SetMIMEEntityToEncoding {
    my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );

    # do the same for parts first of all
    SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;

    # Nothing can be done if no charset could be determined at all.
    my $charset = _FindOrGuessCharset($entity) or return;
    # one and only normalization
    $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
    $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;

    # The header's charset is looked up separately (head-only mode), as it
    # may differ from the body's.
    SetMIMEHeadToEncoding(
        $entity->head,
        _FindOrGuessCharset($entity, 1) => $enc,
        $preserve_words
    );

    my $head = $entity->head;

    # convert at least MIME word encoded attachment filename
    foreach my $attr (qw(content-type.name content-disposition.filename)) {
        if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
            $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
        }
    }

    # If this is a textual entity, we'd need to preserve its original encoding
    $head->replace( "X-RT-Original-Encoding" => $charset )
        if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);

    # Bodies of non-textual entities are never converted.
    return unless IsTextualContentType($head->mime_type);

    my $body = $entity->bodyhandle;

    if ( $enc ne $charset && $body ) {
        # An empty (or otherwise false) body is left alone.
        my $string = $body->as_string or return;
        # NOTE:: see the comments at the end of the sub.
        Encode::_utf8_off($string);
        # Kept so that every fallback attempt starts from the original octets.
        my $orig_string = $string;

        # {{{ Convert the body
        # 1) strict conversion from the entity's own charset.  The debug
        # call sits inside the eval too, so a failure there is handled
        # like an encoding error.
        eval {
            $RT::Logger->debug( "Converting '$charset' to '$enc' for "
                  . $head->mime_type . " - "
                  . ( $head->get('subject') || 'Subjectless message' ) );
            Encode::from_to( $string, $charset => $enc, Encode::FB_CROAK );
        };

        if ($@) {
            # 2) strict conversion, treating the octets as iso-8859-1.
            $RT::Logger->error( "Encoding error: "
                  . $@
                  . " falling back to iso-8859-1 => $enc" );
            $string = $orig_string;
            eval {
                Encode::from_to(
                    $string,
                    'iso-8859-1' => $enc,
                    Encode::FB_CROAK
                );
            };
            if ($@) {
                # 3) last resort: convert from the entity's own charset
                # without strict checking.  This call is not wrapped in
                # eval, so an error raised here propagates to the caller.
                $RT::Logger->error( "Encoding error: "
                      . $@
                      . " forcing conversion to $charset => $enc" );
                $string = $orig_string;
                Encode::from_to( $string, $charset => $enc );
            }
        }

        # }}}

        my $new_body = MIME::Body::InCore->new($string);

        # set up the new entity
        $head->mime_attr( "content-type" => 'text/plain' )
          unless ( $head->mime_attr("content-type") );
        $head->mime_attr( "content-type.charset" => $enc );
        $entity->bodyhandle($new_body);
    }
}
 295
 296 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 297 #
 298 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 299 #
 300 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 301 # ... The data in $octets must be encoded as octets and not as
 302 # characters in Perl's internal format. ...
 303 #
  304 # If the UTF-8 flag is not turned off first, from_to will not convert
  305 # the string.
 306
 307 # }}}
 308
 309 # {{{ DecodeMIMEWordsToUTF8
 310
 311 =head2 DecodeMIMEWordsToUTF8 $raw
 312
 313 An utility method which mimics MIME::Words::decode_mimewords, but only
 314 limited functionality.  This function returns an utf-8 string.
 315
 316 It returns the decoded string, or the original string if it's not
 317 encoded.  Since the subroutine converts specified string into utf-8
 318 charset, it should not alter a subject written in English.
 319
 320 Why not use MIME::Words directly?  Because it fails in RT when I
 321 tried.  Maybe it's ok now.
 322
 323 =cut
 324
 325 sub DecodeMIMEWordsToUTF8 {
 326     my $str = shift;
 327     return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
 328 }
 329
 330 sub DecodeMIMEWordsToEncoding {
 331     my $str = shift;
 332     my $to_charset = shift;
 333     my $field = shift || '';
 334
 335     my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
 336     return ($str) unless (@list);
 337
 338     # add everything that hasn't matched to the end of the latest
 339     # string in array this happen when we have 'key="=?encoded?="; key="plain"'
 340     $list[-1] .= substr($str, pos $str);
 341
 342     $str = "";
 343     while (@list) {
 344         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 345             splice @list, 0, 5;
 346         $encoding = lc $encoding;
 347
 348         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 349
 350         if ( $encoding eq 'q' ) {
 351             use MIME::QuotedPrint;
 352             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 353             $enc_str = decode_qp($enc_str);
 354         } elsif ( $encoding eq 'b' ) {
 355             use MIME::Base64;
 356             $enc_str = decode_base64($enc_str);
 357         } else {
 358             $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
 359             ."only Q(uoted-printable) and B(ase64) are supported");
 360         }
 361
 362         # now we have got a decoded subject, try to convert into the encoding
 363         unless ( $charset eq $to_charset ) {
 364             my $orig_str = $enc_str;
 365             eval { Encode::from_to( $enc_str, $charset, $to_charset, Encode::FB_CROAK ) };
 366             if ($@) {
 367                 $enc_str = $orig_str;
 368                 $charset = _GuessCharset( $enc_str );
 369                 Encode::from_to( $enc_str, $charset, $to_charset );
 370             }
 371         }
 372
 373         # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 374         # We _should_ be preserving them encoded until after parsing is completed and
 375         # THEN undo the mime-encoding.
 376         #
 377         # This routine should be translating the existing mimeencoding to utf8 but leaving
 378         # things encoded.
 379         #
 380         # It's legal for headers to contain mime-encoded commas and semicolons which
 381         # should not be treated as address separators. (Encoding == quoting here)
 382         #
 383         # until this is fixed, we must escape any string containing a comma or semicolon
 384         # this is only a bandaid
 385
 386         # Some _other_ MUAs encode quotes _already_, and double quotes
 387         # confuse us a lot, so only quote it if it isn't quoted
 388         # already.
 389         $enc_str = qq{"$enc_str"}
 390             if $enc_str =~ /[,;]/
 391             and $enc_str !~ /^".*"$/
 392             and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
 393
 394         $str .= $prefix . $enc_str . $trailing;
 395     }
 396
 397     # We might have \n without trailing whitespace, which will result in
 398     # invalid headers.
 399     $str =~ s/\n//g;
 400
 401     return ($str)
 402 }
 403
 404 # }}}
 405
 406 # {{{ _FindOrGuessCharset
 407
 408 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 409
 410 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 411
 412 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 413
 414 =cut
 415
 416 sub _FindOrGuessCharset {
 417     my $entity = shift;
 418     my $head_only = shift;
 419     my $head = $entity->head;
 420
 421     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 422         return $charset;
 423     }
 424
 425     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 426         my $body = $entity->bodyhandle or return;
 427         return _GuessCharset( $body->as_string );
 428     }
 429     else {
 430         # potentially binary data -- don't guess the body
 431         return _GuessCharset( $head->as_string );
 432     }
 433 }
 434
 435 # }}}
 436
 437 # {{{ _GuessCharset
 438
 439 =head2 _GuessCharset STRING
 440
 441 use Encode::Guess to try to figure it out the string's encoding.
 442
 443 =cut
 444
 445 sub _GuessCharset {
 446     my $fallback = 'iso-8859-1';
 447
 448     # if $_[0] is null/empty, we don't guess its encoding
 449     return $fallback unless defined $_[0] && length $_[0];
 450
 451     my $charset;
 452     my @encodings = RT->Config->Get('EmailInputEncodings');
 453     if ( @encodings and eval { require Encode::Guess; 1 } ) {
 454         Encode::Guess->set_suspects( @encodings );
 455         my $decoder = Encode::Guess->guess( $_[0] );
 456
 457       if ( defined($decoder) ) {
 458         if ( ref $decoder ) {
 459             $charset = $decoder->name;
 460             $RT::Logger->debug("Guessed encoding: $charset");
 461             return $charset;
 462         }
 463         elsif ($decoder =~ /(\S+ or .+)/) {
 464             my %matched = map { $_ => 1 } split(/ or /, $1);
 465             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 466
 467             foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
 468                 next unless $matched{$suspect};
 469                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 470                 $charset = $suspect;
 471                 last;
 472             }
 473         }
 474         else {
 475             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 476         }
 477       }
 478       else {
 479           $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
 480       }
 481     }
 482     elsif ( @encodings && $@ ) {
 483         $RT::Logger->error("You have set EmailInputEncodings, but we couldn't load Encode::Guess: $@");
 484     } else {
 485         $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
 486     }
 487
 488     return ($charset || $fallback);
 489 }
 490
 491 # }}}
 492
 493 # {{{ SetMIMEHeadToEncoding
 494
 495 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 496
 497 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 498 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 499 all the time
 500
 501
 502 =cut
 503
 504 sub SetMIMEHeadToEncoding {
 505     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 506
 507     $charset = 'utf-8' if $charset eq 'utf8';
 508     $enc     = 'utf-8' if $enc     eq 'utf8';
 509
 510     return if $charset eq $enc and $preserve_words;
 511
 512     foreach my $tag ( $head->tags ) {
 513         next unless $tag; # seen in wild: headers with no name
 514         my @values = $head->get_all($tag);
 515         $head->delete($tag);
 516         foreach my $value (@values) {
 517             Encode::_utf8_off($value);
 518             my $orig_value = $value;
 519             if ( $charset ne $enc ) {
 520                 eval {
 521                     Encode::from_to( $value, $charset => $enc, Encode::FB_CROAK );
 522                 };
 523                 if ($@) {
 524                     $RT::Logger->error( "Encoding error: "
 525                           . $@
 526                           . " falling back to iso-8859-1 => $enc" );
 527                     $value = $orig_value;
 528                     eval {
 529                         Encode::from_to(
 530                             $value,
 531                             'iso-8859-1' => $enc,
 532                             Encode::FB_CROAK
 533                         );
 534                     };
 535                     if ($@) {
 536                         $RT::Logger->error( "Encoding error: "
 537                               . $@
 538                               . " forcing conversion to $charset => $enc" );
 539                         $value = $orig_value;
 540                         Encode::from_to( $value, $charset => $enc );
 541                     }
 542                 }
 543             }
 544             $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
 545                 unless $preserve_words;
 546             $head->add( $tag, $value );
 547         }
 548     }
 549
 550 }
 551 # }}}
 552
# Runs at load time, after everything above has been defined.
# NOTE(review): presumably this pulls in the site/vendor overlay files for
# this package -- confirm against RT::Base::_ImportOverlays.
RT::Base->_ImportOverlays();

# A module file must end with a true value for require/use to succeed.
1;  # End of module.
 556