rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2015 Best Practical Solutions, LLC
   6 #                                          <sales@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48
  49 =head1 NAME
  50
  51 RT::I18N - a base class for localization of RT
  52
  53 =cut
  54
  55 package RT::I18N;
  56
  57 use strict;
  58 use warnings;
  59
  60
  61 use Locale::Maketext 1.04;
  62 use Locale::Maketext::Lexicon 0.25;
  63 use base 'Locale::Maketext::Fuzzy';
  64
  65 use MIME::Entity;
  66 use MIME::Head;
  67 use File::Glob;
  68
  69 # I decree that this project's first language is English.
  70
  71 our %Lexicon = (
  72    'TEST_STRING' => 'Concrete Mixer',
  73
  74     '__Content-Type' => 'text/plain; charset=utf-8',
  75
  76   '_AUTO' => 1,
  77   # That means that lookup failures can't happen -- if we get as far
  78   #  as looking for something in this lexicon, and we don't find it,
  79   #  then automagically set $Lexicon{$key} = $key, before possibly
  80   #  compiling it.
  81
  82   # The exception is keys that start with "_" -- they aren't auto-makeable.
  83
  84 );
  85 # End of lexicon.
  86
  87 =head2 Init
  88
  89 Initializes the lexicons used for localization.
  90
  91
  92 =cut
  93
  94 sub Init {
  95
  96     my @lang = RT->Config->Get('LexiconLanguages');
  97     @lang = ('*') unless @lang;
  98
  99     # load default functions
 100     require substr(__FILE__, 0, -3) . '/i_default.pm';
 101
 102     # Load language-specific functions
 103     foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
 104         unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
 105             warn("$file is tainted. not loading");
 106             next;
 107         }
 108         $file = $1;
 109
 110         my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
 111         next unless grep $_ eq '*' || $_ eq $lang, @lang;
 112         require $file;
 113     }
 114
 115     my %import;
 116     foreach my $l ( @lang ) {
 117         $import{$l} = [
 118             Gettext => $RT::LexiconPath."/$l.po",
 119         ];
 120         push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
 121         push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
 122                                 Gettext => $RT::LocalLexiconPath."/$l.po");
 123     }
 124
 125     # Acquire all .po files and iterate them into lexicons
 126     Locale::Maketext::Lexicon->import({ _decode => 1, %import });
 127
 128     return 1;
 129 }
 130
 131 sub LoadLexicons {
 132
 133     no strict 'refs';
 134     foreach my $k (keys %{RT::I18N::} ) {
 135         next if $k eq 'main::';
 136         next unless index($k, '::', -2) >= 0;
 137         next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
 138
 139         my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
 140         # run fetch to force load
 141         my $tmp = $lex->{'foo'};
 142         # XXX: untie may fail with "untie attempted
 143         # while 1 inner references still exist"
 144         # TODO: untie that has to lower fetch impact
 145         # untie %$lex if tied %$lex;
 146     }
 147 }
 148
 149 =head2 encoding
 150
 151 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 152 If it can't find anything, it returns 'ISO-8859-1'
 153
 154
 155
 156 =cut
 157
 158
 159 sub encoding { 'utf-8' }
 160
 161
 162 =head2 SetMIMEEntityToUTF8 $entity
 163
 164 An utility function which will try to convert entity body into utf8.
 165 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 166
 167 =cut
 168
 169 sub SetMIMEEntityToUTF8 {
 170     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 171 }
 172
 173
 174
 175 =head2 IsTextualContentType $type
 176
 177 An utility function that determines whether $type is I<textual>, meaning
 178 that it can sensibly be converted to Unicode text.
 179
 180 Currently, it returns true iff $type matches this regular expression
 181 (case-insensitively):
 182
 183     ^(?:text/(?:plain|html)|message/rfc822)\b
 184
 185
 186 =cut
 187
 188 sub IsTextualContentType {
 189     my $type = shift;
 190     ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
 191 }
 192
 193
 194 =head2 SetMIMEEntityToEncoding $entity, $encoding
 195
 196 An utility function which will try to convert entity body into specified
 197 charset encoding (encoded as octets, *not* unicode-strings).  It will
 198 iterate all the entities in $entity, and try to convert each one into
 199 specified charset if whose Content-Type is 'text/plain'.
 200
 201 This function doesn't return anything meaningful.
 202
 203 =cut
 204
 205 sub SetMIMEEntityToEncoding {
 206     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 207
 208     # do the same for parts first of all
 209     SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 210
 211     my $head = $entity->head;
 212
 213     my $charset = _FindOrGuessCharset($entity);
 214     if ( $charset ) {
 215         unless( Encode::find_encoding($charset) ) {
 216             $RT::Logger->warning("Encoding '$charset' is not supported");
 217             $charset = undef;
 218         }
 219     }
 220     unless ( $charset ) {
 221         $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
 222         $head->mime_attr('Content-Type' => 'application/octet-stream');
 223         return;
 224     }
 225
 226     SetMIMEHeadToEncoding(
 227         $head,
 228         _FindOrGuessCharset($entity, 1) => $enc,
 229         $preserve_words
 230     );
 231
 232     # If this is a textual entity, we'd need to preserve its original encoding
 233     $head->replace( "X-RT-Original-Encoding" => Encode::encode( "UTF-8", $charset ) )
 234         if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
 235
 236     return unless IsTextualContentType($head->mime_type);
 237
 238     my $body = $entity->bodyhandle;
 239
 240     if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
 241         my $string = $body->as_string or return;
 242         RT::Util::assert_bytes($string);
 243
 244         $RT::Logger->debug( "Converting '$charset' to '$enc' for "
 245               . $head->mime_type . " - "
 246               . ( Encode::decode("UTF-8",$head->get('subject')) || 'Subjectless message' ) );
 247
 248         {
 249             no warnings 'utf8';
 250             $string = Encode::encode( $enc, Encode::decode( $charset, $string) );
 251         }
 252
 253         my $new_body = MIME::Body::InCore->new($string);
 254
 255         # set up the new entity
 256         $head->mime_attr( "content-type" => 'text/plain' )
 257           unless ( $head->mime_attr("content-type") );
 258         $head->mime_attr( "content-type.charset" => $enc );
 259         $entity->bodyhandle($new_body);
 260     }
 261 }
 262
 263 =head2 DecodeMIMEWordsToUTF8 $raw
 264
 265 An utility method which mimics MIME::Words::decode_mimewords, but only
 266 limited functionality.  Despite its name, this function returns the
 267 bytes of the string, in UTF-8.
 268
 269 =cut
 270
 271 sub DecodeMIMEWordsToUTF8 {
 272     my $str = shift;
 273     return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
 274 }
 275
 276 sub DecodeMIMEWordsToEncoding {
 277     my $str = shift;
 278     my $to_charset = _CanonicalizeCharset(shift);
 279     my $field = shift || '';
 280
 281     # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
 282     # continuations, and similar syntax from RFC 2231
 283     if ($field =~ /^Content-(Type|Disposition)/i) {
 284         # This concatenates continued parameters and normalizes encoded params
 285         # to QB encoded-words which we handle below
 286         $str = MIME::Field::ParamVal->parse($str)->stringify;
 287     }
 288
 289     # Pre-parse by removing all whitespace between encoded words
 290     my $encoded_word = qr/
 291                  =\?            # =?
 292                  ([^?]+?)       # charset
 293                  (?:\*[^?]+)?   # optional '*language'
 294                  \?             # ?
 295                  ([QqBb])       # encoding
 296                  \?             # ?
 297                  ([^?]+)        # encoded string
 298                  \?=            # ?=
 299                  /x;
 300     $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;
 301
 302     # Also merge quoted-printable sections together, in case multiple
 303     # octets of a single encoded character were split between chunks.
 304     # Though not valid according to RFC 2047, this has been seen in the
 305     # wild.
 306     1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
 307
 308     # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
 309     # custom MIME word decoding and charset canonicalization.  We can't do this
 310     # until we parse before decode, instead of the other way around.
 311     my @list = $str =~ m/(.*?)          # prefix
 312                          $encoded_word
 313                          ([^=]*)        # trailing
 314                         /xgcs;
 315
 316     if ( @list ) {
 317         # add everything that hasn't matched to the end of the latest
 318         # string in array this happen when we have 'key="=?encoded?="; key="plain"'
 319         $list[-1] .= substr($str, pos $str);
 320
 321         $str = "";
 322         while (@list) {
 323             my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 324                     splice @list, 0, 5;
 325             $charset  = _CanonicalizeCharset($charset);
 326             $encoding = lc $encoding;
 327
 328             $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 329
 330             if ( $encoding eq 'q' ) {
 331                 use MIME::QuotedPrint;
 332                 $enc_str =~ tr/_/ /;            # Observed from Outlook Express
 333                 $enc_str = decode_qp($enc_str);
 334             } elsif ( $encoding eq 'b' ) {
 335                 use MIME::Base64;
 336                 $enc_str = decode_base64($enc_str);
 337             } else {
 338                 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
 339                     ."only Q(uoted-printable) and B(ase64) are supported");
 340             }
 341
 342             # now we have got a decoded subject, try to convert into the encoding
 343             if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
 344                 if ( Encode::find_encoding($charset) ) {
 345                     Encode::from_to( $enc_str, $charset, $to_charset );
 346                 } else {
 347                     $RT::Logger->warning("Charset '$charset' is not supported");
 348                     $enc_str =~ s/[^[:print:]]/\357\277\275/g;
 349                     Encode::from_to( $enc_str, 'UTF-8', $to_charset )
 350                         unless $to_charset eq 'utf-8';
 351                 }
 352             }
 353
 354             # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 355             # We _should_ be preserving them encoded until after parsing is completed and
 356             # THEN undo the mime-encoding.
 357             #
 358             # This routine should be translating the existing mimeencoding to utf8 but leaving
 359             # things encoded.
 360             #
 361             # It's legal for headers to contain mime-encoded commas and semicolons which
 362             # should not be treated as address separators. (Encoding == quoting here)
 363             #
 364             # until this is fixed, we must escape any string containing a comma or semicolon
 365             # this is only a bandaid
 366
 367             # Some _other_ MUAs encode quotes _already_, and double quotes
 368             # confuse us a lot, so only quote it if it isn't quoted
 369             # already.
 370             $enc_str = qq{"$enc_str"}
 371                 if $enc_str =~ /[,;]/
 372                 and $enc_str !~ /^".*"$/
 373                 and $prefix !~ /"$/ and $trailing !~ /^"/
 374                 and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
 375
 376             $str .= $prefix . $enc_str . $trailing;
 377         }
 378     }
 379
 380     # We might have \n without trailing whitespace, which will result in
 381     # invalid headers.
 382     $str =~ s/\n//g;
 383
 384     return ($str)
 385 }
 386
 387
 388
 389 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 390
 391 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 392
 393 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 394
 395 =cut
 396
 397 sub _FindOrGuessCharset {
 398     my $entity = shift;
 399     my $head_only = shift;
 400     my $head = $entity->head;
 401
 402     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 403         return _CanonicalizeCharset($charset);
 404     }
 405
 406     if ( !$head_only and $head->mime_type =~ m{^text/} ) {
 407         my $body = $entity->bodyhandle or return;
 408         return _GuessCharset( $body->as_string );
 409     }
 410     else {
 411
 412         # potentially binary data -- don't guess the body
 413         return _GuessCharset( $head->as_string );
 414     }
 415 }
 416
 417
 418
 419 =head2 _GuessCharset STRING
 420
 421 use Encode::Guess to try to figure it out the string's encoding.
 422
 423 =cut
 424
 425 use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
 426 use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
 427
 428 sub _GuessCharset {
 429     my $fallback = _CanonicalizeCharset('iso-8859-1');
 430
 431     # if $_[0] is null/empty, we don't guess its encoding
 432     return $fallback
 433         unless defined $_[0] && length $_[0];
 434
 435     my @encodings = RT->Config->Get('EmailInputEncodings');
 436     unless ( @encodings ) {
 437         $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
 438         return $fallback;
 439     }
 440
 441     if ( $encodings[0] eq '*' ) {
 442         shift @encodings;
 443         if ( HAS_ENCODE_DETECT ) {
 444             my $charset = Encode::Detect::Detector::detect( $_[0] );
 445             if ( $charset ) {
 446                 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
 447                 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
 448             }
 449             else {
 450                 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
 451             }
 452         }
 453         else {
 454             $RT::Logger->error(
 455                 "You requested to guess encoding, but we couldn't"
 456                 ." load Encode::Detect::Detector module"
 457             );
 458         }
 459     }
 460
 461     unless ( @encodings ) {
 462         $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
 463         return $fallback;
 464     }
 465
 466     unless ( HAS_ENCODE_GUESS ) {
 467         $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
 468         return $fallback;
 469     }
 470
 471     Encode::Guess->set_suspects( @encodings );
 472     my $decoder = Encode::Guess->guess( $_[0] );
 473     unless ( defined $decoder ) {
 474         $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
 475         return $fallback;
 476     }
 477
 478     if ( ref $decoder ) {
 479         my $charset = $decoder->name;
 480         $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
 481         return _CanonicalizeCharset( $charset );
 482     }
 483     elsif ($decoder =~ /(\S+ or .+)/) {
 484         my %matched = map { $_ => 1 } split(/ or /, $1);
 485         return 'utf-8' if $matched{'utf8'}; # one and only normalization
 486
 487         foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
 488             next unless $matched{$suspect};
 489             $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 490             return _CanonicalizeCharset( $suspect );
 491         }
 492     }
 493     else {
 494         $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 495     }
 496
 497     return $fallback;
 498 }
 499
 500 =head2 _CanonicalizeCharset NAME
 501
 502 canonicalize charset, return lowercase version.
 503 special cases are: gb2312 => gbk, utf8 => utf-8
 504
 505 =cut
 506
 507 sub _CanonicalizeCharset {
 508     my $charset = lc shift;
 509     return $charset unless $charset;
 510
 511     # Canonicalize aliases if they're known
 512     if (my $canonical = Encode::resolve_alias($charset)) {
 513         $charset = $canonical;
 514     }
 515
 516     if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
 517         return 'utf-8';
 518     }
 519     elsif ( $charset eq 'euc-cn' ) {
 520         # gbk is superset of gb2312/euc-cn so it's safe
 521         return 'gbk';
 522         # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
 523         # but needs Encode::HanExtra installed
 524     }
 525     else {
 526         return $charset;
 527     }
 528 }
 529
 530
 531 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 532
 533 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 534 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 535 all the time
 536
 537
 538 =cut
 539
 540 sub SetMIMEHeadToEncoding {
 541     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 542
 543     $charset = _CanonicalizeCharset($charset);
 544     $enc     = _CanonicalizeCharset($enc);
 545
 546     return if $charset eq $enc and $preserve_words;
 547
 548     RT::Util::assert_bytes( $head->as_string );
 549     foreach my $tag ( $head->tags ) {
 550         next unless $tag; # seen in wild: headers with no name
 551         my @values = $head->get_all($tag);
 552         $head->delete($tag);
 553         foreach my $value (@values) {
 554             if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
 555                 no warnings 'utf8';
 556                 $value = Encode::encode( $enc, Encode::decode( $charset, $value) );
 557             }
 558             $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
 559                 unless $preserve_words;
 560
 561             # We intentionally add a leading space when re-adding the
 562             # header; Mail::Header strips it before storing, but it
 563             # serves to prevent it from "helpfully" canonicalizing
 564             # $head->add("Subject", "Subject: foo") into the same as
 565             # $head->add("Subject", "foo");
 566             $head->add( $tag, " " . $value );
 567         }
 568     }
 569
 570 }
 571
 572 RT::Base->_ImportOverlays();
 573
 574 1;  # End of module.
 575