rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC
   6 #                                          <sales@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48
  49 =head1 NAME
  50
  51 RT::I18N - a base class for localization of RT
  52
  53 =cut
  54
  55 package RT::I18N;
  56
  57 use strict;
  58 use warnings;
  59
  60
  61 use Locale::Maketext 1.04;
  62 use Locale::Maketext::Lexicon 0.25;
  63 use base 'Locale::Maketext::Fuzzy';
  64
  65 use Encode;
  66 use MIME::Entity;
  67 use MIME::Head;
  68 use File::Glob;
  69
  70 # I decree that this project's first language is English.
  71
  72 our %Lexicon = (
  73    'TEST_STRING' => 'Concrete Mixer',
  74
  75     '__Content-Type' => 'text/plain; charset=utf-8',
  76
  77   '_AUTO' => 1,
  78   # That means that lookup failures can't happen -- if we get as far
  79   #  as looking for something in this lexicon, and we don't find it,
  80   #  then automagically set $Lexicon{$key} = $key, before possibly
  81   #  compiling it.
  82
  83   # The exception is keys that start with "_" -- they aren't auto-makeable.
  84
  85 );
  86 # End of lexicon.
  87
  88 =head2 Init
  89
  90 Initializes the lexicons used for localization.
  91
  92
  93 =cut
  94
  95 sub Init {
  96
  97     my @lang = RT->Config->Get('LexiconLanguages');
  98     @lang = ('*') unless @lang;
  99
 100     # load default functions
 101     require substr(__FILE__, 0, -3) . '/i_default.pm';
 102
 103     # Load language-specific functions
 104     foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
 105         unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
 106             warn("$file is tainted. not loading");
 107             next;
 108         }
 109         $file = $1;
 110
 111         my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
 112         next unless grep $_ eq '*' || $_ eq $lang, @lang;
 113         require $file;
 114     }
 115
 116     my %import;
 117     foreach my $l ( @lang ) {
 118         $import{$l} = [
 119             Gettext => $RT::LexiconPath."/$l.po",
 120         ];
 121         push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
 122         push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
 123                                 Gettext => $RT::LocalLexiconPath."/$l.po");
 124     }
 125
 126     # Acquire all .po files and iterate them into lexicons
 127     Locale::Maketext::Lexicon->import({ _decode => 1, %import });
 128
 129     return 1;
 130 }
 131
 132 sub LoadLexicons {
 133
 134     no strict 'refs';
 135     foreach my $k (keys %{RT::I18N::} ) {
 136         next if $k eq 'main::';
 137         next unless index($k, '::', -2) >= 0;
 138         next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
 139
 140         my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
 141         # run fetch to force load
 142         my $tmp = $lex->{'foo'};
 143         # XXX: untie may fail with "untie attempted
 144         # while 1 inner references still exist"
 145         # TODO: untie that has to lower fetch impact
 146         # untie %$lex if tied %$lex;
 147     }
 148 }
 149
 150 =head2 encoding
 151
Returns the encoding of the current lexicon.  This is always 'utf-8'.
 154
 155
 156
 157 =cut
 158
 159
 160 sub encoding { 'utf-8' }
 161
 162
 163 =head2 SetMIMEEntityToUTF8 $entity
 164
A utility function which will try to convert the entity body into UTF-8.
It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
 167
 168 =cut
 169
 170 sub SetMIMEEntityToUTF8 {
 171     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 172 }
 173
 174
 175
 176 =head2 IsTextualContentType $type
 177
A utility function that determines whether $type is I<textual>, meaning
that it can sensibly be converted to Unicode text.
 180
 181 Currently, it returns true iff $type matches this regular expression
 182 (case-insensitively):
 183
 184     ^(?:text/(?:plain|html)|message/rfc822)\b
 185
 186
 187 =cut
 188
 189 sub IsTextualContentType {
 190     my $type = shift;
 191     ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
 192 }
 193
 194
 195 =head2 SetMIMEEntityToEncoding $entity, $encoding
 196
A utility function which will try to convert the entity body into the specified
charset encoding (encoded as octets, *not* unicode-strings).  It recurses into
all the parts of $entity, and converts the body of each one whose Content-Type
is textual (see L</IsTextualContentType $type>) into the specified charset.
 201
 202 This function doesn't return anything meaningful.
 203
 204 =cut
 205
 206 sub SetMIMEEntityToEncoding {
 207     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 208
 209     # do the same for parts first of all
 210     SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 211
 212     my $head = $entity->head;
 213
 214     my $charset = _FindOrGuessCharset($entity);
 215     if ( $charset ) {
 216         unless( Encode::find_encoding($charset) ) {
 217             $RT::Logger->warning("Encoding '$charset' is not supported");
 218             $charset = undef;
 219         }
 220     }
 221     unless ( $charset ) {
 222         $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
 223         $head->mime_attr('Content-Type' => 'application/octet-stream');
 224         return;
 225     }
 226
 227     SetMIMEHeadToEncoding(
 228         $head,
 229         _FindOrGuessCharset($entity, 1) => $enc,
 230         $preserve_words
 231     );
 232
 233     # If this is a textual entity, we'd need to preserve its original encoding
 234     $head->replace( "X-RT-Original-Encoding" => $charset )
 235         if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
 236
 237     return unless IsTextualContentType($head->mime_type);
 238
 239     my $body = $entity->bodyhandle;
 240
 241     if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
 242         my $string = $body->as_string or return;
 243
 244         $RT::Logger->debug( "Converting '$charset' to '$enc' for "
 245               . $head->mime_type . " - "
 246               . ( $head->get('subject') || 'Subjectless message' ) );
 247
 248         # NOTE:: see the comments at the end of the sub.
 249         Encode::_utf8_off($string);
 250         Encode::from_to( $string, $charset => $enc );
 251
 252         my $new_body = MIME::Body::InCore->new($string);
 253
 254         # set up the new entity
 255         $head->mime_attr( "content-type" => 'text/plain' )
 256           unless ( $head->mime_attr("content-type") );
 257         $head->mime_attr( "content-type.charset" => $enc );
 258         $entity->bodyhandle($new_body);
 259     }
 260 }
 261
 262 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 263 #
 264 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 265 #
 266 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 267 # ... The data in $octets must be encoded as octets and not as
 268 # characters in Perl's internal format. ...
 269 #
# Not turning off the UTF-8 flag in the string will prevent the string
# from being converted.
 272
 273
 274
 275 =head2 DecodeMIMEWordsToUTF8 $raw
 276
A utility method which mimics MIME::Words::decode_mimewords, but with only
limited functionality.  This function returns a utf-8 string.
 279
 280 It returns the decoded string, or the original string if it's not
 281 encoded.  Since the subroutine converts specified string into utf-8
 282 charset, it should not alter a subject written in English.
 283
 284 Why not use MIME::Words directly?  Because it fails in RT when I
 285 tried.  Maybe it's ok now.
 286
 287 =cut
 288
 289 sub DecodeMIMEWordsToUTF8 {
 290     my $str = shift;
 291     return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
 292 }
 293
# DecodeMIMEWordsToEncoding $str, $to_charset [, $field]
#
# Decodes the MIME encoded-words (=?charset?Q|B?...?=) found in $str and
# converts each decoded chunk into $to_charset (canonicalized first).
# $field is the optional name of the header $str was taken from; it
# enables RFC 2231 parameter handling for Content-Type/Content-Disposition
# and restricts the comma/semicolon quoting workaround to To, From, Cc,
# Bcc and Content-* headers (with no $field given, quoting always applies).
#
# Returns the decoded string with all newlines removed; a string without
# encoded-words is returned with only its newlines removed.
sub DecodeMIMEWordsToEncoding {
    my $str = shift;
    my $to_charset = _CanonicalizeCharset(shift);
    my $field = shift || '';

    # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
    # continuations, and similar syntax from RFC 2231
    if ($field =~ /^Content-(Type|Disposition)/i) {
        # This concatenates continued parameters and normalizes encoded params
        # to QB encoded-words which we handle below
        $str = MIME::Field::ParamVal->parse($str)->stringify;
    }

    # Pre-parse by removing all whitespace between encoded words
    my $encoded_word = qr/
                 =\?            # =?
                 ([^?]+?)       # charset
                 (?:\*[^?]+)?   # optional '*language'
                 \?             # ?
                 ([QqBb])       # encoding
                 \?             # ?
                 ([^?]+)        # encoded string
                 \?=            # ?=
                 /x;
    $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;

    # Also merge quoted-printable sections together, in case multiple
    # octets of a single encoded character were split between chunks.
    # Though not valid according to RFC 2047, this has been seen in the
    # wild.
    1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;

    # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
    # custom MIME word decoding and charset canonicalization.  We can't do this
    # until we parse before decode, instead of the other way around.
    #
    # Each match contributes five elements to @list: prefix, charset,
    # encoding, encoded string and trailing text.  The /c flag keeps
    # pos($str) at the end of the last successful match so that the
    # unmatched remainder can be picked up below.
    my @list = $str =~ m/(.*?)          # prefix
                         $encoded_word
                         ([^=]*)        # trailing
                        /xgcs;

    if ( @list ) {
        # add everything that hasn't matched to the end of the latest
        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
        $list[-1] .= substr($str, pos $str);

        # Rebuild $str from scratch out of the decoded pieces
        $str = "";
        while (@list) {
            my ($prefix, $charset, $encoding, $enc_str, $trailing) =
                    splice @list, 0, 5;
            $charset  = _CanonicalizeCharset($charset);
            $encoding = lc $encoding;

            $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express

            if ( $encoding eq 'q' ) {
                use MIME::QuotedPrint;
                $enc_str =~ tr/_/ /;            # Observed from Outlook Express
                $enc_str = decode_qp($enc_str);
            } elsif ( $encoding eq 'b' ) {
                use MIME::Base64;
                $enc_str = decode_base64($enc_str);
            } else {
                # NOTE(review): $encoded_word only matches Q/q/B/b, so this
                # branch looks unreachable -- confirm before relying on it.
                $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
                    ."only Q(uoted-printable) and B(ase64) are supported");
            }

            # now we have got a decoded subject, try to convert into the encoding
            if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
                if ( Encode::find_encoding($charset) ) {
                    Encode::from_to( $enc_str, $charset, $to_charset );
                } else {
                    $RT::Logger->warning("Charset '$charset' is not supported");
                    # Replace every non-printable byte with the UTF-8 octets
                    # of U+FFFD (REPLACEMENT CHARACTER), then convert that
                    # UTF-8 data to the target charset if it differs.
                    $enc_str =~ s/[^[:print:]]/\357\277\275/g;
                    Encode::from_to( $enc_str, 'UTF-8', $to_charset )
                        unless $to_charset eq 'utf-8';
                }
            }

            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
            # We _should_ be preserving them encoded until after parsing is completed and
            # THEN undo the mime-encoding.
            #
            # This routine should be translating the existing mimeencoding to utf8 but leaving
            # things encoded.
            #
            # It's legal for headers to contain mime-encoded commas and semicolons which
            # should not be treated as address separators. (Encoding == quoting here)
            #
            # until this is fixed, we must escape any string containing a comma or semicolon
            # this is only a bandaid

            # Some _other_ MUAs encode quotes _already_, and double quotes
            # confuse us a lot, so only quote it if it isn't quoted
            # already.
            $enc_str = qq{"$enc_str"}
                if $enc_str =~ /[,;]/
                and $enc_str !~ /^".*"$/
                and $prefix !~ /"$/ and $trailing !~ /^"/
                and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);

            $str .= $prefix . $enc_str . $trailing;
        }
    }

    # We might have \n without trailing whitespace, which will result in
    # invalid headers.
    $str =~ s/\n//g;

    return ($str)
}
 404
 405
 406
 407 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 408
 409 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 410
 411 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 412
 413 =cut
 414
 415 sub _FindOrGuessCharset {
 416     my $entity = shift;
 417     my $head_only = shift;
 418     my $head = $entity->head;
 419
 420     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 421         return _CanonicalizeCharset($charset);
 422     }
 423
 424     if ( !$head_only and $head->mime_type =~ m{^text/} ) {
 425         my $body = $entity->bodyhandle or return;
 426         return _GuessCharset( $body->as_string );
 427     }
 428     else {
 429
 430         # potentially binary data -- don't guess the body
 431         return _GuessCharset( $head->as_string );
 432     }
 433 }
 434
 435
 436
 437 =head2 _GuessCharset STRING
 438
Uses Encode::Detect::Detector (when C<*> is the first configured input encoding) and Encode::Guess to try to figure out the string's encoding.
 440
 441 =cut
 442
 443 use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
 444 use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
 445
 446 sub _GuessCharset {
 447     my $fallback = _CanonicalizeCharset('iso-8859-1');
 448
 449     # if $_[0] is null/empty, we don't guess its encoding
 450     return $fallback
 451         unless defined $_[0] && length $_[0];
 452
 453     my @encodings = RT->Config->Get('EmailInputEncodings');
 454     unless ( @encodings ) {
 455         $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
 456         return $fallback;
 457     }
 458
 459     if ( $encodings[0] eq '*' ) {
 460         shift @encodings;
 461         if ( HAS_ENCODE_DETECT ) {
 462             my $charset = Encode::Detect::Detector::detect( $_[0] );
 463             if ( $charset ) {
 464                 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
 465                 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
 466             }
 467             else {
 468                 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
 469             }
 470         }
 471         else {
 472             $RT::Logger->error(
 473                 "You requested to guess encoding, but we couldn't"
 474                 ." load Encode::Detect::Detector module"
 475             );
 476         }
 477     }
 478
 479     unless ( @encodings ) {
 480         $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
 481         return $fallback;
 482     }
 483
 484     unless ( HAS_ENCODE_GUESS ) {
 485         $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
 486         return $fallback;
 487     }
 488
 489     Encode::Guess->set_suspects( @encodings );
 490     my $decoder = Encode::Guess->guess( $_[0] );
 491     unless ( defined $decoder ) {
 492         $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
 493         return $fallback;
 494     }
 495
 496     if ( ref $decoder ) {
 497         my $charset = $decoder->name;
 498         $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
 499         return _CanonicalizeCharset( $charset );
 500     }
 501     elsif ($decoder =~ /(\S+ or .+)/) {
 502         my %matched = map { $_ => 1 } split(/ or /, $1);
 503         return 'utf-8' if $matched{'utf8'}; # one and only normalization
 504
 505         foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
 506             next unless $matched{$suspect};
 507             $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 508             return _CanonicalizeCharset( $suspect );
 509         }
 510     }
 511     else {
 512         $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 513     }
 514
 515     return $fallback;
 516 }
 517
 518 =head2 _CanonicalizeCharset NAME
 519
 520 canonicalize charset, return lowercase version.
 521 special cases are: gb2312 => gbk, utf8 => utf-8
 522
 523 =cut
 524
 525 sub _CanonicalizeCharset {
 526     my $charset = lc shift;
 527     return $charset unless $charset;
 528
 529     # Canonicalize aliases if they're known
 530     if (my $canonical = Encode::resolve_alias($charset)) {
 531         $charset = $canonical;
 532     }
 533
 534     if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
 535         return 'utf-8';
 536     }
 537     elsif ( $charset eq 'euc-cn' ) {
 538         # gbk is superset of gb2312/euc-cn so it's safe
 539         return 'gbk';
 540         # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
 541         # but needs Encode::HanExtra installed
 542     }
 543     else {
 544         return $charset;
 545     }
 546 }
 547
 548
 549 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 550
 551 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 552 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 553 all the time
 554
 555
 556 =cut
 557
 558 sub SetMIMEHeadToEncoding {
 559     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 560
 561     $charset = _CanonicalizeCharset($charset);
 562     $enc     = _CanonicalizeCharset($enc);
 563
 564     return if $charset eq $enc and $preserve_words;
 565
 566     foreach my $tag ( $head->tags ) {
 567         next unless $tag; # seen in wild: headers with no name
 568         my @values = $head->get_all($tag);
 569         $head->delete($tag);
 570         foreach my $value (@values) {
 571             if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
 572                 Encode::_utf8_off($value);
 573                 Encode::from_to( $value, $charset => $enc );
 574             }
 575             $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
 576                 unless $preserve_words;
 577
 578             # We intentionally add a leading space when re-adding the
 579             # header; Mail::Header strips it before storing, but it
 580             # serves to prevent it from "helpfully" canonicalizing
 581             # $head->add("Subject", "Subject: foo") into the same as
 582             # $head->add("Subject", "foo");
 583             $head->add( $tag, " " . $value );
 584         }
 585     }
 586
 587 }
 588
# NOTE(review): presumably pulls in site-local overlay files for this
# package -- confirm against RT::Base::_ImportOverlays.
RT::Base->_ImportOverlays();

1;  # End of module.
 592