rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2007 Best Practical Solutions, LLC
   6 #                                          <jesse@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/copyleft/gpl.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48 =head1 NAME
  49
  50 RT::I18N - a base class for localization of RT
  51
  52 =cut
  53
  54 package RT::I18N;
  55
  56 use strict;
  57 use warnings;
  58
  59 use Locale::Maketext 1.04;
  60 use Locale::Maketext::Lexicon 0.25;
  61 use base ('Locale::Maketext::Fuzzy');
  62
  63 use Encode;
  64 use MIME::Entity;
  65 use MIME::Head;
  66
  67 # I decree that this project's first language is English.
  68
  69 our %Lexicon = (
  70    'TEST_STRING' => 'Concrete Mixer',
  71
  72     '__Content-Type' => 'text/plain; charset=utf-8',
  73
  74   '_AUTO' => 1,
  75   # That means that lookup failures can't happen -- if we get as far
  76   #  as looking for something in this lexicon, and we don't find it,
  77   #  then automagically set $Lexicon{$key} = $key, before possibly
  78   #  compiling it.
  79
  80   # The exception is keys that start with "_" -- they aren't auto-makeable.
  81
  82 );
  83 # End of lexicon.
  84
  85 =head2 Init
  86
  87 Initializes the lexicons used for localization.
  88
  89 =begin testing
  90
  91 use_ok (RT::I18N);
  92 ok(RT::I18N->Init);
  93
  94 =end testing
  95
  96 =cut
  97
  98 sub Init {
  99     require File::Glob;
 100
 101     # Load language-specific functions
 102     foreach my $language ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm")) {
 103         if ($language =~ /^([-\w\s.\/\\~:]+)$/) {
 104             require $1;
 105         }
 106         else {
 107             warn("$language is tainted. not loading");
 108         }
 109     }
 110
 111     my @lang = @RT::LexiconLanguages;
 112     @lang = ('*') unless @lang;
 113
 114     # Acquire all .po files and iterate them into lexicons
 115     Locale::Maketext::Lexicon->import({
 116         _decode => 1, map {
 117             $_  => [
 118                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
 119                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
 120                 Gettext => "$RT::LocalLexiconPath/$_.po",
 121             ],
 122         } @lang
 123     });
 124
 125     return 1;
 126 }
 127
 128 =head2 encoding
 129
 130 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 131 If it can't find anything, it returns 'ISO-8859-1'
 132
 133 =begin testing
 134
 135 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 136 ok(UNIVERSAL::can($chinese, 'maketext'));
 137 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 138 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 139
 140 ok(my $en = RT::I18N->get_handle('en'));
 141 ok(UNIVERSAL::can($en, 'maketext'));
 142 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 143
 144 =end testing
 145
 146
 147 =cut
 148
 149
 150 sub encoding { 'utf-8' }
 151
 152 # {{{ SetMIMEEntityToUTF8
 153
 154 =head2 SetMIMEEntityToUTF8 $entity
 155
 156 An utility function which will try to convert entity body into utf8.
 157 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 158
 159 =cut
 160
 161 sub SetMIMEEntityToUTF8 {
 162     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 163 }
 164
 165 # }}}
 166
 167 # {{{ IsTextualContentType
 168
 169 =head2 IsTextualContentType $type
 170
 171 An utility function that determines whether $type is I<textual>, meaning
 172 that it can sensibly be converted to Unicode text.
 173
 174 Currently, it returns true iff $type matches this regular expression
 175 (case-insensitively):
 176
 177     ^(?:text/(?:plain|html)|message/rfc822)\b
 178
 179 # }}}
 180
 181 =cut
 182
 183 sub IsTextualContentType {
 184     my $type = shift;
 185     ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
 186 }
 187
 188 # {{{ SetMIMEEntityToEncoding
 189
 190 =head2 SetMIMEEntityToEncoding $entity, $encoding
 191
 192 An utility function which will try to convert entity body into specified
 193 charset encoding (encoded as octets, *not* unicode-strings).  It will
 194 iterate all the entities in $entity, and try to convert each one into
 195 specified charset if whose Content-Type is 'text/plain'.
 196
 197 This function doesn't return anything meaningful.
 198
 199 =cut
 200
 201 sub SetMIMEEntityToEncoding {
 202     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 203
 204     # do the same for parts first of all
 205     SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 206
 207     my $charset = _FindOrGuessCharset($entity) or return;
 208     # one and only normalization
 209     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 210     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 211
 212     SetMIMEHeadToEncoding(
 213         $entity->head,
 214         _FindOrGuessCharset($entity, 1) => $enc,
 215         $preserve_words
 216     );
 217
 218     my $head = $entity->head;
 219
 220     # convert at least MIME word encoded attachment filename
 221     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 222         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 223             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 224         }
 225     }
 226
 227     # If this is a textual entity, we'd need to preserve its original encoding
 228     $head->add( "X-RT-Original-Encoding" => $charset )
 229         if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
 230
 231     return unless IsTextualContentType($head->mime_type);
 232
 233     my $body = $entity->bodyhandle;
 234
 235     if ( $enc ne $charset && $body) {
 236         my @lines = $body->as_lines or return;
 237
 238         # {{{ Convert the body
 239         eval {
 240             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". ($head->get('subject') || 'Subjectless message'));
 241
 242             # NOTE:: see the comments at the end of the sub.
 243             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 244             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 245         };
 246
 247         if ($@) {
 248             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 249             eval {
 250                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 251             };
 252             if ($@) {
 253                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 254             }
 255         }
 256         # }}}
 257
 258         my $new_body = MIME::Body::InCore->new( \@lines );
 259
 260         # set up the new entity
 261         $head->mime_attr( "content-type" => 'text/plain' )
 262           unless ( $head->mime_attr("content-type") );
 263         $head->mime_attr( "content-type.charset" => $enc );
 264         $entity->bodyhandle($new_body);
 265     }
 266 }
 267
 268 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 269 #
 270 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 271 #
 272 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 273 # ... The data in $octets must be encoded as octets and not as
 274 # characters in Perl's internal format. ...
 275 #
 276 # Not turning off the UTF-8 flag in the string will prevent the string
 277 # from conversion.
 278
 279 # }}}
 280
 281 # {{{ DecodeMIMEWordsToUTF8
 282
 283 =head2 DecodeMIMEWordsToUTF8 $raw
 284
 285 An utility method which mimics MIME::Words::decode_mimewords, but only
 286 limited functionality.  This function returns an utf-8 string.
 287
 288 It returns the decoded string, or the original string if it's not
 289 encoded.  Since the subroutine converts specified string into utf-8
 290 charset, it should not alter a subject written in English.
 291
 292 Why not use MIME::Words directly?  Because it fails in RT when I
 293 tried.  Maybe it's ok now.
 294
 295 =cut
 296
 297 sub DecodeMIMEWordsToUTF8 {
 298     my $str = shift;
 299     DecodeMIMEWordsToEncoding($str, 'utf-8');
 300 }
 301
 302 sub DecodeMIMEWordsToEncoding {
 303     my $str = shift;
 304     my $enc = shift;
 305
 306     @_ = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
 307     return ($str) unless (@_);
 308
 309     # add everything that hasn't matched to the end of the latest
 310     # string in array this happen when we have 'key="=?encoded?="; key="plain"'
 311     $_[-1] .= substr($str, pos $str);
 312
 313     $str = "";
 314     while (@_) {
 315         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 316             (shift, shift, lc shift, shift, shift);
 317
 318         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 319
 320         if ( $encoding eq 'q' ) {
 321             use MIME::QuotedPrint;
 322             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 323             $enc_str = decode_qp($enc_str);
 324         } elsif ( $encoding eq 'b' ) {
 325             use MIME::Base64;
 326             $enc_str = decode_base64($enc_str);
 327         } else {
 328             $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
 329             ."only Q(uoted-printable) and B(ase64) are supported");
 330         }
 331
 332         # now we have got a decoded subject, try to convert into the encoding
 333         unless ($charset eq $enc) {
 334             eval { Encode::from_to($enc_str, $charset,  $enc) };
 335             if ($@) {
 336                 $charset = _GuessCharset( $enc_str );
 337                 Encode::from_to($enc_str, $charset, $enc);
 338             }
 339         }
 340
 341         # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 342         # We _should_ be preserving them encoded until after parsing is completed and
 343         # THEN undo the mime-encoding.
 344         #
 345         # This routine should be translating the existing mimeencoding to utf8 but leaving
 346         # things encoded.
 347         #
 348         # It's legal for headers to contain mime-encoded commas and semicolons which
 349         # should not be treated as address separators. (Encoding == quoting here)
 350         #
 351         # until this is fixed, we must escape any string containing a comma or semicolon
 352         # this is only a bandaid
 353
 354         $enc_str = qq{"$enc_str"} if ($enc_str =~ /[,;]/);
 355         $str .= $prefix . $enc_str . $trailing;
 356     }
 357
 358     # We might have \n without trailing whitespace, which will result in
 359     # invalid headers.
 360     $str =~ s/\n//g;
 361
 362     return ($str)
 363 }
 364
 365 # }}}
 366
 367 # {{{ _FindOrGuessCharset
 368
 369 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 370
 371 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 372
 373 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 374
 375 =cut
 376
 377 sub _FindOrGuessCharset {
 378     my $entity = shift;
 379     my $head_only = shift;
 380     my $head = $entity->head;
 381
 382     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 383         return $charset;
 384     }
 385
 386     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 387         my $body = $entity->bodyhandle or return;
 388         return _GuessCharset( $body->as_string );
 389     }
 390     else {
 391         # potentially binary data -- don't guess the body
 392         return _GuessCharset( $head->as_string );
 393     }
 394 }
 395
 396 # }}}
 397
 398 # {{{ _GuessCharset
 399
 400 =head2 _GuessCharset STRING
 401
 402 use Encode::Guess to try to figure it out the string's encoding.
 403
 404 =cut
 405
 406 sub _GuessCharset {
 407     my $fallback = 'iso-8859-1';
 408     my $charset;
 409
 410     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 411         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 412         my $decoder = Encode::Guess->guess( $_[0] );
 413
 414       if ( defined($decoder) ) {
 415         if ( ref $decoder ) {
 416             $charset = $decoder->name;
 417             $RT::Logger->debug("Guessed encoding: $charset");
 418             return $charset;
 419         }
 420         elsif ($decoder =~ /(\S+ or .+)/) {
 421             my %matched = map { $_ => 1 } split(/ or /, $1);
 422             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 423
 424             foreach my $suspect (@RT::EmailInputEncodings) {
 425                 next unless $matched{$suspect};
 426                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 427                 $charset = $suspect;
 428                 last;
 429             }
 430         }
 431         else {
 432             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 433         }
 434       }
 435       else {
 436           $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
 437       }
 438     }
 439     else {
 440         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 441     }
 442
 443     return($charset || $fallback);
 444 }
 445
 446 # }}}
 447
 448 # {{{ SetMIMEHeadToEncoding
 449
 450 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 451
 452 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 453 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 454 all the time
 455
 456
 457 =cut
 458
 459 sub SetMIMEHeadToEncoding {
 460     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 461
 462     $charset = 'utf-8' if $charset eq 'utf8';
 463     $enc     = 'utf-8' if $enc     eq 'utf8';
 464
 465     return if $charset eq $enc and $preserve_words;
 466
 467     foreach my $tag ( $head->tags ) {
 468         next unless $tag; # seen in wild: headers with no name
 469         my @values = $head->get_all($tag);
 470         $head->delete($tag);
 471         foreach my $value (@values) {
 472             if ( $charset ne $enc ) {
 473
 474                 eval {
 475                     Encode::_utf8_off($value);
 476                     Encode::from_to( $value, $charset => $enc );
 477                 };
 478                 if ($@) {
 479                     $RT::Logger->error( "Encoding error: " . $@
 480                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 481                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 482                     if ($@) {
 483                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 484                     }
 485                 }
 486             }
 487             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 488             $head->add( $tag, $value );
 489         }
 490     }
 491
 492 }
 493 # }}}
 494
 495 eval "require RT::I18N_Vendor";
 496 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 497 eval "require RT::I18N_Local";
 498 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 499
 500 1;  # End of module.
 501