rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2005 Best Practical Solutions, LLC
   6 #                                          <jesse@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  26 #
  27 #
  28 # CONTRIBUTION SUBMISSION POLICY:
  29 #
  30 # (The following paragraph is not intended to limit the rights granted
  31 # to you to modify and distribute this software under the terms of
  32 # the GNU General Public License and is only of importance to you if
  33 # you choose to contribute your changes and enhancements to the
  34 # community by submitting them to Best Practical Solutions, LLC.)
  35 #
  36 # By intentionally submitting any modifications, corrections or
  37 # derivatives to this work, or any other work intended for use with
  38 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  39 # you are the copyright holder for those contributions and you grant
  40 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  41 # royalty-free, perpetual, license to use, copy, create derivative
  42 # works based on those contributions, and sublicense and distribute
  43 # those contributions and any derivatives thereof.
  44 #
  45 # END BPS TAGGED BLOCK }}}
  46
  47 =head1 NAME
  48
  49 RT::I18N - a base class for localization of RT
  50
  51 =cut
  52
  53 package RT::I18N;
  54
  55 use strict;
  56 use Locale::Maketext 1.04;
  57 use Locale::Maketext::Lexicon 0.25;
  58 use base ('Locale::Maketext::Fuzzy');
  59 use vars qw( %Lexicon );
  60
  61 #If we're running on 5.6, we desperately need Encode::compat. But if we're on 5.8, we don't really need it.
  62 BEGIN { if ($] < 5.007001) {
  63 require Encode::compat;
  64 } }
  65 use Encode;
  66
  67 use MIME::Entity;
  68 use MIME::Head;
  69
  70 # I decree that this project's first language is English.
  71
  72 %Lexicon = (
  73    'TEST_STRING' => 'Concrete Mixer',
  74
  75     '__Content-Type' => 'text/plain; charset=utf-8',
  76
  77   '_AUTO' => 1,
  78   # That means that lookup failures can't happen -- if we get as far
  79   #  as looking for something in this lexicon, and we don't find it,
  80   #  then automagically set $Lexicon{$key} = $key, before possibly
  81   #  compiling it.
  82
  83   # The exception is keys that start with "_" -- they aren't auto-makeable.
  84
  85 );
  86 # End of lexicon.
  87
  88 =head2 Init
  89
  90 Initializes the lexicons used for localization.
  91
  92 =begin testing
  93
  94 use_ok (RT::I18N);
  95 ok(RT::I18N->Init);
  96
  97 =end testing
  98
  99 =cut
 100
 101 sub Init {
 102     require File::Glob;
 103
 104     # Load language-specific functions
 105     foreach my $language ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm")) {
 106         if ($language =~ /^([-\w\s.\/\\~:]+)$/) {
 107             require $1;
 108         }
 109         else {
 110             warn("$language is tainted. not loading");
 111         }
 112     }
 113
 114     my @lang = @RT::LexiconLanguages;
 115     @lang = ('*') unless @lang;
 116
 117     # Acquire all .po files and iterate them into lexicons
 118     Locale::Maketext::Lexicon->import({
 119         _decode => 1, map {
 120             $_  => [
 121                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
 122                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
 123                 Gettext => "$RT::LocalLexiconPath/$_.po",
 124             ],
 125         } @lang
 126     });
 127
 128     return 1;
 129 }
 130
 131 =head2 encoding
 132
  133 Returns the encoding of the current lexicon. This implementation always returns
  134 'utf-8'; it does not inspect the C<__Content-Type> entry's "charset" field.
 135
 136 =begin testing
 137
 138 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 139 ok(UNIVERSAL::can($chinese, 'maketext'));
 140 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 141 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 142
 143 ok(my $en = RT::I18N->get_handle('en'));
 144 ok(UNIVERSAL::can($en, 'maketext'));
 145 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 146
 147 =end testing
 148
 149
 150 =cut
 151
 152
 153 sub encoding { 'utf-8' }
 154
 155 # {{{ SetMIMEEntityToUTF8
 156
 157 =head2 SetMIMEEntityToUTF8 $entity
 158
  159 A utility method which will try to convert the entity body into utf8.
  160 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
 161
 162 =cut
 163
 164 sub SetMIMEEntityToUTF8 {
 165     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 166 }
 167
 168 # }}}
 169
 170 # {{{ SetMIMEEntityToEncoding
 171
 172 =head2 SetMIMEEntityToEncoding $entity, $encoding
 173
  174 A utility method which will try to convert the entity body into the specified
  175 charset encoding (encoded as octets, *not* unicode-strings).  It will
  176 iterate over all the entities in $entity, and try to convert each one into
  177 the specified charset if its Content-Type is 'text/plain' or 'message/rfc822'.
 178
 179 This method doesn't return anything meaningful.
 180
 181 =cut
 182
 183 sub SetMIMEEntityToEncoding {
 184     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 185
 186     #if ( $entity->is_multipart ) {
 187     #$RT::Logger->crit("This entity is a multipart " . $entity->head->as_string);
 188         SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 189     #}
 190
 191     my $charset = _FindOrGuessCharset($entity) or return;
 192     # one and only normalization
 193     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 194     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 195
 196     SetMIMEHeadToEncoding(
 197         $entity->head,
 198         _FindOrGuessCharset($entity, 1) => $enc,
 199         $preserve_words
 200     );
 201
 202     my $head = $entity->head;
 203
 204     # convert at least MIME word encoded attachment filename
 205     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 206         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 207             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 208         }
 209     }
 210
 211     # If this is a textual entity, we'd need to preserve its original encoding
 212     $head->add( "X-RT-Original-Encoding" => $charset )
 213         if $head->mime_attr('content-type.charset') or $head->mime_type =~ /^text/;
 214
 215
 216     return unless ( $head->mime_type =~ qr{^(text/plain|message/rfc822)$}i  );
 217
 218
 219     my $body = $entity->bodyhandle;
 220
 221     if ( $enc ne $charset && $body) {
 222         my @lines = $body->as_lines or return;
 223
 224         # {{{ Convert the body
 225         eval {
 226             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". ($head->get('subject') || 'Subjectless message'));
 227
 228             # NOTE:: see the comments at the end of the sub.
 229             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 230             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 231         };
 232
 233         if ($@) {
 234             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 235             eval {
 236                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 237             };
 238             if ($@) {
 239                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 240             }
 241         }
 242         # }}}
 243
 244         my $new_body = MIME::Body::InCore->new( \@lines );
 245
 246         # set up the new entity
 247         $head->mime_attr( "content-type" => 'text/plain' )
 248           unless ( $head->mime_attr("content-type") );
 249         $head->mime_attr( "content-type.charset" => $enc );
 250         $entity->bodyhandle($new_body);
 251     }
 252 }
 253
 254 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 255 #
 256 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 257 #
 258 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 259 # ... The data in $octets must be encoded as octets and not as
 260 # characters in Perl's internal format. ...
 261 #
 262 # Not turning off the UTF-8 flag in the string will prevent the string
 263 # from conversion.
 264
 265 # }}}
 266
 267 # {{{ DecodeMIMEWordsToUTF8
 268
 269 =head2 DecodeMIMEWordsToUTF8 $raw
 270
  271 A utility method which mimics MIME::Words::decode_mimewords, but with only
  272 limited functionality.  This function returns a utf-8 string.
 273
 274 It returns the decoded string, or the original string if it's not
 275 encoded.  Since the subroutine converts specified string into utf-8
 276 charset, it should not alter a subject written in English.
 277
 278 Why not use MIME::Words directly?  Because it fails in RT when I
 279 tried.  Maybe it's ok now.
 280
 281 =cut
 282
 283 sub DecodeMIMEWordsToUTF8 {
 284     my $str = shift;
 285     DecodeMIMEWordsToEncoding($str, 'utf-8');
 286 }
 287
 288 sub DecodeMIMEWordsToEncoding {
 289     my $str = shift;
 290     my $enc = shift;
 291
 292
 293     @_ = $str =~ m/([^=]*)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/g;
 294
 295     return ($str) unless (@_);
 296
 297     $str = "";
 298     while (@_) {
 299         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 300             (shift, shift, shift, shift, shift);
 301
 302         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 303
 304         if ($encoding eq 'Q' or $encoding eq 'q') {
 305             use MIME::QuotedPrint;
 306             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 307             $enc_str = decode_qp($enc_str);
 308         } elsif ($encoding eq 'B' or $encoding eq 'b') {
 309             use MIME::Base64;
 310             $enc_str = decode_base64($enc_str);
 311         } else {
 312             $RT::Logger->warning("RT::I18N::DecodeMIMEWordsToCharset got a " .
 313                               "strange encoding: $encoding.");
 314         }
 315
 316         # now we have got a decoded subject, try to convert into the encoding
 317         unless ($charset eq $enc) {
 318             eval { Encode::from_to($enc_str, $charset,  $enc) };
 319             if ($@) {
 320                 $charset = _GuessCharset( $enc_str );
 321                 Encode::from_to($enc_str, $charset, $enc);
 322             }
 323         }
 324
 325         # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 326         # We _should_ be preserving them encoded until after parsing is completed and
 327         # THEN undo the mime-encoding.
 328         #
 329         # This routine should be translating the existing mimeencoding to utf8 but leaving
 330         # things encoded.
 331         #
 332         # It's legal for headers to contain mime-encoded commas and semicolons which
 333         # should not be treated as address separators. (Encoding == quoting here)
 334         #
 335         # until this is fixed, we must escape any string containing a comma or semicolon
 336         # this is only a bandaid
 337
 338         $enc_str = qq{"$enc_str"} if ($enc_str =~ /[,;]/);
 339         $str .= $prefix . $enc_str . $trailing;
 340     }
 341
 342     return ($str)
 343 }
 344
 345 # }}}
 346
 347 # {{{ _FindOrGuessCharset
 348
 349 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 350
 351 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 352
 353 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 354
 355 =cut
 356
 357 sub _FindOrGuessCharset {
 358     my $entity = shift;
 359     my $head_only = shift;
 360     my $head = $entity->head;
 361
 362     if ($head->mime_attr("content-type.charset")) {
 363         return $head->mime_attr("content-type.charset");
 364     }
 365
 366     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 367         my $body = $entity->bodyhandle or return;
 368         return _GuessCharset( $body->as_string );
 369     }
 370     else {
 371         # potentially binary data -- don't guess the body
 372         return _GuessCharset( $head->as_string );
 373     }
 374 }
 375
 376 # }}}
 377
 378 # {{{ _GuessCharset
 379
 380 =head2 _GuessCharset STRING
 381
  382 Uses Encode::Guess to try to figure out the string's encoding.
 383
 384 =cut
 385
 386 sub _GuessCharset {
 387     my $fallback = 'iso-8859-1';
 388     my $charset;
 389
 390     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 391         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 392         my $decoder = Encode::Guess->guess( $_[0] );
 393
 394         if ( ref $decoder ) {
 395             $charset = $decoder->name;
 396             $RT::Logger->debug("Guessed encoding: $charset");
 397             return $charset;
 398         }
 399         elsif ($decoder =~ /(\S+ or .+)/) {
 400             my %matched = map { $_ => 1 } split(/ or /, $1);
 401             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 402
 403             foreach my $suspect (@RT::EmailInputEncodings) {
 404                 next unless $matched{$suspect};
 405                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 406                 $charset = $suspect;
 407                 last;
 408             }
 409         }
 410         else {
 411             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 412         }
 413     }
 414     else {
 415         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 416     }
 417
 418     return($charset || $fallback);
 419 }
 420
 421 # }}}
 422
 423 # {{{ SetMIMEHeadToEncoding
 424
 425 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 426
 427 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 428 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 429 all the time
 430
 431
 432 =cut
 433
 434 sub SetMIMEHeadToEncoding {
 435     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 436
 437     $charset = 'utf-8' if $charset eq 'utf8';
 438     $enc     = 'utf-8' if $enc     eq 'utf8';
 439
 440     return if $charset eq $enc and $preserve_words;
 441
 442     foreach my $tag ( $head->tags ) {
 443         next unless $tag; # seen in wild: headers with no name
 444         my @values = $head->get_all($tag);
 445         $head->delete($tag);
 446         foreach my $value (@values) {
 447             if ( $charset ne $enc ) {
 448
 449                 eval {
 450                     Encode::_utf8_off($value);
 451                     Encode::from_to( $value, $charset => $enc );
 452                 };
 453                 if ($@) {
 454                     $RT::Logger->error( "Encoding error: " . $@
 455                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 456                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 457                     if ($@) {
 458                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 459                     }
 460                 }
 461             }
 462             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 463             $head->add( $tag, $value );
 464         }
 465     }
 466
 467 }
 468 # }}}
 469
 470 eval "require RT::I18N_Vendor";
 471 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 472 eval "require RT::I18N_Local";
 473 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 474
 475 1;  # End of module.
 476