rt/lib/RT/I18N.pm

   1 # BEGIN LICENSE BLOCK
   2 #
   3 # Copyright (c) 1996-2003 Jesse Vincent <jesse@bestpractical.com>
   4 #
   5 # (Except where explictly superceded by other copyright notices)
   6 #
   7 # This work is made available to you under the terms of Version 2 of
   8 # the GNU General Public License. A copy of that license should have
   9 # been provided with this software, but in any event can be snarfed
  10 # from www.gnu.org.
  11 #
  12 # This work is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # Unless otherwise specified, all modifications, corrections or
  18 # extensions to this work which alter its source code become the
  19 # property of Best Practical Solutions, LLC when submitted for
  20 # inclusion in the work.
  21 #
  22 #
  23 # END LICENSE BLOCK
  24 =head1 NAME
  25
  26 RT::I18N - a base class for localization of RT
  27
  28 =cut
  29
  30 package RT::I18N;
  31
  32 use strict;
  33 use Locale::Maketext 1.04;
  34 use Locale::Maketext::Lexicon 0.25;
  35 use base ('Locale::Maketext::Fuzzy');
  36 use vars qw( %Lexicon );
  37
  38 #If we're running on 5.6, we desperately need Encode::compat. But if we're on 5.8, we don't really need it.
  39 BEGIN { if ($] < 5.007001) {
  40 require Encode::compat;
  41 } }
  42 use Encode;
  43
  44 use MIME::Entity;
  45 use MIME::Head;
  46
  47 # I decree that this project's first language is English.
  48
  49 %Lexicon = (
  50    'TEST_STRING' => 'Concrete Mixer',
  51
  52     '__Content-Type' => 'text/plain; charset=utf-8',
  53
  54   '_AUTO' => 1,
  55   # That means that lookup failures can't happen -- if we get as far
  56   #  as looking for something in this lexicon, and we don't find it,
  57   #  then automagically set $Lexicon{$key} = $key, before possibly
  58   #  compiling it.
  59
  60   # The exception is keys that start with "_" -- they aren't auto-makeable.
  61
  62 );
  63 # End of lexicon.
  64
  65 =head2 Init
  66
  67 Initializes the lexicons used for localization.
  68
  69 =begin testing
  70
  71 use_ok (RT::I18N);
  72 ok(RT::I18N->Init);
  73
  74 =end testing
  75
  76 =cut
  77
  78 sub Init {
  79     # Load language-specific functions
  80     foreach my $language ( glob(substr(__FILE__, 0, -3) . "/*.pm")) {
  81         if ($language =~ /^([-\w.\/\\~:]+)$/) {
  82             require $1;
  83         }
  84         else {
  85             warn("$language is tainted. not loading");
  86         }
  87     }
  88
  89     my @lang = @RT::LexiconLanguages;
  90     @lang = ('*') unless @lang;
  91
  92     # Acquire all .po files and iterate them into lexicons
  93     Locale::Maketext::Lexicon->import({
  94         _decode => 1, map {
  95             $_  => [
  96                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
  97                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
  98             ],
  99         } @lang
 100     });
 101
 102     return 1;
 103 }
 104
 105 =head2 encoding
 106
 107 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 108 If it can't find anything, it returns 'ISO-8859-1'
 109
 110 =begin testing
 111
 112 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 113 ok(UNIVERSAL::can($chinese, 'maketext'));
 114 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 115 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 116
 117 ok(my $en = RT::I18N->get_handle('en'));
 118 ok(UNIVERSAL::can($en, 'maketext'));
 119 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 120
 121 =end testing
 122
 123
 124 =cut
 125
 126
 127 sub encoding { 'utf-8' }
 128
 129 # {{{ SetMIMEEntityToUTF8
 130
 131 =head2 SetMIMEEntityToUTF8 $entity
 132
 133 An utility method which will try to convert entity body into utf8.
 134 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 135
 136 =cut
 137
 138 sub SetMIMEEntityToUTF8 {
 139     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 140 }
 141
 142 # }}}
 143
 144 # {{{ SetMIMEEntityToEncoding
 145
 146 =head2 SetMIMEEntityToEncoding $entity, $encoding
 147
 148 An utility method which will try to convert entity body into specified
 149 charset encoding (encoded as octets, *not* unicode-strings).  It will
 150 iterate all the entities in $entity, and try to convert each one into
 151 specified charset if whose Content-Type is 'text/plain'.
 152
 153 This method doesn't return anything meaningful.
 154
 155 =cut
 156
 157 sub SetMIMEEntityToEncoding {
 158     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 159
 160     #if ( $entity->is_multipart ) {
 161     #$RT::Logger->crit("This entity is a multipart " . $entity->head->as_string);
 162         SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 163     #}
 164
 165     my $charset = _FindOrGuessCharset($entity) or return;
 166     # one and only normalization
 167     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 168     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 169
 170     SetMIMEHeadToEncoding(
 171         $entity->head,
 172         _FindOrGuessCharset($entity, 1) => $enc,
 173         $preserve_words
 174     );
 175
 176     my $head = $entity->head;
 177
 178     # convert at least MIME word encoded attachment filename
 179     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 180         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 181             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 182         }
 183     }
 184
 185     # If this is a textual entity, we'd need to preserve its original encoding
 186     $head->add( "X-RT-Original-Encoding" => $charset )
 187         if $head->mime_attr('content-type.charset') or $head->mime_type =~ /^text/;
 188
 189
 190     return unless ( $head->mime_type =~ qr{^(text/plain|message/rfc822)$}i  );
 191
 192
 193     my $body = $entity->bodyhandle;
 194
 195     if ( $enc ne $charset && $body) {
 196         my @lines = $body->as_lines or return;
 197
 198         # {{{ Convert the body
 199         eval {
 200             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". $head->get('subject'));
 201
 202             # NOTE:: see the comments at the end of the sub.
 203             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 204             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 205         };
 206
 207         if ($@) {
 208             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 209             eval {
 210                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 211             };
 212             if ($@) {
 213                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 214             }
 215         }
 216         # }}}
 217
 218         my $new_body = MIME::Body::InCore->new( \@lines );
 219
 220         # set up the new entity
 221         $head->mime_attr( "content-type" => 'text/plain' )
 222           unless ( $head->mime_attr("content-type") );
 223         $head->mime_attr( "content-type.charset" => $enc );
 224         $entity->bodyhandle($new_body);
 225     }
 226 }
 227
 228 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 229 #
 230 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 231 #
 232 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 233 # ... The data in $octets must be encoded as octets and not as
 234 # characters in Perl's internal format. ...
 235 #
 236 # Not turning off the UTF-8 flag in the string will prevent the string
 237 # from conversion.
 238
 239 # }}}
 240
 241 # {{{ DecodeMIMEWordsToUTF8
 242
 243 =head2 DecodeMIMEWordsToUTF8 $raw
 244
 245 An utility method which mimics MIME::Words::decode_mimewords, but only
 246 limited functionality.  This function returns an utf-8 string.
 247
 248 It returns the decoded string, or the original string if it's not
 249 encoded.  Since the subroutine converts specified string into utf-8
 250 charset, it should not alter a subject written in English.
 251
 252 Why not use MIME::Words directly?  Because it fails in RT when I
 253 tried.  Maybe it's ok now.
 254
 255 =cut
 256
 257 sub DecodeMIMEWordsToUTF8 {
 258     my $str = shift;
 259     DecodeMIMEWordsToEncoding($str, 'utf-8');
 260 }
 261
 262 sub DecodeMIMEWordsToEncoding {
 263     my $str = shift;
 264     my $enc = shift;
 265
 266
 267     @_ = $str =~ m/([^=]*)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/g;
 268
 269     return ($str) unless (@_);
 270
 271     $str = "";
 272     while (@_) {
 273         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 274             (shift, shift, shift, shift, shift);
 275
 276         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 277
 278         if ($encoding eq 'Q' or $encoding eq 'q') {
 279             use MIME::QuotedPrint;
 280             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 281             $enc_str = decode_qp($enc_str);
 282         } elsif ($encoding eq 'B' or $encoding eq 'b') {
 283             use MIME::Base64;
 284             $enc_str = decode_base64($enc_str);
 285         } else {
 286             $RT::Logger->warning("RT::I18N::DecodeMIMEWordsToCharset got a " .
 287                               "strange encoding: $encoding.");
 288         }
 289
 290         # now we have got a decoded subject, try to convert into the encoding
 291         unless ($charset eq $enc) {
 292             eval { Encode::from_to($enc_str, $charset,  $enc) };
 293             if ($@) {
 294                 $charset = _GuessCharset( $enc_str );
 295                 Encode::from_to($enc_str, $charset, $enc);
 296             }
 297         }
 298
 299         $str .= $prefix . $enc_str . $trailing;
 300     }
 301
 302     return ($str)
 303 }
 304
 305 # }}}
 306
 307 # {{{ _FindOrGuessCharset
 308
 309 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 310
 311 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 312
 313 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 314
 315 =cut
 316
 317 sub _FindOrGuessCharset {
 318     my $entity = shift;
 319     my $head_only = shift;
 320     my $head = $entity->head;
 321
 322     if ($head->mime_attr("content-type.charset")) {
 323         return $head->mime_attr("content-type.charset");
 324     }
 325
 326     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 327         my $body = $entity->bodyhandle or return;
 328         return _GuessCharset( $body->as_string );
 329     }
 330     else {
 331         # potentially binary data -- don't guess the body
 332         return _GuessCharset( $head->as_string );
 333     }
 334 }
 335
 336 # }}}
 337
 338
 339 # {{{ _GuessCharset
 340
 341 =head2 _GuessCharset STRING
 342
 343 use Encode::Guess to try to figure it out the string's encoding.
 344
 345 =cut
 346
 347 sub _GuessCharset {
 348     my $fallback = 'iso-8859-1';
 349     my $charset;
 350
 351     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 352         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 353         my $decoder = Encode::Guess->guess( $_[0] );
 354
 355         if ( ref $decoder ) {
 356             $charset = $decoder->name;
 357             $RT::Logger->debug("Guessed encoding: $charset");
 358             return $charset;
 359         }
 360         elsif ($decoder =~ /(\S+ or .+)/) {
 361             my %matched = map { $_ => 1 } split(/ or /, $1);
 362             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 363
 364             foreach my $suspect (@RT::EmailInputEncodings) {
 365                 next unless $matched{$suspect};
 366                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 367                 $charset = $suspect;
 368                 last;
 369             }
 370         }
 371         else {
 372             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 373         }
 374     }
 375     else {
 376         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 377     }
 378
 379     return($charset || $fallback);
 380 }
 381
 382 # }}}
 383
 384 # {{{ SetMIMEHeadToEncoding
 385
 386 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 387
 388 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 389 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 390 all the time
 391
 392
 393 =cut
 394
 395 sub SetMIMEHeadToEncoding {
 396     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 397
 398     $charset = 'utf-8' if $charset eq 'utf8';
 399     $enc     = 'utf-8' if $enc     eq 'utf8';
 400
 401     return if $charset eq $enc and $preserve_words;
 402
 403     foreach my $tag ( $head->tags ) {
 404         my @values = $head->get_all($tag);
 405         $head->delete($tag);
 406         foreach my $value (@values) {
 407             if ( $charset ne $enc ) {
 408
 409                 eval {
 410                     Encode::_utf8_off($value);
 411                     Encode::from_to( $value, $charset => $enc );
 412                 };
 413                 if ($@) {
 414                     $RT::Logger->error( "Encoding error: " . $@
 415                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 416                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 417                     if ($@) {
 418                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 419                     }
 420                 }
 421             }
 422             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 423             $head->add( $tag, $value );
 424         }
 425     }
 426
 427 }
 428 # }}}
 429
 430 eval "require RT::I18N_Vendor";
 431 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 432 eval "require RT::I18N_Local";
 433 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 434
 435 1;  # End of module.
 436