rt/lib/RT/I18N.pm

   1 # {{{ BEGIN BPS TAGGED BLOCK
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2004 Best Practical Solutions, LLC
   6 #                                          <jesse@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  26 #
  27 #
  28 # CONTRIBUTION SUBMISSION POLICY:
  29 #
  30 # (The following paragraph is not intended to limit the rights granted
  31 # to you to modify and distribute this software under the terms of
  32 # the GNU General Public License and is only of importance to you if
  33 # you choose to contribute your changes and enhancements to the
  34 # community by submitting them to Best Practical Solutions, LLC.)
  35 #
  36 # By intentionally submitting any modifications, corrections or
  37 # derivatives to this work, or any other work intended for use with
  38 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  39 # you are the copyright holder for those contributions and you grant
  40 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  41 # royalty-free, perpetual, license to use, copy, create derivative
  42 # works based on those contributions, and sublicense and distribute
  43 # those contributions and any derivatives thereof.
  44 #
  45 # }}} END BPS TAGGED BLOCK
  46 =head1 NAME
  47
  48 RT::I18N - a base class for localization of RT
  49
  50 =cut
  51
  52 package RT::I18N;
  53
  54 use strict;
  55 use Locale::Maketext 1.04;
  56 use Locale::Maketext::Lexicon 0.25;
  57 use base ('Locale::Maketext::Fuzzy');
  58 use vars qw( %Lexicon );
  59
  60 #If we're running on 5.6, we desperately need Encode::compat. But if we're on 5.8, we don't really need it.
  61 BEGIN { if ($] < 5.007001) {
  62 require Encode::compat;
  63 } }
  64 use Encode;
  65
  66 use MIME::Entity;
  67 use MIME::Head;
  68
  69 # I decree that this project's first language is English.
  70
  71 %Lexicon = (
  72    'TEST_STRING' => 'Concrete Mixer',
  73
  74     '__Content-Type' => 'text/plain; charset=utf-8',
  75
  76   '_AUTO' => 1,
  77   # That means that lookup failures can't happen -- if we get as far
  78   #  as looking for something in this lexicon, and we don't find it,
  79   #  then automagically set $Lexicon{$key} = $key, before possibly
  80   #  compiling it.
  81
  82   # The exception is keys that start with "_" -- they aren't auto-makeable.
  83
  84 );
  85 # End of lexicon.
  86
  87 =head2 Init
  88
  89 Initializes the lexicons used for localization.
  90
  91 =begin testing
  92
  93 use_ok (RT::I18N);
  94 ok(RT::I18N->Init);
  95
  96 =end testing
  97
  98 =cut
  99
 100 sub Init {
 101     # Load language-specific functions
 102     foreach my $language ( glob(substr(__FILE__, 0, -3) . "/*.pm")) {
 103         if ($language =~ /^([-\w.\/\\~:]+)$/) {
 104             require $1;
 105         }
 106         else {
 107             warn("$language is tainted. not loading");
 108         }
 109     }
 110
 111     my @lang = @RT::LexiconLanguages;
 112     @lang = ('*') unless @lang;
 113
 114     # Acquire all .po files and iterate them into lexicons
 115     Locale::Maketext::Lexicon->import({
 116         _decode => 1, map {
 117             $_  => [
 118                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
 119                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
 120             ],
 121         } @lang
 122     });
 123
 124     return 1;
 125 }
 126
 127 =head2 encoding
 128
 129 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 130 If it can't find anything, it returns 'ISO-8859-1'
 131
 132 =begin testing
 133
 134 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 135 ok(UNIVERSAL::can($chinese, 'maketext'));
 136 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 137 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 138
 139 ok(my $en = RT::I18N->get_handle('en'));
 140 ok(UNIVERSAL::can($en, 'maketext'));
 141 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 142
 143 =end testing
 144
 145
 146 =cut
 147
 148
 149 sub encoding { 'utf-8' }
 150
 151 # {{{ SetMIMEEntityToUTF8
 152
 153 =head2 SetMIMEEntityToUTF8 $entity
 154
 155 An utility method which will try to convert entity body into utf8.
 156 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 157
 158 =cut
 159
 160 sub SetMIMEEntityToUTF8 {
 161     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 162 }
 163
 164 # }}}
 165
 166 # {{{ SetMIMEEntityToEncoding
 167
 168 =head2 SetMIMEEntityToEncoding $entity, $encoding
 169
 170 An utility method which will try to convert entity body into specified
 171 charset encoding (encoded as octets, *not* unicode-strings).  It will
 172 iterate all the entities in $entity, and try to convert each one into
 173 specified charset if whose Content-Type is 'text/plain'.
 174
 175 This method doesn't return anything meaningful.
 176
 177 =cut
 178
 179 sub SetMIMEEntityToEncoding {
 180     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 181
 182     #if ( $entity->is_multipart ) {
 183     #$RT::Logger->crit("This entity is a multipart " . $entity->head->as_string);
 184         SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 185     #}
 186
 187     my $charset = _FindOrGuessCharset($entity) or return;
 188     # one and only normalization
 189     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 190     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 191
 192     SetMIMEHeadToEncoding(
 193         $entity->head,
 194         _FindOrGuessCharset($entity, 1) => $enc,
 195         $preserve_words
 196     );
 197
 198     my $head = $entity->head;
 199
 200     # convert at least MIME word encoded attachment filename
 201     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 202         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 203             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 204         }
 205     }
 206
 207     # If this is a textual entity, we'd need to preserve its original encoding
 208     $head->add( "X-RT-Original-Encoding" => $charset )
 209         if $head->mime_attr('content-type.charset') or $head->mime_type =~ /^text/;
 210
 211
 212     return unless ( $head->mime_type =~ qr{^(text/plain|message/rfc822)$}i  );
 213
 214
 215     my $body = $entity->bodyhandle;
 216
 217     if ( $enc ne $charset && $body) {
 218         my @lines = $body->as_lines or return;
 219
 220         # {{{ Convert the body
 221         eval {
 222             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". $head->get('subject'));
 223
 224             # NOTE:: see the comments at the end of the sub.
 225             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 226             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 227         };
 228
 229         if ($@) {
 230             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 231             eval {
 232                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 233             };
 234             if ($@) {
 235                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 236             }
 237         }
 238         # }}}
 239
 240         my $new_body = MIME::Body::InCore->new( \@lines );
 241
 242         # set up the new entity
 243         $head->mime_attr( "content-type" => 'text/plain' )
 244           unless ( $head->mime_attr("content-type") );
 245         $head->mime_attr( "content-type.charset" => $enc );
 246         $entity->bodyhandle($new_body);
 247     }
 248 }
 249
 250 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 251 #
 252 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 253 #
 254 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 255 # ... The data in $octets must be encoded as octets and not as
 256 # characters in Perl's internal format. ...
 257 #
 258 # Not turning off the UTF-8 flag in the string will prevent the string
 259 # from conversion.
 260
 261 # }}}
 262
 263 # {{{ DecodeMIMEWordsToUTF8
 264
 265 =head2 DecodeMIMEWordsToUTF8 $raw
 266
 267 An utility method which mimics MIME::Words::decode_mimewords, but only
 268 limited functionality.  This function returns an utf-8 string.
 269
 270 It returns the decoded string, or the original string if it's not
 271 encoded.  Since the subroutine converts specified string into utf-8
 272 charset, it should not alter a subject written in English.
 273
 274 Why not use MIME::Words directly?  Because it fails in RT when I
 275 tried.  Maybe it's ok now.
 276
 277 =cut
 278
 279 sub DecodeMIMEWordsToUTF8 {
 280     my $str = shift;
 281     DecodeMIMEWordsToEncoding($str, 'utf-8');
 282 }
 283
 284 sub DecodeMIMEWordsToEncoding {
 285     my $str = shift;
 286     my $enc = shift;
 287
 288
 289     @_ = $str =~ m/([^=]*)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/g;
 290
 291     return ($str) unless (@_);
 292
 293     $str = "";
 294     while (@_) {
 295         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 296             (shift, shift, shift, shift, shift);
 297
 298         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 299
 300         if ($encoding eq 'Q' or $encoding eq 'q') {
 301             use MIME::QuotedPrint;
 302             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 303             $enc_str = decode_qp($enc_str);
 304         } elsif ($encoding eq 'B' or $encoding eq 'b') {
 305             use MIME::Base64;
 306             $enc_str = decode_base64($enc_str);
 307         } else {
 308             $RT::Logger->warning("RT::I18N::DecodeMIMEWordsToCharset got a " .
 309                               "strange encoding: $encoding.");
 310         }
 311
 312         # now we have got a decoded subject, try to convert into the encoding
 313         unless ($charset eq $enc) {
 314             eval { Encode::from_to($enc_str, $charset,  $enc) };
 315             if ($@) {
 316                 $charset = _GuessCharset( $enc_str );
 317                 Encode::from_to($enc_str, $charset, $enc);
 318             }
 319         }
 320
 321         $str .= $prefix . $enc_str . $trailing;
 322     }
 323
 324     return ($str)
 325 }
 326
 327 # }}}
 328
 329 # {{{ _FindOrGuessCharset
 330
 331 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 332
 333 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 334
 335 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 336
 337 =cut
 338
 339 sub _FindOrGuessCharset {
 340     my $entity = shift;
 341     my $head_only = shift;
 342     my $head = $entity->head;
 343
 344     if ($head->mime_attr("content-type.charset")) {
 345         return $head->mime_attr("content-type.charset");
 346     }
 347
 348     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 349         my $body = $entity->bodyhandle or return;
 350         return _GuessCharset( $body->as_string );
 351     }
 352     else {
 353         # potentially binary data -- don't guess the body
 354         return _GuessCharset( $head->as_string );
 355     }
 356 }
 357
 358 # }}}
 359
 360
 361 # {{{ _GuessCharset
 362
 363 =head2 _GuessCharset STRING
 364
 365 use Encode::Guess to try to figure it out the string's encoding.
 366
 367 =cut
 368
 369 sub _GuessCharset {
 370     my $fallback = 'iso-8859-1';
 371     my $charset;
 372
 373     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 374         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 375         my $decoder = Encode::Guess->guess( $_[0] );
 376
 377         if ( ref $decoder ) {
 378             $charset = $decoder->name;
 379             $RT::Logger->debug("Guessed encoding: $charset");
 380             return $charset;
 381         }
 382         elsif ($decoder =~ /(\S+ or .+)/) {
 383             my %matched = map { $_ => 1 } split(/ or /, $1);
 384             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 385
 386             foreach my $suspect (@RT::EmailInputEncodings) {
 387                 next unless $matched{$suspect};
 388                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 389                 $charset = $suspect;
 390                 last;
 391             }
 392         }
 393         else {
 394             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 395         }
 396     }
 397     else {
 398         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 399     }
 400
 401     return($charset || $fallback);
 402 }
 403
 404 # }}}
 405
 406 # {{{ SetMIMEHeadToEncoding
 407
 408 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 409
 410 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 411 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 412 all the time
 413
 414
 415 =cut
 416
 417 sub SetMIMEHeadToEncoding {
 418     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 419
 420     $charset = 'utf-8' if $charset eq 'utf8';
 421     $enc     = 'utf-8' if $enc     eq 'utf8';
 422
 423     return if $charset eq $enc and $preserve_words;
 424
 425     foreach my $tag ( $head->tags ) {
 426         my @values = $head->get_all($tag);
 427         $head->delete($tag);
 428         foreach my $value (@values) {
 429             if ( $charset ne $enc ) {
 430
 431                 eval {
 432                     Encode::_utf8_off($value);
 433                     Encode::from_to( $value, $charset => $enc );
 434                 };
 435                 if ($@) {
 436                     $RT::Logger->error( "Encoding error: " . $@
 437                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 438                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 439                     if ($@) {
 440                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 441                     }
 442                 }
 443             }
 444             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 445             $head->add( $tag, $value );
 446         }
 447     }
 448
 449 }
 450 # }}}
 451
 452 eval "require RT::I18N_Vendor";
 453 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 454 eval "require RT::I18N_Local";
 455 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 456
 457 1;  # End of module.
 458