rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2007 Best Practical Solutions, LLC
   6 #                                          <jesse@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/copyleft/gpl.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48 =head1 NAME
  49
  50 RT::I18N - a base class for localization of RT
  51
  52 =cut
  53
  54 package RT::I18N;
  55
  56 use strict;
  57 use warnings;
  58
  59 use Locale::Maketext 1.04;
  60 use Locale::Maketext::Lexicon 0.25;
  61 use base ('Locale::Maketext::Fuzzy');
  62
  63 use Encode;
  64 use MIME::Entity;
  65 use MIME::Head;
  66
  67 # I decree that this project's first language is English.
  68
  69 our %Lexicon = (
  70    'TEST_STRING' => 'Concrete Mixer',
  71
  72     '__Content-Type' => 'text/plain; charset=utf-8',
  73
  74   '_AUTO' => 1,
  75   # That means that lookup failures can't happen -- if we get as far
  76   #  as looking for something in this lexicon, and we don't find it,
  77   #  then automagically set $Lexicon{$key} = $key, before possibly
  78   #  compiling it.
  79
  80   # The exception is keys that start with "_" -- they aren't auto-makeable.
  81
  82 );
  83 # End of lexicon.
  84
  85 =head2 Init
  86
  87 Initializes the lexicons used for localization.
  88
  89 =begin testing
  90
  91 use_ok (RT::I18N);
  92 ok(RT::I18N->Init);
  93
  94 =end testing
  95
  96 =cut
  97
  98 sub Init {
  99     require File::Glob;
 100
 101     # Load language-specific functions
 102     foreach my $language ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm")) {
 103         if ($language =~ /^([-\w\s.\/\\~:]+)$/) {
 104             require $1;
 105         }
 106         else {
 107             warn("$language is tainted. not loading");
 108         }
 109     }
 110
 111     my @lang = @RT::LexiconLanguages;
 112     @lang = ('*') unless @lang;
 113
 114     # Acquire all .po files and iterate them into lexicons
 115     Locale::Maketext::Lexicon->import({
 116         _decode => 1, map {
 117             $_  => [
 118                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
 119                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
 120                 Gettext => "$RT::LocalLexiconPath/$_.po",
 121             ],
 122         } @lang
 123     });
 124
 125     return 1;
 126 }
 127
 128 =head2 encoding
 129
  130 Returns the encoding of the current lexicon. The value is hard-coded: this method
  131 always returns 'utf-8' and does not consult the __Content-Type "charset" field.
 132
 133 =begin testing
 134
 135 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 136 ok(UNIVERSAL::can($chinese, 'maketext'));
 137 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 138 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 139
 140 ok(my $en = RT::I18N->get_handle('en'));
 141 ok(UNIVERSAL::can($en, 'maketext'));
 142 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 143
 144 =end testing
 145
 146
 147 =cut
 148
 149
 150 sub encoding { 'utf-8' }
 151
 152 # {{{ SetMIMEEntityToUTF8
 153
 154 =head2 SetMIMEEntityToUTF8 $entity
 155
  156 A utility method which will try to convert the entity body into UTF-8.
  157 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
 158
 159 =cut
 160
 161 sub SetMIMEEntityToUTF8 {
 162     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 163 }
 164
 165 # }}}
 166
 167 # {{{ SetMIMEEntityToEncoding
 168
 169 =head2 SetMIMEEntityToEncoding $entity, $encoding
 170
  171 A utility method which will try to convert the entity body into the specified
  172 charset encoding (encoded as octets, *not* unicode-strings).  It will
  173 iterate over all the entities in $entity, and try to convert each one into the
  174 specified charset if its Content-Type is 'text/plain' or 'message/rfc822'.
 175
 176 This method doesn't return anything meaningful.
 177
 178 =cut
 179
 180 sub SetMIMEEntityToEncoding {
 181     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 182
 183     # do the same for parts first of all
 184     SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 185
 186     my $charset = _FindOrGuessCharset($entity) or return;
 187     # one and only normalization
 188     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 189     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 190
 191     SetMIMEHeadToEncoding(
 192         $entity->head,
 193         _FindOrGuessCharset($entity, 1) => $enc,
 194         $preserve_words
 195     );
 196
 197     my $head = $entity->head;
 198
 199     # convert at least MIME word encoded attachment filename
 200     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 201         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 202             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 203         }
 204     }
 205
 206     # If this is a textual entity, we'd need to preserve its original encoding
 207     $head->add( "X-RT-Original-Encoding" => $charset )
 208         if $head->mime_attr('content-type.charset') or $head->mime_type =~ /^text/;
 209
 210
 211     return unless ( $head->mime_type =~ qr{^(text/plain|message/rfc822)$}i  );
 212
 213
 214     my $body = $entity->bodyhandle;
 215
 216     if ( $enc ne $charset && $body) {
 217         my @lines = $body->as_lines or return;
 218
 219         # {{{ Convert the body
 220         eval {
 221             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". ($head->get('subject') || 'Subjectless message'));
 222
 223             # NOTE:: see the comments at the end of the sub.
 224             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 225             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 226         };
 227
 228         if ($@) {
 229             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 230             eval {
 231                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 232             };
 233             if ($@) {
 234                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 235             }
 236         }
 237         # }}}
 238
 239         my $new_body = MIME::Body::InCore->new( \@lines );
 240
 241         # set up the new entity
 242         $head->mime_attr( "content-type" => 'text/plain' )
 243           unless ( $head->mime_attr("content-type") );
 244         $head->mime_attr( "content-type.charset" => $enc );
 245         $entity->bodyhandle($new_body);
 246     }
 247 }
 248
 249 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 250 #
 251 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 252 #
 253 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 254 # ... The data in $octets must be encoded as octets and not as
 255 # characters in Perl's internal format. ...
 256 #
 257 # Not turning off the UTF-8 flag in the string will prevent the string
 258 # from conversion.
 259
 260 # }}}
 261
 262 # {{{ DecodeMIMEWordsToUTF8
 263
 264 =head2 DecodeMIMEWordsToUTF8 $raw
 265
  266 A utility method which mimics MIME::Words::decode_mimewords, but with only
  267 limited functionality.  This function returns a utf-8 string.
 268
 269 It returns the decoded string, or the original string if it's not
 270 encoded.  Since the subroutine converts specified string into utf-8
 271 charset, it should not alter a subject written in English.
 272
 273 Why not use MIME::Words directly?  Because it fails in RT when I
 274 tried.  Maybe it's ok now.
 275
 276 =cut
 277
 278 sub DecodeMIMEWordsToUTF8 {
 279     my $str = shift;
 280     DecodeMIMEWordsToEncoding($str, 'utf-8');
 281 }
 282
 283 sub DecodeMIMEWordsToEncoding {
 284     my $str = shift;
 285     my $enc = shift;
 286
 287     @_ = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gc;
 288     return ($str) unless (@_);
 289
 290     # add everything that hasn't matched to the end of the latest
 291     # string in array this happen when we have 'key="=?encoded?="; key="plain"'
 292     $_[-1] .= substr($str, pos $str);
 293
 294     $str = "";
 295     while (@_) {
 296         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 297             (shift, shift, lc shift, shift, shift);
 298
 299         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 300
 301         if ( $encoding eq 'q' ) {
 302             use MIME::QuotedPrint;
 303             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 304             $enc_str = decode_qp($enc_str);
 305         } elsif ( $encoding eq 'b' ) {
 306             use MIME::Base64;
 307             $enc_str = decode_base64($enc_str);
 308         } else {
 309             $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
 310             ."only Q(uoted-printable) and B(ase64) are supported");
 311         }
 312
 313         # now we have got a decoded subject, try to convert into the encoding
 314         unless ($charset eq $enc) {
 315             eval { Encode::from_to($enc_str, $charset,  $enc) };
 316             if ($@) {
 317                 $charset = _GuessCharset( $enc_str );
 318                 Encode::from_to($enc_str, $charset, $enc);
 319             }
 320         }
 321
 322         # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 323         # We _should_ be preserving them encoded until after parsing is completed and
 324         # THEN undo the mime-encoding.
 325         #
 326         # This routine should be translating the existing mimeencoding to utf8 but leaving
 327         # things encoded.
 328         #
 329         # It's legal for headers to contain mime-encoded commas and semicolons which
 330         # should not be treated as address separators. (Encoding == quoting here)
 331         #
 332         # until this is fixed, we must escape any string containing a comma or semicolon
 333         # this is only a bandaid
 334
 335         $enc_str = qq{"$enc_str"} if ($enc_str =~ /[,;]/);
 336         $str .= $prefix . $enc_str . $trailing;
 337     }
 338
 339     # We might have \n without trailing whitespace, which will result in
 340     # invalid headers.
 341     $str =~ s/\n//g;
 342
 343     return ($str)
 344 }
 345
 346 # }}}
 347
 348 # {{{ _FindOrGuessCharset
 349
 350 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 351
  352 When handed a MIME::Entity, will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out.
 353
 354 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 355
 356 =cut
 357
 358 sub _FindOrGuessCharset {
 359     my $entity = shift;
 360     my $head_only = shift;
 361     my $head = $entity->head;
 362
 363     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 364         return $charset;
 365     }
 366
 367     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 368         my $body = $entity->bodyhandle or return;
 369         return _GuessCharset( $body->as_string );
 370     }
 371     else {
 372         # potentially binary data -- don't guess the body
 373         return _GuessCharset( $head->as_string );
 374     }
 375 }
 376
 377 # }}}
 378
 379 # {{{ _GuessCharset
 380
 381 =head2 _GuessCharset STRING
 382
  383 Uses Encode::Guess to try to figure out the string's encoding.
 384
 385 =cut
 386
 387 sub _GuessCharset {
 388     my $fallback = 'iso-8859-1';
 389     my $charset;
 390
 391     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 392         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 393         my $decoder = Encode::Guess->guess( $_[0] );
 394
 395         if ( ref $decoder ) {
 396             $charset = $decoder->name;
 397             $RT::Logger->debug("Guessed encoding: $charset");
 398             return $charset;
 399         }
 400         elsif ($decoder =~ /(\S+ or .+)/) {
 401             my %matched = map { $_ => 1 } split(/ or /, $1);
 402             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 403
 404             foreach my $suspect (@RT::EmailInputEncodings) {
 405                 next unless $matched{$suspect};
 406                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 407                 $charset = $suspect;
 408                 last;
 409             }
 410         }
 411         else {
 412             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 413         }
 414     }
 415     else {
 416         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 417     }
 418
 419     return($charset || $fallback);
 420 }
 421
 422 # }}}
 423
 424 # {{{ SetMIMEHeadToEncoding
 425
 426 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 427
 428 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 429 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 430 all the time
 431
 432
 433 =cut
 434
 435 sub SetMIMEHeadToEncoding {
 436     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 437
 438     $charset = 'utf-8' if $charset eq 'utf8';
 439     $enc     = 'utf-8' if $enc     eq 'utf8';
 440
 441     return if $charset eq $enc and $preserve_words;
 442
 443     foreach my $tag ( $head->tags ) {
 444         next unless $tag; # seen in wild: headers with no name
 445         my @values = $head->get_all($tag);
 446         $head->delete($tag);
 447         foreach my $value (@values) {
 448             if ( $charset ne $enc ) {
 449
 450                 eval {
 451                     Encode::_utf8_off($value);
 452                     Encode::from_to( $value, $charset => $enc );
 453                 };
 454                 if ($@) {
 455                     $RT::Logger->error( "Encoding error: " . $@
 456                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 457                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 458                     if ($@) {
 459                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 460                     }
 461                 }
 462             }
 463             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 464             $head->add( $tag, $value );
 465         }
 466     }
 467
 468 }
 469 # }}}
 470
 471 eval "require RT::I18N_Vendor";
 472 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 473 eval "require RT::I18N_Local";
 474 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 475
 476 1;  # End of module.
 477