rt/lib/RT/I18N.pm

   1 # BEGIN LICENSE BLOCK
   2 #
   3 # Copyright (c) 1996-2003 Jesse Vincent <jesse@bestpractical.com>
   4 #
   5 # (Except where explictly superceded by other copyright notices)
   6 #
   7 # This work is made available to you under the terms of Version 2 of
   8 # the GNU General Public License. A copy of that license should have
   9 # been provided with this software, but in any event can be snarfed
  10 # from www.gnu.org.
  11 #
  12 # This work is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # Unless otherwise specified, all modifications, corrections or
  18 # extensions to this work which alter its source code become the
  19 # property of Best Practical Solutions, LLC when submitted for
  20 # inclusion in the work.
  21 #
  22 #
  23 # END LICENSE BLOCK
  24 =head1 NAME
  25
  26 RT::I18N - a base class for localization of RT
  27
  28 =cut
  29
  30 package RT::I18N;
  31
  32 use strict;
  33 use Locale::Maketext 1.04;
  34 use Locale::Maketext::Lexicon 0.25;
  35 use base ('Locale::Maketext::Fuzzy');
  36 use vars qw( %Lexicon );
  37
  38 #If we're running on 5.6, we desperately need Encode::compat. But if we're on 5.8, we don't really need it.
  39 BEGIN { if ($] < 5.007001) {
  40 require Encode::compat;
  41 } }
  42 use Encode;
  43
  44 use MIME::Entity;
  45 use MIME::Head;
  46
  47 # I decree that this project's first language is English.
  48
  49 %Lexicon = (
  50    'TEST_STRING' => 'Concrete Mixer',
  51
  52     '__Content-Type' => 'text/plain; charset=utf-8',
  53
  54   '_AUTO' => 1,
  55   # That means that lookup failures can't happen -- if we get as far
  56   #  as looking for something in this lexicon, and we don't find it,
  57   #  then automagically set $Lexicon{$key} = $key, before possibly
  58   #  compiling it.
  59
  60   # The exception is keys that start with "_" -- they aren't auto-makeable.
  61
  62 );
  63 # End of lexicon.
  64
  65 =head2 Init
  66
  67 Initializes the lexicons used for localization.
  68
  69 =begin testing
  70
  71 use_ok (RT::I18N);
  72 ok(RT::I18N->Init);
  73
  74 =end testing
  75
  76 =cut
  77
  78 sub Init {
  79     # Load language-specific functions
  80     foreach my $language ( glob(substr(__FILE__, 0, -3) . "/*.pm")) {
  81         if ($language =~ /^([-\w.\/\\~:]+)$/) {
  82             require $1;
  83         }
  84         else {
  85             warn("$language is tainted. not loading");
  86         }
  87     }
  88
  89     my @lang = @RT::LexiconLanguages;
  90     @lang = ('*') unless @lang;
  91
  92     # Acquire all .po files and iterate them into lexicons
  93     Locale::Maketext::Lexicon->import({
  94         _decode => 1, map {
  95             $_  => [
  96                 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
  97                 Gettext => "$RT::LocalLexiconPath/*/$_.po",
  98             ],
  99         } @lang
 100     });
 101
 102     return 1;
 103 }
 104
 105 =head2 encoding
 106
 107 Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
 108 If it can't find anything, it returns 'ISO-8859-1'
 109
 110 =begin testing
 111
 112 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
 113 ok(UNIVERSAL::can($chinese, 'maketext'));
 114 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
 115 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
 116
 117 ok(my $en = RT::I18N->get_handle('en'));
 118 ok(UNIVERSAL::can($en, 'maketext'));
 119 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
 120
 121 =end testing
 122
 123
 124 =cut
 125
 126
 127 sub encoding { 'utf-8' }
 128
 129 # {{{ SetMIMEEntityToUTF8
 130
 131 =head2 SetMIMEEntityToUTF8 $entity
 132
 133 An utility method which will try to convert entity body into utf8.
 134 It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
 135
 136 =cut
 137
 138 sub SetMIMEEntityToUTF8 {
 139     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 140 }
 141
 142 # }}}
 143
 144 # {{{ SetMIMEEntityToEncoding
 145
 146 =head2 SetMIMEEntityToEncoding $entity, $encoding
 147
 148 An utility method which will try to convert entity body into specified
 149 charset encoding (encoded as octets, *not* unicode-strings).  It will
 150 iterate all the entities in $entity, and try to convert each one into
 151 specified charset if whose Content-Type is 'text/plain'.
 152
 153 This method doesn't return anything meaningful.
 154
 155 =cut
 156
 157 sub SetMIMEEntityToEncoding {
 158     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 159
 160     #if ( $entity->is_multipart ) {
 161     #$RT::Logger->crit("This entity is a multipart " . $entity->head->as_string);
 162         SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 163     #}
 164
 165     my $charset = _FindOrGuessCharset($entity) or return;
 166     # one and only normalization
 167     $charset = 'utf-8' if $charset eq 'utf8';
 168     $enc     = 'utf-8' if $enc     eq 'utf8';
 169
 170     SetMIMEHeadToEncoding($entity->head, $charset => $enc, $preserve_words);
 171
 172     my $head = $entity->head;
 173
 174     # convert at least MIME word encoded attachment filename
 175     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 176         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 177             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 178         }
 179     }
 180
 181     # If this is a textual entity, we'd need to preserve its original encoding
 182     $head->add( "X-RT-Original-Encoding" => $charset )
 183         if $head->mime_attr('content-type.charset') or $head->mime_type =~ /^text/;
 184
 185
 186     return unless ( $head->mime_type =~ qr{^(text/plain|message/rfc822)$}i  );
 187
 188
 189     my $body = $entity->bodyhandle;
 190
 191     if ( $enc ne $charset && $body) {
 192         my @lines = $body->as_lines or return;
 193
 194         # {{{ Convert the body
 195         eval {
 196             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". $head->get('subject'));
 197
 198             # NOTE:: see the comments at the end of the sub.
 199             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 200             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 201         };
 202
 203         if ($@) {
 204             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 205             eval {
 206                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 207             };
 208             if ($@) {
 209                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 210             }
 211         }
 212         # }}}
 213
 214         my $new_body = MIME::Body::InCore->new( \@lines );
 215
 216         # set up the new entity
 217         $head->mime_attr( "content-type" => 'text/plain' )
 218           unless ( $head->mime_attr("content-type") );
 219         $head->mime_attr( "content-type.charset" => $enc );
 220         $entity->bodyhandle($new_body);
 221     }
 222 }
 223
 224 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 225 #
 226 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 227 #
 228 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 229 # ... The data in $octets must be encoded as octets and not as
 230 # characters in Perl's internal format. ...
 231 #
 232 # Not turning off the UTF-8 flag in the string will prevent the string
 233 # from conversion.
 234
 235 # }}}
 236
 237 # {{{ DecodeMIMEWordsToUTF8
 238
 239 =head2 DecodeMIMEWordsToUTF8 $raw
 240
 241 An utility method which mimics MIME::Words::decode_mimewords, but only
 242 limited functionality.  This function returns an utf-8 string.
 243
 244 It returns the decoded string, or the original string if it's not
 245 encoded.  Since the subroutine converts specified string into utf-8
 246 charset, it should not alter a subject written in English.
 247
 248 Why not use MIME::Words directly?  Because it fails in RT when I
 249 tried.  Maybe it's ok now.
 250
 251 =cut
 252
 253 sub DecodeMIMEWordsToUTF8 {
 254     my $str = shift;
 255     DecodeMIMEWordsToEncoding($str, 'utf-8');
 256 }
 257
 258 sub DecodeMIMEWordsToEncoding {
 259     my $str = shift;
 260     my $enc = shift;
 261
 262
 263     @_ = $str =~ m/([^=]*)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/g;
 264
 265     return ($str) unless (@_);
 266
 267     $str = "";
 268     while (@_) {
 269         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 270             (shift, shift, shift, shift, shift);
 271
 272         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 273
 274         if ($encoding eq 'Q' or $encoding eq 'q') {
 275             use MIME::QuotedPrint;
 276             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 277             $enc_str = decode_qp($enc_str);
 278         } elsif ($encoding eq 'B' or $encoding eq 'b') {
 279             use MIME::Base64;
 280             $enc_str = decode_base64($enc_str);
 281         } else {
 282             $RT::Logger->warning("RT::I18N::DecodeMIMEWordsToCharset got a " .
 283                               "strange encoding: $encoding.");
 284         }
 285
 286         # now we have got a decoded subject, try to convert into the encoding
 287         unless ($charset eq $enc) {
 288             eval { Encode::from_to($enc_str, $charset,  $enc) };
 289             if ($@) {
 290                 $charset = _GuessCharset( $enc_str );
 291                 Encode::from_to($enc_str, $charset, $enc);
 292             }
 293         }
 294
 295         $str .= $prefix . $enc_str . $trailing;
 296     }
 297
 298     return ($str)
 299 }
 300
 301 # }}}
 302
 303 # {{{ _FindOrGuessCharset
 304
 305 =head2 _FindOrGuessCharset MIME::Entity
 306
 307 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that,
 308 will use Encode::Guess to try to figure it out
 309
 310 =cut
 311
 312 sub _FindOrGuessCharset {
 313     my $entity = shift;
 314     my $head = $entity->head;
 315
 316     if ($head->mime_attr("content-type.charset")) {
 317         return $head->mime_attr("content-type.charset");
 318     }
 319
 320     if ( $head->mime_type =~ m{^text/}) {
 321         my $body = $entity->bodyhandle or return;
 322         return _GuessCharset( $head->as_string . $body->as_string );
 323     }
 324     else {
 325         # potentially binary data -- don't guess the body
 326         return _GuessCharset( $head->as_string );
 327     }
 328 }
 329
 330 # }}}
 331
 332
 333 # {{{ _GuessCharset
 334
 335 =head2 _GuessCharset STRING
 336
 337 use Encode::Guess to try to figure it out the string's encoding.
 338
 339 =cut
 340
 341 sub _GuessCharset {
 342     my $fallback = 'iso-8859-1';
 343     my $charset;
 344
 345     if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
 346         Encode::Guess->set_suspects(@RT::EmailInputEncodings);
 347         my $decoder = Encode::Guess->guess( $_[0] );
 348
 349         if ( ref $decoder ) {
 350             $charset = $decoder->name;
 351             $RT::Logger->debug("Guessed encoding: $charset");
 352             return $charset;
 353         }
 354         elsif ($decoder =~ /(\S+ or .+)/) {
 355             my %matched = map { $_ => 1 } split(/ or /, $1);
 356             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 357
 358             foreach my $suspect (@RT::EmailInputEncodings) {
 359                 next unless $matched{$suspect};
 360                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 361                 $charset = $suspect;
 362                 last;
 363             }
 364         }
 365         else {
 366             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 367         }
 368     }
 369     else {
 370         $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
 371     }
 372
 373     return($charset || $fallback);
 374 }
 375
 376 # }}}
 377
 378 # {{{ SetMIMEHeadToEncoding
 379
 380 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 381
 382 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 383 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 384 all the time
 385
 386
 387 =cut
 388
 389 sub SetMIMEHeadToEncoding {
 390     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 391
 392     $charset = 'utf-8' if $charset eq 'utf8';
 393     $enc     = 'utf-8' if $enc     eq 'utf8';
 394
 395     return if $charset eq $enc and $preserve_words;
 396
 397     foreach my $tag ( $head->tags ) {
 398         my @values = $head->get_all($tag);
 399         $head->delete($tag);
 400         foreach my $value (@values) {
 401             if ( $charset ne $enc ) {
 402
 403                 eval {
 404                     Encode::_utf8_off($value);
 405                     Encode::from_to( $value, $charset => $enc );
 406                 };
 407                 if ($@) {
 408                     $RT::Logger->error( "Encoding error: " . $@
 409                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 410                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 411                     if ($@) {
 412                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 413                     }
 414                 }
 415             }
 416             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 417             $head->add( $tag, $value );
 418         }
 419     }
 420
 421 }
 422 # }}}
 423
 424 eval "require RT::I18N_Vendor";
 425 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 426 eval "require RT::I18N_Local";
 427 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 428
 429 1;  # End of module.
 430