rt/lib/RT/I18N.pm

   1 # BEGIN BPS TAGGED BLOCK {{{
   2 #
   3 # COPYRIGHT:
   4 #
   5 # This software is Copyright (c) 1996-2009 Best Practical Solutions, LLC
   6 #                                          <jesse@bestpractical.com>
   7 #
   8 # (Except where explicitly superseded by other copyright notices)
   9 #
  10 #
  11 # LICENSE:
  12 #
  13 # This work is made available to you under the terms of Version 2 of
  14 # the GNU General Public License. A copy of that license should have
  15 # been provided with this software, but in any event can be snarfed
  16 # from www.gnu.org.
  17 #
  18 # This work is distributed in the hope that it will be useful, but
  19 # WITHOUT ANY WARRANTY; without even the implied warranty of
  20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 # General Public License for more details.
  22 #
  23 # You should have received a copy of the GNU General Public License
  24 # along with this program; if not, write to the Free Software
  25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26 # 02110-1301 or visit their web page on the internet at
  27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
  28 #
  29 #
  30 # CONTRIBUTION SUBMISSION POLICY:
  31 #
  32 # (The following paragraph is not intended to limit the rights granted
  33 # to you to modify and distribute this software under the terms of
  34 # the GNU General Public License and is only of importance to you if
  35 # you choose to contribute your changes and enhancements to the
  36 # community by submitting them to Best Practical Solutions, LLC.)
  37 #
  38 # By intentionally submitting any modifications, corrections or
  39 # derivatives to this work, or any other work intended for use with
  40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
  41 # you are the copyright holder for those contributions and you grant
  42 # Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
  43 # royalty-free, perpetual, license to use, copy, create derivative
  44 # works based on those contributions, and sublicense and distribute
  45 # those contributions and any derivatives thereof.
  46 #
  47 # END BPS TAGGED BLOCK }}}
  48
  49 =head1 NAME
  50
  51 RT::I18N - a base class for localization of RT
  52
  53 =cut
  54
  55 package RT::I18N;
  56
  57 use strict;
  58 use warnings;
  59
  60 use Locale::Maketext 1.04;
  61 use Locale::Maketext::Lexicon 0.25;
  62 use base ('Locale::Maketext::Fuzzy');
  63
  64 use Encode;
  65 use MIME::Entity;
  66 use MIME::Head;
  67
  68 # I decree that this project's first language is English.
  69
  70 our %Lexicon = (
  71    'TEST_STRING' => 'Concrete Mixer',
  72
  73     '__Content-Type' => 'text/plain; charset=utf-8',
  74
  75   '_AUTO' => 1,
  76   # That means that lookup failures can't happen -- if we get as far
  77   #  as looking for something in this lexicon, and we don't find it,
  78   #  then automagically set $Lexicon{$key} = $key, before possibly
  79   #  compiling it.
  80
  81   # The exception is keys that start with "_" -- they aren't auto-makeable.
  82
  83 );
  84 # End of lexicon.
  85
  86 =head2 Init
  87
  88 Initializes the lexicons used for localization.
  89
  90
  91 =cut
  92
  93 sub Init {
  94     require File::Glob;
  95
  96     my @lang = RT->Config->Get('LexiconLanguages');
  97     @lang = ('*') unless @lang;
  98
  99     # load default functions
 100     require substr(__FILE__, 0, -3) . '/i_default.pm';
 101
 102     # Load language-specific functions
 103     foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
 104         unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
 105             warn("$file is tainted. not loading");
 106             next;
 107         }
 108         $file = $1;
 109
 110         my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
 111         next unless grep $_ eq '*' || $_ eq $lang, @lang;
 112         require $file;
 113     }
 114
 115     my %import;
 116     foreach my $l ( @lang ) {
 117         $import{$l} = [
 118             Gettext => (substr(__FILE__, 0, -3) . "/$l.po"),
 119             Gettext => "$RT::LocalLexiconPath/*/$l.po",
 120             Gettext => "$RT::LocalLexiconPath/$l.po",
 121         ];
 122         push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
 123     }
 124
 125     # Acquire all .po files and iterate them into lexicons
 126     Locale::Maketext::Lexicon->import({ _decode => 1, %import });
 127
 128     return 1;
 129 }
 130
 131 =head2 encoding
 132
  133 Returns the name of the encoding used for the current lexicon.
  134 This is always 'utf-8'.
 135
 136
 137
 138 =cut
 139
 140
 141 sub encoding { 'utf-8' }
 142
 143 # {{{ SetMIMEEntityToUTF8
 144
 145 =head2 SetMIMEEntityToUTF8 $entity
 146
  147 A utility function which will try to convert the entity body into utf8.
  148 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
 149
 150 =cut
 151
 152 sub SetMIMEEntityToUTF8 {
 153     RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
 154 }
 155
 156 # }}}
 157
 158 # {{{ IsTextualContentType
 159
 160 =head2 IsTextualContentType $type
 161
  162 A utility function that determines whether $type is I<textual>, meaning
 163 that it can sensibly be converted to Unicode text.
 164
 165 Currently, it returns true iff $type matches this regular expression
 166 (case-insensitively):
 167
 168     ^(?:text/(?:plain|html)|message/rfc822)\b
 169
 170 # }}}
 171
 172 =cut
 173
 174 sub IsTextualContentType {
 175     my $type = shift;
 176     ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
 177 }
 178
 179 # {{{ SetMIMEEntityToEncoding
 180
 181 =head2 SetMIMEEntityToEncoding $entity, $encoding
 182
  183 A utility function which will try to convert the entity body into the specified
  184 charset encoding (encoded as octets, *not* unicode-strings).  It will
  185 iterate over all the entities in $entity, and try to convert each one whose
  186 Content-Type is textual (see IsTextualContentType) into the specified charset.
 187
 188 This function doesn't return anything meaningful.
 189
 190 =cut
 191
 192 sub SetMIMEEntityToEncoding {
 193     my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
 194
 195     # do the same for parts first of all
 196     SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
 197
 198     my $charset = _FindOrGuessCharset($entity) or return;
 199     # one and only normalization
 200     $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
 201     $enc     = 'utf-8' if $enc     =~ /^utf-?8$/i;
 202
 203     SetMIMEHeadToEncoding(
 204         $entity->head,
 205         _FindOrGuessCharset($entity, 1) => $enc,
 206         $preserve_words
 207     );
 208
 209     my $head = $entity->head;
 210
 211     # convert at least MIME word encoded attachment filename
 212     foreach my $attr (qw(content-type.name content-disposition.filename)) {
 213         if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
 214             $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
 215         }
 216     }
 217
 218     # If this is a textual entity, we'd need to preserve its original encoding
 219     $head->replace( "X-RT-Original-Encoding" => $charset )
 220         if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
 221
 222     return unless IsTextualContentType($head->mime_type);
 223
 224     my $body = $entity->bodyhandle;
 225
 226     if ( $enc ne $charset && $body) {
 227         my @lines = $body->as_lines or return;
 228
 229         # {{{ Convert the body
 230         eval {
 231             $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". ($head->get('subject') || 'Subjectless message'));
 232
 233             # NOTE:: see the comments at the end of the sub.
 234             Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
 235             Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
 236         };
 237
 238         if ($@) {
 239             $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
 240             eval {
 241                 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
 242             };
 243             if ($@) {
 244                 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 245             }
 246         }
 247         # }}}
 248
 249         my $new_body = MIME::Body::InCore->new( \@lines );
 250
 251         # set up the new entity
 252         $head->mime_attr( "content-type" => 'text/plain' )
 253           unless ( $head->mime_attr("content-type") );
 254         $head->mime_attr( "content-type.charset" => $enc );
 255         $entity->bodyhandle($new_body);
 256     }
 257 }
 258
 259 # NOTES:  Why Encode::_utf8_off before Encode::from_to
 260 #
 261 # All the strings in RT are utf-8 now.  Quotes from Encode POD:
 262 #
 263 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 264 # ... The data in $octets must be encoded as octets and not as
 265 # characters in Perl's internal format. ...
 266 #
 267 # Not turning off the UTF-8 flag in the string will prevent the string
 268 # from conversion.
 269
 270 # }}}
 271
 272 # {{{ DecodeMIMEWordsToUTF8
 273
 274 =head2 DecodeMIMEWordsToUTF8 $raw
 275
  276 A utility method which mimics MIME::Words::decode_mimewords, but with only
  277 limited functionality.  This function returns a utf-8 string.
 278
 279 It returns the decoded string, or the original string if it's not
 280 encoded.  Since the subroutine converts specified string into utf-8
 281 charset, it should not alter a subject written in English.
 282
 283 Why not use MIME::Words directly?  Because it fails in RT when I
 284 tried.  Maybe it's ok now.
 285
 286 =cut
 287
 288 sub DecodeMIMEWordsToUTF8 {
 289     my $str = shift;
 290     DecodeMIMEWordsToEncoding($str, 'utf-8');
 291 }
 292
 293 sub DecodeMIMEWordsToEncoding {
 294     my $str = shift;
 295     my $enc = shift;
 296
 297     @_ = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
 298     return ($str) unless (@_);
 299
 300     # add everything that hasn't matched to the end of the latest
 301     # string in array this happen when we have 'key="=?encoded?="; key="plain"'
 302     $_[-1] .= substr($str, pos $str);
 303
 304     $str = "";
 305     while (@_) {
 306         my ($prefix, $charset, $encoding, $enc_str, $trailing) =
 307             (shift, shift, lc shift, shift, shift);
 308
 309         $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express
 310
 311         if ( $encoding eq 'q' ) {
 312             use MIME::QuotedPrint;
 313             $enc_str =~ tr/_/ /;                # Observed from Outlook Express
 314             $enc_str = decode_qp($enc_str);
 315         } elsif ( $encoding eq 'b' ) {
 316             use MIME::Base64;
 317             $enc_str = decode_base64($enc_str);
 318         } else {
 319             $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
 320             ."only Q(uoted-printable) and B(ase64) are supported");
 321         }
 322
 323         # now we have got a decoded subject, try to convert into the encoding
 324         unless ($charset eq $enc) {
 325             eval { Encode::from_to($enc_str, $charset,  $enc) };
 326             if ($@) {
 327                 $charset = _GuessCharset( $enc_str );
 328                 Encode::from_to($enc_str, $charset, $enc);
 329             }
 330         }
 331
 332         # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
 333         # We _should_ be preserving them encoded until after parsing is completed and
 334         # THEN undo the mime-encoding.
 335         #
 336         # This routine should be translating the existing mimeencoding to utf8 but leaving
 337         # things encoded.
 338         #
 339         # It's legal for headers to contain mime-encoded commas and semicolons which
 340         # should not be treated as address separators. (Encoding == quoting here)
 341         #
 342         # until this is fixed, we must escape any string containing a comma or semicolon
 343         # this is only a bandaid
 344
 345         # Some _other_ MUAs encode quotes _already_, and double quotes
 346         # confuse us a lot, so only quote it if it isn't quoted
 347         # already.
 348         $enc_str = qq{"$enc_str"} if $enc_str =~ /[,;]/ and $enc_str !~ /^".*"$/;
 349
 350         $str .= $prefix . $enc_str . $trailing;
 351     }
 352
 353     # We might have \n without trailing whitespace, which will result in
 354     # invalid headers.
 355     $str =~ s/\n//g;
 356
 357     return ($str)
 358 }
 359
 360 # }}}
 361
 362 # {{{ _FindOrGuessCharset
 363
 364 =head2 _FindOrGuessCharset MIME::Entity, $head_only
 365
 366 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
 367
 368 If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.
 369
 370 =cut
 371
 372 sub _FindOrGuessCharset {
 373     my $entity = shift;
 374     my $head_only = shift;
 375     my $head = $entity->head;
 376
 377     if ( my $charset = $head->mime_attr("content-type.charset") ) {
 378         return $charset;
 379     }
 380
 381     if ( !$head_only and $head->mime_type =~ m{^text/}) {
 382         my $body = $entity->bodyhandle or return;
 383         return _GuessCharset( $body->as_string );
 384     }
 385     else {
 386         # potentially binary data -- don't guess the body
 387         return _GuessCharset( $head->as_string );
 388     }
 389 }
 390
 391 # }}}
 392
 393 # {{{ _GuessCharset
 394
 395 =head2 _GuessCharset STRING
 396
  397 Uses Encode::Guess to try to figure out the string's encoding.
 398
 399 =cut
 400
 401 sub _GuessCharset {
 402     my $fallback = 'iso-8859-1';
 403
 404     # if $_[0] is null/empty, we don't guess its encoding
 405     return $fallback unless defined $_[0] && length $_[0];
 406
 407     my $charset;
 408     my @encodings = RT->Config->Get('EmailInputEncodings');
 409     if ( @encodings and eval { require Encode::Guess; 1 } ) {
 410         Encode::Guess->set_suspects( @encodings );
 411         my $decoder = Encode::Guess->guess( $_[0] );
 412
 413       if ( defined($decoder) ) {
 414         if ( ref $decoder ) {
 415             $charset = $decoder->name;
 416             $RT::Logger->debug("Guessed encoding: $charset");
 417             return $charset;
 418         }
 419         elsif ($decoder =~ /(\S+ or .+)/) {
 420             my %matched = map { $_ => 1 } split(/ or /, $1);
 421             return 'utf-8' if $matched{'utf8'}; # one and only normalization
 422
 423             foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
 424                 next unless $matched{$suspect};
 425                 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
 426                 $charset = $suspect;
 427                 last;
 428             }
 429         }
 430         else {
 431             $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
 432         }
 433       }
 434       else {
 435           $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
 436       }
 437     }
 438     elsif ( @encodings && $@ ) {
 439         $RT::Logger->error("You have set EmailInputEncodings, but we couldn't load Encode::Guess: $@");
 440     } else {
 441         $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
 442     }
 443
 444     return ($charset || $fallback);
 445 }
 446
 447 # }}}
 448
 449 # {{{ SetMIMEHeadToEncoding
 450
 451 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
 452
 453 Converts a MIME Head from one encoding to another. This totally violates the RFC.
 454 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
 455 all the time
 456
 457
 458 =cut
 459
 460 sub SetMIMEHeadToEncoding {
 461     my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
 462
 463     $charset = 'utf-8' if $charset eq 'utf8';
 464     $enc     = 'utf-8' if $enc     eq 'utf8';
 465
 466     return if $charset eq $enc and $preserve_words;
 467
 468     foreach my $tag ( $head->tags ) {
 469         next unless $tag; # seen in wild: headers with no name
 470         my @values = $head->get_all($tag);
 471         $head->delete($tag);
 472         foreach my $value (@values) {
 473             if ( $charset ne $enc ) {
 474
 475                 eval {
 476                     Encode::_utf8_off($value);
 477                     Encode::from_to( $value, $charset => $enc );
 478                 };
 479                 if ($@) {
 480                     $RT::Logger->error( "Encoding error: " . $@
 481                                        . " defaulting to ISO-8859-1 -> UTF-8" );
 482                     eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
 483                     if ($@) {
 484                         $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
 485                     }
 486                 }
 487             }
 488             $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
 489             $head->add( $tag, $value );
 490         }
 491     }
 492
 493 }
 494 # }}}
 495
 496 eval "require RT::I18N_Vendor";
 497 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Vendor.pm});
 498 eval "require RT::I18N_Local";
 499 die $@ if ($@ && $@ !~ qr{^Can't locate RT/I18N_Local.pm});
 500
 501 1;  # End of module.
 502