1 # BEGIN BPS TAGGED BLOCK {{{
5 # This software is Copyright (c) 1996-2015 Best Practical Solutions, LLC
6 # <sales@bestpractical.com>
8 # (Except where explicitly superseded by other copyright notices)
13 # This work is made available to you under the terms of Version 2 of
14 # the GNU General Public License. A copy of that license should have
15 # been provided with this software, but in any event can be snarfed
18 # This work is distributed in the hope that it will be useful, but
19 # WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301 or visit their web page on the internet at
27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
30 # CONTRIBUTION SUBMISSION POLICY:
32 # (The following paragraph is not intended to limit the rights granted
33 # to you to modify and distribute this software under the terms of
34 # the GNU General Public License and is only of importance to you if
35 # you choose to contribute your changes and enhancements to the
36 # community by submitting them to Best Practical Solutions, LLC.)
38 # By intentionally submitting any modifications, corrections or
39 # derivatives to this work, or any other work intended for use with
40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
41 # you are the copyright holder for those contributions and you grant
42 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43 # royalty-free, perpetual, license to use, copy, create derivative
44 # works based on those contributions, and sublicense and distribute
45 # those contributions and any derivatives thereof.
47 # END BPS TAGGED BLOCK }}}
51 RT::I18N - a base class for localization of RT
61 use Locale::Maketext 1.04;
62 use Locale::Maketext::Lexicon 0.25;
63 use base 'Locale::Maketext::Fuzzy';
69 # I decree that this project's first language is English.
72 'TEST_STRING' => 'Concrete Mixer',
74 '__Content-Type' => 'text/plain; charset=utf-8',
77 # That means that lookup failures can't happen -- if we get as far
78 # as looking for something in this lexicon, and we don't find it,
79 # then automagically set $Lexicon{$key} = $key, before possibly
82 # The exception is keys that start with "_" -- they aren't auto-makeable.
89 Initializes the lexicons used for localization.
96 my @lang = RT->Config->Get('LexiconLanguages');
97 @lang = ('*') unless @lang;
99 # load default functions
100 require substr(__FILE__, 0, -3) . '/i_default.pm';
102 # Load language-specific functions
103 foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
104 my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
105 next unless grep $_ eq '*' || $_ eq $lang, @lang;
110 foreach my $l ( @lang ) {
112 Gettext => $RT::LexiconPath."/$l.po",
114 push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
115 push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
116 Gettext => $RT::LocalLexiconPath."/$l.po");
119 # Acquire all .po files and iterate them into lexicons
120 Locale::Maketext::Lexicon->import({ _decode => 1, %import });
128 foreach my $k (keys %{RT::I18N::} ) {
129 next if $k eq 'main::';
130 next unless index($k, '::', -2) >= 0;
131 next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
133 my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
134 # run fetch to force load
135 my $tmp = $lex->{'foo'};
136 # XXX: untie may fail with "untie attempted
137 # while 1 inner references still exist"
138 # TODO: untie that has to lower fetch impact
139 # untie %$lex if tied %$lex;
145 Returns the encoding of the current lexicon. This is always 'utf-8': the value
146 is hard-coded rather than read from the lexicon's __Content-Type "charset" field.
153 sub encoding { 'utf-8' } # constant: always reports 'utf-8', takes no arguments into account
156 =head2 SetMIMEEntityToUTF8 $entity
158 A utility function which will try to convert the entity body into UTF-8.
159 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
163 sub SetMIMEEntityToUTF8 {
164 RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
169 =head2 IsTextualContentType $type
171 A utility function that determines whether $type is I<textual>, meaning
172 that it can sensibly be converted to Unicode text.
174 Currently, it returns true iff $type matches this regular expression
175 (case-insensitively):
177 ^(?:text/(?:plain|html)|message/rfc822)\b
182 sub IsTextualContentType {
184 ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
188 =head2 SetMIMEEntityToEncoding Entity => ENTITY, Encoding => ENCODING, PreserveWords => BOOL, IsOut => BOOL
190 A utility function which will try to convert the entity body into the specified
191 charset encoding (encoded as octets, *not* unicode-strings). It will
192 iterate over all the parts of $entity, and try to convert each one into the
193 specified charset if its Content-Type is textual (see IsTextualContentType).
195 If PreserveWords is true, MIME-encoded words in the mime head are left as is instead of being decoded (default is false).
197 Incoming and outgoing mails are handled differently: if IsOut is true (default
198 is false), it'll be treated as outgoing mail, otherwise as incoming mail:
202 2) if found then try to convert to utf-8 in croak mode, return if success
204 4) if guessed differently then try to convert to utf-8 in croak mode, return
206 5) mark part as application/octet-stream instead of falling back to any
211 2) if didn't find then do nothing, send as is, let MUA deal with it
212 3) if found then try to convert it to outgoing encoding in croak mode, return
214 4) do nothing otherwise, keep original encoding
216 This function doesn't return anything meaningful.
220 sub SetMIMEEntityToEncoding {
221 my ( $entity, $enc, $preserve_words, $is_out );
224 ( $entity, $enc, $preserve_words ) = @_;
230 PreserveWords => undef,
235 $entity = $args{Entity};
236 $enc = $args{Encoding};
237 $preserve_words = $args{PreserveWords};
238 $is_out = $args{IsOut};
241 unless ( $entity && $enc ) {
242 RT->Logger->error("Missing Entity or Encoding arguments");
246 # do the same for parts first of all
247 SetMIMEEntityToEncoding(
250 PreserveWords => $preserve_words,
252 ) foreach $entity->parts;
254 my $head = $entity->head;
256 my $charset = _FindOrGuessCharset($entity);
258 unless( Encode::find_encoding($charset) ) {
259 $RT::Logger->warning("Encoding '$charset' is not supported");
263 unless ( $charset ) {
264 $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
265 $head->mime_attr('Content-Type' => 'application/octet-stream');
269 SetMIMEHeadToEncoding(
271 From => _FindOrGuessCharset( $entity, 1 ),
273 PreserveWords => $preserve_words,
277 # If this is a textual entity, we'd need to preserve its original encoding
278 $head->replace( "X-RT-Original-Encoding" => Encode::encode( "UTF-8", $charset ) )
279 if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
281 return unless IsTextualContentType($head->mime_type);
283 my $body = $entity->bodyhandle;
285 if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
286 my $string = $body->as_string or return;
287 RT::Util::assert_bytes($string);
289 $RT::Logger->debug( "Converting '$charset' to '$enc' for "
290 . $head->mime_type . " - "
291 . ( Encode::decode("UTF-8",$head->get('subject')) || 'Subjectless message' ) );
293 my $orig_string = $string;
294 ( my $success, $string ) = EncodeFromToWithCroak( $orig_string, $charset => $enc );
299 my $guess = _GuessCharset($orig_string);
300 if ( $guess && $guess ne $charset ) {
301 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
302 ( $success, $string ) = EncodeFromToWithCroak( $orig_string, $guess, $enc );
303 $error = $string unless $success;
307 $RT::Logger->error( "Encoding error: " . $error . " falling back to application/octet-stream" );
308 $head->mime_attr( "content-type" => 'application/octet-stream' );
313 my $new_body = MIME::Body::InCore->new($string);
315 # set up the new entity
316 $head->mime_attr( "content-type" => 'text/plain' )
317 unless ( $head->mime_attr("content-type") );
318 $head->mime_attr( "content-type.charset" => $enc );
319 $entity->bodyhandle($new_body);
323 =head2 DecodeMIMEWordsToUTF8 $raw
325 A utility method which mimics MIME::Words::decode_mimewords, but with only
326 limited functionality. Despite its name, this function returns the
327 bytes of the string, in UTF-8.
331 sub DecodeMIMEWordsToUTF8 {
333 return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
336 sub DecodeMIMEWordsToEncoding {
338 my $to_charset = _CanonicalizeCharset(shift);
339 my $field = shift || '';
340 $RT::Logger->warning(
341 "DecodeMIMEWordsToEncoding was called without field name."
342 ."It's known to cause troubles with decoding fields properly."
345 # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
346 # We _should_ be preserving them encoded until after parsing is completed and
347 # THEN undo the mime-encoding.
349 # This routine should be translating the existing mimeencoding to utf8 but leaving
352 # It's legal for headers to contain mime-encoded commas and semicolons which
353 # should not be treated as address separators. (Encoding == quoting here)
355 # until this is fixed, we must escape any string containing a comma or semicolon
356 # this is only a bandaid
358 # Some _other_ MUAs encode quotes _already_, and double quotes
359 # confuse us a lot, so only quote it if it isn't quoted
362 # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
363 # continuations, and similar syntax from RFC 2231
364 if ($field =~ /^Content-/i) {
365 # This concatenates continued parameters and normalizes encoded params
366 # to QB encoded-words which we handle below
367 my $params = MIME::Field::ParamVal->parse_params($str);
368 foreach my $v ( values %$params ) {
369 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
370 # de-quote in case those were hidden inside encoded part
371 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
373 $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify;
375 elsif ( $field =~ /^(?:Resent-)?(?:To|From|B?Cc|Sender|Reply-To)$/i ) {
376 my @addresses = RT::EmailParser->ParseEmailAddress( $str );
377 foreach my $address ( @addresses ) {
378 foreach my $field (qw(phrase comment)) {
379 my $v = $address->$field() or next;
380 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
381 if ( $field eq 'phrase' ) {
382 # de-quote in case quoted value were hidden inside encoded part
383 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
385 $address->$field($v);
388 $str = join ', ', map $_->format, @addresses;
391 $str = _DecodeMIMEWordsToEncoding( $str, $to_charset );
395 # We might have \n without trailing whitespace, which will result in
402 sub _DecodeMIMEWordsToEncoding {
404 my $to_charset = shift;
406 # Pre-parse by removing all whitespace between encoded words
407 my $encoded_word = qr/
410 (?:\*[^?]+)? # optional '*language'
414 ([^?]+) # encoded string
417 $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;
419 # Also merge quoted-printable sections together, in case multiple
420 # octets of a single encoded character were split between chunks.
421 # Though not valid according to RFC 2047, this has been seen in the
423 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
425 # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
426 # custom MIME word decoding and charset canonicalization. We can't do this
427 # until we parse before decode, instead of the other way around.
428 my @list = $str =~ m/(.*?) # prefix
432 return $str unless @list;
434 # add everything that hasn't matched to the end of the last
435 # string in the array; this happens when we have 'key="=?encoded?="; key="plain"'
436 $list[-1] .= substr($str, pos $str);
440 my ($prefix, $charset, $encoding, $enc_str, $trailing) =
442 $charset = _CanonicalizeCharset($charset);
443 $encoding = lc $encoding;
445 $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
447 if ( $encoding eq 'q' ) {
448 use MIME::QuotedPrint;
449 $enc_str =~ tr/_/ /; # Observed from Outlook Express
450 $enc_str = decode_qp($enc_str);
451 } elsif ( $encoding eq 'b' ) {
453 $enc_str = decode_base64($enc_str);
455 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
456 ."only Q(uoted-printable) and B(ase64) are supported");
459 # now we have got a decoded subject, try to convert into the encoding
460 if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
461 if ( Encode::find_encoding($charset) ) {
462 Encode::from_to( $enc_str, $charset, $to_charset );
464 $RT::Logger->warning("Charset '$charset' is not supported");
465 $enc_str =~ s/[^[:print:]]/\357\277\275/g;
466 Encode::from_to( $enc_str, 'UTF-8', $to_charset )
467 unless $to_charset eq 'utf-8';
470 $str .= $prefix . $enc_str . $trailing;
477 =head2 _FindOrGuessCharset MIME::Entity, $head_only
479 When handed a MIME::Entity, will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out.
481 If $head_only is true, only guesses the charset for head parts. This is because the header's encoding (e.g. filename="...") may be different from that of the body.
485 sub _FindOrGuessCharset {
487 my $head_only = shift;
488 my $head = $entity->head;
490 if ( my $charset = $head->mime_attr("content-type.charset") ) {
491 return _CanonicalizeCharset($charset);
494 if ( !$head_only and $head->mime_type =~ m{^text/} ) {
495 my $body = $entity->bodyhandle or return;
496 return _GuessCharset( $body->as_string );
500 # potentially binary data -- don't guess the body
501 return _GuessCharset( $head->as_string );
507 =head2 _GuessCharset STRING
509 Use Encode::Guess to try to figure out the string's encoding.
513 use constant HAS_ENCODE_GUESS => Encode::Guess->require;
514 use constant HAS_ENCODE_DETECT => Encode::Detect::Detector->require;
517 my $fallback = _CanonicalizeCharset('iso-8859-1');
519 # if $_[0] is null/empty, we don't guess its encoding
521 unless defined $_[0] && length $_[0];
523 my @encodings = RT->Config->Get('EmailInputEncodings');
524 unless ( @encodings ) {
525 $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
529 if ( $encodings[0] eq '*' ) {
531 if ( HAS_ENCODE_DETECT ) {
532 my $charset = Encode::Detect::Detector::detect( $_[0] );
534 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
535 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
538 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
543 "You requested to guess encoding, but we couldn't"
544 ." load Encode::Detect::Detector module"
549 unless ( @encodings ) {
550 $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
554 unless ( HAS_ENCODE_GUESS ) {
555 $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
559 Encode::Guess->set_suspects( @encodings );
560 my $decoder = Encode::Guess->guess( $_[0] );
561 unless ( defined $decoder ) {
562 $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
566 if ( ref $decoder ) {
567 my $charset = $decoder->name;
568 $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
569 return _CanonicalizeCharset( $charset );
571 elsif ($decoder =~ /(\S+ or .+)/) {
572 my %matched = map { $_ => 1 } split(/ or /, $1);
573 return 'utf-8' if $matched{'utf8'}; # one and only normalization
575 foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
576 next unless $matched{$suspect};
577 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
578 return _CanonicalizeCharset( $suspect );
582 $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
588 =head2 _CanonicalizeCharset NAME
590 Canonicalizes a charset name and returns the lowercase version.
591 Special cases are: gb2312 => gbk, utf8 => utf-8
595 sub _CanonicalizeCharset {
596 my $charset = lc shift;
597 return $charset unless $charset;
599 # Canonicalize aliases if they're known
600 if (my $canonical = Encode::resolve_alias($charset)) {
601 $charset = $canonical;
604 if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
607 elsif ( $charset eq 'euc-cn' ) {
608 # gbk is superset of gb2312/euc-cn so it's safe
611 elsif ( $charset =~ /^(?:(?:big5(-1984|-2003|ext|plus))|cccii|unisys|euc-tw|gb18030|(?:cns11643-\d+))$/ ) {
612 unless ( Encode::HanExtra->require ) {
613 RT->Logger->error("Please install Encode::HanExtra to handle $charset");
623 =head2 SetMIMEHeadToEncoding MIMEHead => HEAD, From => OLD_ENCODING, To => NEW_Encoding, PreserveWords => BOOL, IsOut => BOOL
625 Converts a MIME Head from one encoding to another. This totally violates the RFC.
626 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
632 sub SetMIMEHeadToEncoding {
633 my ( $head, $charset, $enc, $preserve_words, $is_out );
636 ( $head, $charset, $enc, $preserve_words ) = @_;
643 PreserveWords => undef,
649 $charset = $args{From};
651 $preserve_words = $args{PreserveWords};
652 $is_out = $args{IsOut};
655 unless ( $head && $charset && $enc ) {
657 "Missing Head or From or To arguments");
661 $charset = _CanonicalizeCharset($charset);
662 $enc = _CanonicalizeCharset($enc);
664 return if $charset eq $enc and $preserve_words;
666 RT::Util::assert_bytes( $head->as_string );
667 foreach my $tag ( $head->tags ) {
668 next unless $tag; # seen in wild: headers with no name
669 my @values = $head->get_all($tag);
671 foreach my $value (@values) {
672 if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
673 my $orig_value = $value;
674 ( my $success, $value ) = EncodeFromToWithCroak( $orig_value, $charset => $enc );
678 $value = $orig_value;
679 $head->add( $tag, $value );
683 my $guess = _GuessCharset($orig_value);
684 if ( $guess && $guess ne $charset ) {
685 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
686 ( $success, $value ) = EncodeFromToWithCroak( $orig_value, $guess, $enc );
687 $error = $value unless $success;
691 $RT::Logger->error( "Encoding error: " . $error . " forcing conversion to $charset => $enc" );
692 $value = $orig_value;
693 Encode::from_to( $value, $charset => $enc );
698 $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
699 unless $preserve_words;
701 # We intentionally add a leading space when re-adding the
702 # header; Mail::Header strips it before storing, but it
703 # serves to prevent it from "helpfully" canonicalizing
704 # $head->add("Subject", "Subject: foo") into the same as
705 # $head->add("Subject", "foo");
706 $head->add( $tag, " " . $value );
712 =head2 EncodeFromToWithCroak $string, $from, $to
714 Tries to convert $string from encoding $from to encoding $to in croak mode.
716 Returns (1, $encoded_string) on success, otherwise (0, $error).
720 sub EncodeFromToWithCroak {
727 $string = Encode::encode( $to, Encode::decode( $from, $string ), Encode::FB_CROAK );
729 return $@ ? ( 0, $@ ) : ( 1, $string );
732 RT::Base->_ImportOverlays();