1 # BEGIN BPS TAGGED BLOCK {{{
5 # This software is Copyright (c) 1996-2019 Best Practical Solutions, LLC
6 # <sales@bestpractical.com>
8 # (Except where explicitly superseded by other copyright notices)
13 # This work is made available to you under the terms of Version 2 of
14 # the GNU General Public License. A copy of that license should have
15 # been provided with this software, but in any event can be snarfed
18 # This work is distributed in the hope that it will be useful, but
19 # WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301 or visit their web page on the internet at
27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
30 # CONTRIBUTION SUBMISSION POLICY:
32 # (The following paragraph is not intended to limit the rights granted
33 # to you to modify and distribute this software under the terms of
34 # the GNU General Public License and is only of importance to you if
35 # you choose to contribute your changes and enhancements to the
36 # community by submitting them to Best Practical Solutions, LLC.)
38 # By intentionally submitting any modifications, corrections or
39 # derivatives to this work, or any other work intended for use with
40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
41 # you are the copyright holder for those contributions and you grant
42 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43 # royalty-free, perpetual, license to use, copy, create derivative
44 # works based on those contributions, and sublicense and distribute
45 # those contributions and any derivatives thereof.
47 # END BPS TAGGED BLOCK }}}
51 RT::I18N - a base class for localization of RT
62 use Locale::Maketext 1.04;
63 use Locale::Maketext::Lexicon 0.25;
64 use base 'Locale::Maketext::Fuzzy';
70 # I decree that this project's first language is English.
73 'TEST_STRING' => 'Concrete Mixer',
75 '__Content-Type' => 'text/plain; charset=utf-8',
78 # That means that lookup failures can't happen -- if we get as far
79 # as looking for something in this lexicon, and we don't find it,
80 # then automagically set $Lexicon{$key} = $key, before possibly
83 # The exception is keys that start with "_" -- they aren't auto-makeable.
90 Initializes the lexicons used for localization.
97 my @lang = RT->Config->Get('LexiconLanguages');
98 @lang = ('*') unless @lang;
100 # load default functions
101 require substr(Cwd::abs_path(__FILE__), 0, -3) . '/i_default.pm';
103 # Load language-specific functions
104 foreach my $file ( File::Glob::bsd_glob(substr(Cwd::abs_path(__FILE__), 0, -3) . "/*.pm") ) {
105 my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
106 next unless grep $_ eq '*' || $_ eq $lang, @lang;
111 foreach my $l ( @lang ) {
113 Gettext => $RT::LexiconPath."/$l.po",
115 push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
116 push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
117 Gettext => $RT::LocalLexiconPath."/$l.po");
120 # Acquire all .po files and iterate them into lexicons
121 Locale::Maketext::Lexicon->import({ _decode => 1, %import });
129 foreach my $k (keys %{RT::I18N::} ) {
130 next if $k eq 'main::';
131 next unless index($k, '::', -2) >= 0;
132 next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
134 my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
135 # run fetch to force load
136 my $tmp = $lex->{'foo'};
137 # XXX: untie may fail with "untie attempted
138 # while 1 inner references still exist"
139 # TODO: untie that has to lower fetch impact
140 # untie %$lex if tied %$lex;
146 Returns the encoding of the current lexicon. This is always 'utf-8'; the
147 charset in the lexicon's __Content-Type field is not consulted.
154 sub encoding { 'utf-8' }    # constant: always 'utf-8', whatever arguments it is called with
157 =head2 SetMIMEEntityToUTF8 $entity
159 A utility function which will try to convert the entity body into UTF-8.
160 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
164 sub SetMIMEEntityToUTF8 {
165 RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
170 =head2 IsTextualContentType $type
172 A utility function that determines whether $type is I<textual>, meaning
173 that it can sensibly be converted to Unicode text.
175 Currently, it returns true iff $type matches this regular expression
176 (case-insensitively):
178 ^(?:text/(?:plain|html)|message/rfc822)\b
183 sub IsTextualContentType {
185 ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
189 =head2 SetMIMEEntityToEncoding Entity => ENTITY, Encoding => ENCODING, PreserveWords => BOOL, IsOut => BOOL
191 A utility function which will try to convert the entity body into the specified
192 charset encoding (encoded as octets, *not* unicode-strings). It will
193 iterate over all the entities in $entity, and try to convert each one into the
194 specified charset if its Content-Type is textual (see IsTextualContentType).
196 If PreserveWords is true, MIME-encoded words in the head are left as they are instead of being decoded (default is false).
198 Incoming and outgoing mails are handled differently. If IsOut is true (default
199 is false), it'll be treated as outgoing mail, otherwise as incoming mail:
203 2) if found then try to convert to utf-8 in croak mode, return if success
205 4) if guessed differently then try to convert to utf-8 in croak mode, return
207 5) mark part as application/octet-stream instead of falling back to any
212 2) if didn't find then do nothing, send as is, let MUA deal with it
213 3) if found then try to convert it to outgoing encoding in croak mode, return
215 4) do nothing otherwise, keep original encoding
217 This function doesn't return anything meaningful.
221 sub SetMIMEEntityToEncoding {
222 my ( $entity, $enc, $preserve_words, $is_out );
225 ( $entity, $enc, $preserve_words ) = @_;
231 PreserveWords => undef,
236 $entity = $args{Entity};
237 $enc = $args{Encoding};
238 $preserve_words = $args{PreserveWords};
239 $is_out = $args{IsOut};
242 unless ( $entity && $enc ) {
243 RT->Logger->error("Missing Entity or Encoding arguments");
247 # do the same for parts first of all
248 SetMIMEEntityToEncoding(
251 PreserveWords => $preserve_words,
253 ) foreach $entity->parts;
255 my $head = $entity->head;
257 my $charset = _FindOrGuessCharset($entity);
259 unless( Encode::find_encoding($charset) ) {
260 $RT::Logger->warning("Encoding '$charset' is not supported");
264 unless ( $charset ) {
265 $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
266 $head->mime_attr('Content-Type' => 'application/octet-stream');
270 SetMIMEHeadToEncoding(
272 From => _FindOrGuessCharset( $entity, 1 ),
274 PreserveWords => $preserve_words,
278 # If this is a textual entity, we'd need to preserve its original encoding
279 $head->replace( "X-RT-Original-Encoding" => Encode::encode( "UTF-8", $charset ) )
280 if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
282 return unless IsTextualContentType($head->mime_type);
284 my $body = $entity->bodyhandle;
286 if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
287 my $string = $body->as_string or return;
288 RT::Util::assert_bytes($string);
290 $RT::Logger->debug( "Converting '$charset' to '$enc' for "
291 . $head->mime_type . " - "
292 . ( Encode::decode("UTF-8",$head->get('subject')) || 'Subjectless message' ) );
294 my $orig_string = $string;
295 ( my $success, $string ) = EncodeFromToWithCroak( $orig_string, $charset => $enc );
300 my $guess = _GuessCharset($orig_string);
301 if ( $guess && $guess ne $charset ) {
302 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
303 ( $success, $string ) = EncodeFromToWithCroak( $orig_string, $guess, $enc );
304 $error = $string unless $success;
308 $RT::Logger->error( "Encoding error: " . $error . " falling back to application/octet-stream" );
309 $head->mime_attr( "content-type" => 'application/octet-stream' );
314 my $new_body = MIME::Body::InCore->new($string);
316 # set up the new entity
317 $head->mime_attr( "content-type" => 'text/plain' )
318 unless ( $head->mime_attr("content-type") );
319 $head->mime_attr( "content-type.charset" => $enc );
320 $entity->bodyhandle($new_body);
324 =head2 DecodeMIMEWordsToUTF8 $raw
326 A utility method which mimics MIME::Words::decode_mimewords, but with only
327 limited functionality. Despite its name, this function returns the
328 bytes of the string, in UTF-8.
332 sub DecodeMIMEWordsToUTF8 {
334 return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
337 sub DecodeMIMEWordsToEncoding {
339 my $to_charset = _CanonicalizeCharset(shift);
340 my $field = shift || '';
341 $RT::Logger->warning(
342 "DecodeMIMEWordsToEncoding was called without field name."
343 ."It's known to cause troubles with decoding fields properly."
346 # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
347 # We _should_ be preserving them encoded until after parsing is completed and
348 # THEN undo the mime-encoding.
350 # This routine should be translating the existing mimeencoding to utf8 but leaving
353 # It's legal for headers to contain mime-encoded commas and semicolons which
354 # should not be treated as address separators. (Encoding == quoting here)
356 # until this is fixed, we must escape any string containing a comma or semicolon
357 # this is only a bandaid
359 # Some _other_ MUAs encode quotes _already_, and double quotes
360 # confuse us a lot, so only quote it if it isn't quoted
363 # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
364 # continuations, and similar syntax from RFC 2231
365 if ($field =~ /^Content-/i) {
366 # This concatenates continued parameters and normalizes encoded params
367 # to QB encoded-words which we handle below
368 my $params = MIME::Field::ParamVal->parse_params($str);
369 foreach my $v ( values %$params ) {
370 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
371 # de-quote in case those were hidden inside encoded part
372 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
374 $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify;
376 elsif ( $field =~ /^(?:Resent-)?(?:To|From|B?Cc|Sender|Reply-To)$/i ) {
377 my @addresses = RT::EmailParser->ParseEmailAddress( $str );
378 foreach my $address ( @addresses ) {
379 foreach my $field (qw(phrase comment)) {
380 my $v = $address->$field() or next;
381 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
382 if ( $field eq 'phrase' ) {
383 # de-quote in case quoted value were hidden inside encoded part
384 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
386 $address->$field($v);
389 $str = join ', ', map $_->format, @addresses;
392 $str = _DecodeMIMEWordsToEncoding( $str, $to_charset );
396 # We might have \n without trailing whitespace, which will result in
403 sub _DecodeMIMEWordsToEncoding {
405 my $to_charset = shift;
407 # Pre-parse by removing all whitespace between encoded words
408 my $encoded_word = qr/
411 (?:\*[^?]+)? # optional '*language'
415 ([^?]+) # encoded string
418 $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;
420 # Also merge quoted-printable sections together, in case multiple
421 # octets of a single encoded character were split between chunks.
422 # Though not valid according to RFC 2047, this has been seen in the
424 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
426 # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
427 # custom MIME word decoding and charset canonicalization. We can't do this
428 # until we parse before decode, instead of the other way around.
429 my @list = $str =~ m/(.*?) # prefix
433 return $str unless @list;
435 # add everything that hasn't matched to the end of the last
436 # string in the array; this happens when we have 'key="=?encoded?="; key="plain"'
437 $list[-1] .= substr($str, pos $str);
441 my ($prefix, $charset, $encoding, $enc_str, $trailing) =
443 $charset = _CanonicalizeCharset($charset);
444 $encoding = lc $encoding;
446 if ( $encoding eq 'q' ) {
447 use MIME::QuotedPrint;
448 $enc_str =~ tr/_/ /; # RFC 2047, 4.2 (2)
449 $enc_str = decode_qp($enc_str);
450 } elsif ( $encoding eq 'b' ) {
452 $enc_str = decode_base64($enc_str);
454 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
455 ."only Q(uoted-printable) and B(ase64) are supported");
458 # now we have got a decoded subject, try to convert into the encoding
459 if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
460 if ( Encode::find_encoding($charset) ) {
461 Encode::from_to( $enc_str, $charset, $to_charset );
463 $RT::Logger->warning("Charset '$charset' is not supported");
464 $enc_str =~ s/[^[:print:]]/\357\277\275/g;
465 Encode::from_to( $enc_str, 'UTF-8', $to_charset )
466 unless $to_charset eq 'utf-8';
469 $str .= $prefix . $enc_str . $trailing;
476 =head2 _FindOrGuessCharset MIME::Entity, $head_only
478 When handed a MIME::Entity, this will first attempt to read what charset the message is encoded in. Failing that, it will use Encode::Guess to try to figure it out.
480 If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's.
484 sub _FindOrGuessCharset {
486 my $head_only = shift;
487 my $head = $entity->head;
489 if ( my $charset = $head->mime_attr("content-type.charset") ) {
490 return _CanonicalizeCharset($charset);
493 if ( !$head_only and $head->mime_type =~ m{^text/} ) {
494 my $body = $entity->bodyhandle or return;
495 return _GuessCharset( $body->as_string );
499 # potentially binary data -- don't guess the body
500 return _GuessCharset( $head->as_string );
506 =head2 _GuessCharset STRING
508 Uses Encode::Detect::Detector or Encode::Guess, depending on the EmailInputEncodings configuration, to try to figure out the string's encoding.
512 use constant HAS_ENCODE_GUESS => Encode::Guess->require;
513 use constant HAS_ENCODE_DETECT => Encode::Detect::Detector->require;
516 my $fallback = _CanonicalizeCharset('iso-8859-1');
518 # if $_[0] is null/empty, we don't guess its encoding
520 unless defined $_[0] && length $_[0];
522 my @encodings = RT->Config->Get('EmailInputEncodings');
523 unless ( @encodings ) {
524 $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
528 if ( $encodings[0] eq '*' ) {
530 if ( HAS_ENCODE_DETECT ) {
531 my $charset = Encode::Detect::Detector::detect( $_[0] );
533 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
534 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
537 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
542 "You requested to guess encoding, but we couldn't"
543 ." load Encode::Detect::Detector module"
548 unless ( @encodings ) {
549 $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
553 unless ( HAS_ENCODE_GUESS ) {
554 $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
558 Encode::Guess->set_suspects( @encodings );
559 my $decoder = Encode::Guess->guess( $_[0] );
560 unless ( defined $decoder ) {
561 $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
565 if ( ref $decoder ) {
566 my $charset = $decoder->name;
567 $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
568 return _CanonicalizeCharset( $charset );
570 elsif ($decoder =~ /(\S+ or .+)/) {
571 my %matched = map { $_ => 1 } split(/ or /, $1);
572 return 'utf-8' if $matched{'utf8'}; # one and only normalization
574 foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
575 next unless $matched{$suspect};
576 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
577 return _CanonicalizeCharset( $suspect );
581 $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
587 =head2 _CanonicalizeCharset NAME
589 canonicalize charset, return lowercase version.
590 special cases are: gb2312 => gbk, utf8 => utf-8
594 sub _CanonicalizeCharset {
595 my $charset = lc shift;
596 return $charset unless $charset;
598 # Canonicalize aliases if they're known
599 if (my $canonical = Encode::resolve_alias($charset)) {
600 $charset = $canonical;
603 if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
606 elsif ( $charset eq 'euc-cn' ) {
607 # gbk is superset of gb2312/euc-cn so it's safe
610 elsif ( $charset =~ /^(?:(?:big5(-1984|-2003|ext|plus))|cccii|unisys|euc-tw|gb18030|(?:cns11643-\d+))$/ ) {
611 unless ( Encode::HanExtra->require ) {
612 RT->Logger->error("Please install Encode::HanExtra to handle $charset");
622 =head2 SetMIMEHeadToEncoding MIMEHead => HEAD, From => OLD_ENCODING, To => NEW_Encoding, PreserveWords => BOOL, IsOut => BOOL
624 Converts a MIME Head from one encoding to another. This totally violates the RFC.
625 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
631 sub SetMIMEHeadToEncoding {
632 my ( $head, $charset, $enc, $preserve_words, $is_out );
635 ( $head, $charset, $enc, $preserve_words ) = @_;
642 PreserveWords => undef,
648 $charset = $args{From};
650 $preserve_words = $args{PreserveWords};
651 $is_out = $args{IsOut};
654 unless ( $head && $charset && $enc ) {
656 "Missing Head or From or To arguments");
660 $charset = _CanonicalizeCharset($charset);
661 $enc = _CanonicalizeCharset($enc);
663 return if $charset eq $enc and $preserve_words;
665 RT::Util::assert_bytes( $head->as_string );
666 foreach my $tag ( $head->tags ) {
667 next unless $tag; # seen in wild: headers with no name
668 my @values = $head->get_all($tag);
670 foreach my $value (@values) {
671 if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
672 my $orig_value = $value;
673 ( my $success, $value ) = EncodeFromToWithCroak( $orig_value, $charset => $enc );
677 $value = $orig_value;
678 $head->add( $tag, $value );
682 my $guess = _GuessCharset($orig_value);
683 if ( $guess && $guess ne $charset ) {
684 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
685 ( $success, $value ) = EncodeFromToWithCroak( $orig_value, $guess, $enc );
686 $error = $value unless $success;
690 $RT::Logger->error( "Encoding error: " . $error . " forcing conversion to $charset => $enc" );
691 $value = $orig_value;
692 Encode::from_to( $value, $charset => $enc );
697 $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
698 unless $preserve_words;
700 # We intentionally add a leading space when re-adding the
701 # header; Mail::Header strips it before storing, but it
702 # serves to prevent it from "helpfully" canonicalizing
703 # $head->add("Subject", "Subject: foo") into the same as
704 # $head->add("Subject", "foo");
705 $head->add( $tag, " " . $value );
711 =head2 EncodeFromToWithCroak $string, $from, $to
713 Tries to encode a string from encoding $from to encoding $to in croak mode.
715 Returns (1, $encoded_string) on success, otherwise (0, $error).
719 sub EncodeFromToWithCroak {
726 $string = Encode::encode( $to, Encode::decode( $from, $string ), Encode::FB_CROAK );
728 return $@ ? ( 0, $@ ) : ( 1, $string );
731 RT::Base->_ImportOverlays();