1 # BEGIN BPS TAGGED BLOCK {{{
5 # This software is Copyright (c) 1996-2015 Best Practical Solutions, LLC
6 # <sales@bestpractical.com>
8 # (Except where explicitly superseded by other copyright notices)
13 # This work is made available to you under the terms of Version 2 of
14 # the GNU General Public License. A copy of that license should have
15 # been provided with this software, but in any event can be snarfed
18 # This work is distributed in the hope that it will be useful, but
19 # WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301 or visit their web page on the internet at
27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
30 # CONTRIBUTION SUBMISSION POLICY:
32 # (The following paragraph is not intended to limit the rights granted
33 # to you to modify and distribute this software under the terms of
34 # the GNU General Public License and is only of importance to you if
35 # you choose to contribute your changes and enhancements to the
36 # community by submitting them to Best Practical Solutions, LLC.)
38 # By intentionally submitting any modifications, corrections or
39 # derivatives to this work, or any other work intended for use with
40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
41 # you are the copyright holder for those contributions and you grant
42 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43 # royalty-free, perpetual, license to use, copy, create derivative
44 # works based on those contributions, and sublicense and distribute
45 # those contributions and any derivatives thereof.
47 # END BPS TAGGED BLOCK }}}
51 RT::I18N - a base class for localization of RT
61 use Locale::Maketext 1.04;
62 use Locale::Maketext::Lexicon 0.25;
63 use base 'Locale::Maketext::Fuzzy';
69 # I decree that this project's first language is English.
72 'TEST_STRING' => 'Concrete Mixer',
74 '__Content-Type' => 'text/plain; charset=utf-8',
77 # That means that lookup failures can't happen -- if we get as far
78 # as looking for something in this lexicon, and we don't find it,
79 # then automagically set $Lexicon{$key} = $key, before possibly
82 # The exception is keys that start with "_" -- they aren't auto-makeable.
89 Initializes the lexicons used for localization.
96 my @lang = RT->Config->Get('LexiconLanguages');
97 @lang = ('*') unless @lang;
99 # load default functions
100 require substr(__FILE__, 0, -3) . '/i_default.pm';
102 # Load language-specific functions
103 foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
104 unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
105 warn("$file is tainted. not loading");
110 my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
111 next unless grep $_ eq '*' || $_ eq $lang, @lang;
116 foreach my $l ( @lang ) {
118 Gettext => $RT::LexiconPath."/$l.po",
120 push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
121 push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
122 Gettext => $RT::LocalLexiconPath."/$l.po");
125 # Acquire all .po files and iterate them into lexicons
126 Locale::Maketext::Lexicon->import({ _decode => 1, %import });
134 foreach my $k (keys %{RT::I18N::} ) {
135 next if $k eq 'main::';
136 next unless index($k, '::', -2) >= 0;
137 next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
139 my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
140 # run fetch to force load
141 my $tmp = $lex->{'foo'};
142 # XXX: untie may fail with "untie attempted
143 # while 1 inner references still exist"
144 # TODO: untie that has to lower fetch impact
145 # untie %$lex if tied %$lex;
151 Returns the encoding of the current lexicon. This is always 'utf-8': the value is
152 hard-coded rather than yanked out of the __Content-Type entry's "charset" field.
159 sub encoding { 'utf-8' } # constant: always the literal string 'utf-8'; arguments are ignored
162 =head2 SetMIMEEntityToUTF8 $entity
164 A utility function which will try to convert the entity body into UTF-8.
165 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
169 sub SetMIMEEntityToUTF8 {
170 RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
175 =head2 IsTextualContentType $type
177 A utility function that determines whether $type is I<textual>, meaning
178 that it can sensibly be converted to Unicode text.
180 Currently, it returns true iff $type matches this regular expression
181 (case-insensitively):
183 ^(?:text/(?:plain|html)|message/rfc822)\b
188 sub IsTextualContentType {
190 ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
194 =head2 SetMIMEEntityToEncoding $entity, $encoding
196 A utility function which will try to convert the entity body into the specified
197 charset encoding (encoded as octets, *not* unicode-strings). It will
198 iterate over all the entities in $entity, and try to convert each one into
199 the specified charset if its Content-Type is textual (see IsTextualContentType).
201 This function doesn't return anything meaningful.
205 sub SetMIMEEntityToEncoding {
206 my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
208 # do the same for parts first of all
209 SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
211 my $head = $entity->head;
213 my $charset = _FindOrGuessCharset($entity);
215 unless( Encode::find_encoding($charset) ) {
216 $RT::Logger->warning("Encoding '$charset' is not supported");
220 unless ( $charset ) {
221 $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
222 $head->mime_attr('Content-Type' => 'application/octet-stream');
226 SetMIMEHeadToEncoding(
228 _FindOrGuessCharset($entity, 1) => $enc,
232 # If this is a textual entity, we'd need to preserve its original encoding
233 $head->replace( "X-RT-Original-Encoding" => Encode::encode( "UTF-8", $charset ) )
234 if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
236 return unless IsTextualContentType($head->mime_type);
238 my $body = $entity->bodyhandle;
240 if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
241 my $string = $body->as_string or return;
242 RT::Util::assert_bytes($string);
244 $RT::Logger->debug( "Converting '$charset' to '$enc' for "
245 . $head->mime_type . " - "
246 . ( Encode::decode("UTF-8",$head->get('subject')) || 'Subjectless message' ) );
250 $string = Encode::encode( $enc, Encode::decode( $charset, $string) );
253 my $new_body = MIME::Body::InCore->new($string);
255 # set up the new entity
256 $head->mime_attr( "content-type" => 'text/plain' )
257 unless ( $head->mime_attr("content-type") );
258 $head->mime_attr( "content-type.charset" => $enc );
259 $entity->bodyhandle($new_body);
263 =head2 DecodeMIMEWordsToUTF8 $raw
265 A utility method which mimics MIME::Words::decode_mimewords, but with only
266 limited functionality. Despite its name, this function returns the
267 bytes of the string, in UTF-8.
271 sub DecodeMIMEWordsToUTF8 {
273 return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
276 sub DecodeMIMEWordsToEncoding {
278 my $to_charset = _CanonicalizeCharset(shift);
279 my $field = shift || '';
281 # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
282 # continuations, and similar syntax from RFC 2231
283 if ($field =~ /^Content-(Type|Disposition)/i) {
284 # This concatenates continued parameters and normalizes encoded params
285 # to QB encoded-words which we handle below
286 $str = MIME::Field::ParamVal->parse($str)->stringify;
289 # Pre-parse by removing all whitespace between encoded words
290 my $encoded_word = qr/
293 (?:\*[^?]+)? # optional '*language'
297 ([^?]+) # encoded string
300 $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;
302 # Also merge quoted-printable sections together, in case multiple
303 # octets of a single encoded character were split between chunks.
304 # Though not valid according to RFC 2047, this has been seen in the
306 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
308 # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
309 # custom MIME word decoding and charset canonicalization. We can't do this
310 # until we parse before decode, instead of the other way around.
311 my @list = $str =~ m/(.*?) # prefix
317 # Add everything that hasn't matched to the end of the last string in
318 # the array; this happens when we have 'key="=?encoded?="; key="plain"'
319 $list[-1] .= substr($str, pos $str);
323 my ($prefix, $charset, $encoding, $enc_str, $trailing) =
325 $charset = _CanonicalizeCharset($charset);
326 $encoding = lc $encoding;
328 $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
330 if ( $encoding eq 'q' ) {
331 use MIME::QuotedPrint;
332 $enc_str =~ tr/_/ /; # Observed from Outlook Express
333 $enc_str = decode_qp($enc_str);
334 } elsif ( $encoding eq 'b' ) {
336 $enc_str = decode_base64($enc_str);
338 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
339 ."only Q(uoted-printable) and B(ase64) are supported");
342 # now we have got a decoded subject, try to convert into the encoding
343 if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
344 if ( Encode::find_encoding($charset) ) {
345 Encode::from_to( $enc_str, $charset, $to_charset );
347 $RT::Logger->warning("Charset '$charset' is not supported");
348 $enc_str =~ s/[^[:print:]]/\357\277\275/g;
349 Encode::from_to( $enc_str, 'UTF-8', $to_charset )
350 unless $to_charset eq 'utf-8';
354 # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
355 # We _should_ be preserving them encoded until after parsing is completed and
356 # THEN undo the mime-encoding.
358 # This routine should be translating the existing mimeencoding to utf8 but leaving
361 # It's legal for headers to contain mime-encoded commas and semicolons which
362 # should not be treated as address separators. (Encoding == quoting here)
364 # until this is fixed, we must escape any string containing a comma or semicolon
365 # this is only a bandaid
367 # Some _other_ MUAs encode quotes _already_, and double quotes
368 # confuse us a lot, so only quote it if it isn't quoted
370 $enc_str = qq{"$enc_str"}
371 if $enc_str =~ /[,;]/
372 and $enc_str !~ /^".*"$/
373 and $prefix !~ /"$/ and $trailing !~ /^"/
374 and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);
376 $str .= $prefix . $enc_str . $trailing;
380 # We might have \n without trailing whitespace, which will result in
389 =head2 _FindOrGuessCharset MIME::Entity, $head_only
391 When handed a MIME::Entity, this will first attempt to read which charset the message is encoded in from its Content-Type header. Failing that, it will fall back to _GuessCharset to try to figure it out.
393 If $head_only is true, the charset is only guessed from the headers. This is because the headers' encoding (e.g. filename="...") may be different from that of the body.
397 sub _FindOrGuessCharset {
399 my $head_only = shift;
400 my $head = $entity->head;
402 if ( my $charset = $head->mime_attr("content-type.charset") ) {
403 return _CanonicalizeCharset($charset);
406 if ( !$head_only and $head->mime_type =~ m{^text/} ) {
407 my $body = $entity->bodyhandle or return;
408 return _GuessCharset( $body->as_string );
412 # potentially binary data -- don't guess the body
413 return _GuessCharset( $head->as_string );
419 =head2 _GuessCharset STRING
421 Uses Encode::Guess (or Encode::Detect::Detector, when EmailInputEncodings starts with '*') to try to figure out the string's encoding.
425 use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
426 use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
429 my $fallback = _CanonicalizeCharset('iso-8859-1');
431 # if $_[0] is null/empty, we don't guess its encoding
433 unless defined $_[0] && length $_[0];
435 my @encodings = RT->Config->Get('EmailInputEncodings');
436 unless ( @encodings ) {
437 $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
441 if ( $encodings[0] eq '*' ) {
443 if ( HAS_ENCODE_DETECT ) {
444 my $charset = Encode::Detect::Detector::detect( $_[0] );
446 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
447 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
450 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
455 "You requested to guess encoding, but we couldn't"
456 ." load Encode::Detect::Detector module"
461 unless ( @encodings ) {
462 $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
466 unless ( HAS_ENCODE_GUESS ) {
467 $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
471 Encode::Guess->set_suspects( @encodings );
472 my $decoder = Encode::Guess->guess( $_[0] );
473 unless ( defined $decoder ) {
474 $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
478 if ( ref $decoder ) {
479 my $charset = $decoder->name;
480 $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
481 return _CanonicalizeCharset( $charset );
483 elsif ($decoder =~ /(\S+ or .+)/) {
484 my %matched = map { $_ => 1 } split(/ or /, $1);
485 return 'utf-8' if $matched{'utf8'}; # one and only normalization
487 foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
488 next unless $matched{$suspect};
489 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
490 return _CanonicalizeCharset( $suspect );
494 $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
500 =head2 _CanonicalizeCharset NAME
502 Canonicalizes the charset name and returns the lowercase version.
503 Special cases are: gb2312 => gbk, utf8 => utf-8.
507 sub _CanonicalizeCharset {
508 my $charset = lc shift;
509 return $charset unless $charset;
511 # Canonicalize aliases if they're known
512 if (my $canonical = Encode::resolve_alias($charset)) {
513 $charset = $canonical;
516 if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
519 elsif ( $charset eq 'euc-cn' ) {
520 # gbk is superset of gb2312/euc-cn so it's safe
522 # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
523 # but needs Encode::HanExtra installed
531 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
533 Converts a MIME Head from one encoding to another. This totally violates the RFC.
534 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
540 sub SetMIMEHeadToEncoding {
541 my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
543 $charset = _CanonicalizeCharset($charset);
544 $enc = _CanonicalizeCharset($enc);
546 return if $charset eq $enc and $preserve_words;
548 RT::Util::assert_bytes( $head->as_string );
549 foreach my $tag ( $head->tags ) {
550 next unless $tag; # seen in wild: headers with no name
551 my @values = $head->get_all($tag);
553 foreach my $value (@values) {
554 if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
556 $value = Encode::encode( $enc, Encode::decode( $charset, $value) );
558 $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
559 unless $preserve_words;
561 # We intentionally add a leading space when re-adding the
562 # header; Mail::Header strips it before storing, but it
563 # serves to prevent it from "helpfully" canonicalizing
564 # $head->add("Subject", "Subject: foo") into the same as
565 # $head->add("Subject", "foo");
566 $head->add( $tag, " " . $value );
572 RT::Base->_ImportOverlays();