1 # BEGIN BPS TAGGED BLOCK {{{
5 # This software is Copyright (c) 1996-2007 Best Practical Solutions, LLC
6 # <jesse@bestpractical.com>
8 # (Except where explicitly superseded by other copyright notices)
13 # This work is made available to you under the terms of Version 2 of
14 # the GNU General Public License. A copy of that license should have
15 # been provided with this software, but in any event can be snarfed
18 # This work is distributed in the hope that it will be useful, but
19 # WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301 or visit their web page on the internet at
27 # http://www.gnu.org/copyleft/gpl.html.
30 # CONTRIBUTION SUBMISSION POLICY:
32 # (The following paragraph is not intended to limit the rights granted
33 # to you to modify and distribute this software under the terms of
34 # the GNU General Public License and is only of importance to you if
35 # you choose to contribute your changes and enhancements to the
36 # community by submitting them to Best Practical Solutions, LLC.)
38 # By intentionally submitting any modifications, corrections or
39 # derivatives to this work, or any other work intended for use with
40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
41 # you are the copyright holder for those contributions and you grant
42 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43 # royalty-free, perpetual, license to use, copy, create derivative
44 # works based on those contributions, and sublicense and distribute
45 # those contributions and any derivatives thereof.
47 # END BPS TAGGED BLOCK }}}
50 RT::I18N - a base class for localization of RT
59 use Locale::Maketext 1.04;
60 use Locale::Maketext::Lexicon 0.25;
61 use base ('Locale::Maketext::Fuzzy');
# Fragment of the base (English) lexicon hash.
# NOTE(review): the statement that opens this hash, and any entries between
# the ones shown here, are not visible in this excerpt.
67 # I decree that this project's first language is English.
# Fixed key/value pair; the tests in this file's POD look up '__Content-Type'
# in the same way, so this one is presumably a lookup sanity check -- confirm.
70 'TEST_STRING' => 'Concrete Mixer',
# Content type advertised by this lexicon; the charset is utf-8.
72 '__Content-Type' => 'text/plain; charset=utf-8',
# NOTE(review): the comments below describe an auto-lexicon setting whose
# entry is not visible in this excerpt -- confirm against the full file.
75 # That means that lookup failures can't happen -- if we get as far
76 # as looking for something in this lexicon, and we don't find it,
77 # then automagically set $Lexicon{$key} = $key, before possibly
80 # The exception is keys that start with "_" -- they aren't auto-makeable.
87 Initializes the lexicons used for localization.
# Body of the lexicon initialisation routine.
# NOTE(review): the enclosing sub header and several interior lines are not
# visible in this excerpt.
101 # Load language-specific functions
# substr(__FILE__, 0, -3) drops the last three characters of this file's path
# (presumably the ".pm" suffix), giving the directory that sits beside this
# module; every *.pm file in it is visited.
102 foreach my $language ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm")) {
# Whitelist check on the path: word characters, whitespace, '-', '.', '/',
# '\', '~' and ':' only. The whole path is captured in $1.
# NOTE(review): what is done with $1 on success is not visible here.
103 if ($language =~ /^([-\w\s.\/\\~:]+)$/) {
# Reached when the path contains a character outside the whitelist.
107 warn("$language is tainted. not loading");
# Languages to load; falls back to the wildcard '*' when
# @RT::LexiconLanguages is empty.
111 my @lang = @RT::LexiconLanguages;
112 @lang = ('*') unless @lang;
114 # Acquire all .po files and iterate them into lexicons
115 Locale::Maketext::Lexicon->import({
# Three Gettext sources per language: the directory beside this module, any
# subdirectory of $RT::LocalLexiconPath, and $RT::LocalLexiconPath itself.
# NOTE(review): $_ is presumably each element of @lang; the construct that
# sets it is not visible here.
118 Gettext => (substr(__FILE__, 0, -3) . "/$_.po"),
119 Gettext => "$RT::LocalLexiconPath/*/$_.po",
120 Gettext => "$RT::LocalLexiconPath/$_.po",
130 Returns the encoding of the current lexicon. This implementation always returns 'utf-8';
131 it does not read the "charset" field of the lexicon's __Content-Type entry, and it never falls back to 'ISO-8859-1'.
135 ok(my $chinese = RT::I18N->get_handle('zh_tw'));
136 ok(UNIVERSAL::can($chinese, 'maketext'));
137 ok($chinese->maketext('__Content-Type') =~ /utf-8/i, "Found the utf-8 charset for traditional chinese in the string ".$chinese->maketext('__Content-Type'));
138 ok($chinese->encoding eq 'utf-8', "The encoding is 'utf-8' -".$chinese->encoding);
140 ok(my $en = RT::I18N->get_handle('en'));
141 ok(UNIVERSAL::can($en, 'maketext'));
142 ok($en->encoding eq 'utf-8', "The encoding ".$en->encoding." is 'utf-8'");
# Report the character encoding used for lexicon output.
# Takes no meaningful arguments (any invocant is ignored) and always
# returns the string 'utf-8'.
sub encoding {
    return 'utf-8';
}
152 # {{{ SetMIMEEntityToUTF8
154 =head2 SetMIMEEntityToUTF8 $entity
156 A utility function which will try to convert the entity body into UTF-8.
157 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
# Convenience wrapper: converts the given entity to UTF-8 by delegating to
# SetMIMEEntityToEncoding with a fixed target encoding of 'utf-8'.
# The first argument (the entity) is passed through unchanged; no third
# ($preserve_words) argument is supplied.
161 sub SetMIMEEntityToUTF8 {
162 RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
167 # {{{ IsTextualContentType
169 =head2 IsTextualContentType $type
171 A utility function that determines whether $type is I<textual>, meaning
172 that it can sensibly be converted to Unicode text.
174 Currently, it returns true iff $type matches this regular expression
175 (case-insensitively):
177 ^(?:text/(?:plain|html)|message/rfc822)\b
# Returns 1 when $type begins with text/plain, text/html or message/rfc822
# (case-insensitively, and followed by a word boundary), otherwise 0.
# NOTE(review): the line that unpacks $type from the argument list is not
# visible in this excerpt.
183 sub IsTextualContentType {
# The ternary normalises the match result to exactly 1 or 0.
185 ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
188 # {{{ SetMIMEEntityToEncoding
190 =head2 SetMIMEEntityToEncoding $entity, $encoding
192 A utility function which will try to convert the entity body into the specified
193 charset encoding (encoded as octets, *not* unicode-strings). It will
194 iterate over all the entities in $entity, and try to convert each one into
195 the specified charset if its Content-Type is textual (see IsTextualContentType).
197 This function doesn't return anything meaningful.
# Recursively converts a MIME entity (headers, attachment filenames and, for
# textual parts, the body) from its detected charset to $enc.
#
# Arguments:
#   $entity         - object providing parts, head and bodyhandle methods
#   $enc            - target encoding name
#   $preserve_words - when true, MIME-encoded words are left undecoded
#
# NOTE(review): several structural lines (eval blocks, "if ($@)" tests,
# closing braces) are not visible in this excerpt.
201 sub SetMIMEEntityToEncoding {
202 my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
204 # do the same for parts first of all
205 SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
# Give up on this entity when no charset can be determined at all.
207 my $charset = _FindOrGuessCharset($entity) or return;
208 # one and only normalization
# Both "utf8" and "utf-8", in any letter case, are folded to 'utf-8' so the
# string comparison against $enc further down is reliable.
209 $charset = 'utf-8' if $charset =~ /^utf-?8$/i;
210 $enc = 'utf-8' if $enc =~ /^utf-?8$/i;
# Headers are converted using a charset determined in head-only mode (second
# argument 1), which may differ from the body charset found above.
212 SetMIMEHeadToEncoding(
214 _FindOrGuessCharset($entity, 1) => $enc,
218 my $head = $entity->head;
220 # convert at least MIME word encoded attachment filename
221 foreach my $attr (qw(content-type.name content-disposition.filename)) {
# "and" binds more loosely than "=", so $name receives the attribute value
# and the branch is taken only when it is true and words are not preserved.
222 if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
# NOTE(review): the filename is always decoded to UTF-8 here, even when $enc
# is a different encoding -- confirm this is intended.
223 $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
227 # If this is a textual entity, we'd need to preserve its original encoding
228 $head->add( "X-RT-Original-Encoding" => $charset )
229 if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
# Bodies of non-textual entities are never converted.
231 return unless IsTextualContentType($head->mime_type);
233 my $body = $entity->bodyhandle;
# Conversion is needed only when the charsets differ and a body exists.
235 if ( $enc ne $charset && $body) {
# Returns early when the body yields no lines; the header changes made above
# have already been applied by then.
236 my @lines = $body->as_lines or return;
238 # {{{ Convert the body
240 $RT::Logger->debug("Converting '$charset' to '$enc' for ". $head->mime_type . " - ". ($head->get('subject') || 'Subjectless message'));
242 # NOTE:: see the comments at the end of the sub.
# Each line is converted in place: first the UTF-8 flag is cleared, then the
# octets are transcoded from $charset to $enc.
243 Encode::_utf8_off( $lines[$_] ) foreach ( 0 .. $#lines );
244 Encode::from_to( $lines[$_], $charset => $enc ) for ( 0 .. $#lines );
# NOTE(review): presumably reached when the conversion above died inside an
# eval that is not visible here. The message says "UTF-8" although the actual
# target is $enc.
248 $RT::Logger->error( "Encoding error: " . $@ . " defaulting to ISO-8859-1 -> UTF-8" );
# Second attempt: treat the source as ISO-8859-1.
250 Encode::from_to( $lines[$_], 'iso-8859-1' => $enc ) foreach ( 0 .. $#lines );
253 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
# Wrap the converted lines in a fresh in-memory body.
258 my $new_body = MIME::Body::InCore->new( \@lines );
260 # set up the new entity
# Default the content type to text/plain only when none is set, then record
# the new charset and swap the body in.
261 $head->mime_attr( "content-type" => 'text/plain' )
262 unless ( $head->mime_attr("content-type") );
263 $head->mime_attr( "content-type.charset" => $enc );
264 $entity->bodyhandle($new_body);
268 # NOTES: Why Encode::_utf8_off before Encode::from_to
270 # All the strings in RT are utf-8 now. Quotes from Encode POD:
272 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
273 # ... The data in $octets must be encoded as octets and not as
274 # characters in Perl's internal format. ...
276 # Not turning off the UTF-8 flag in the string will prevent the string
281 # {{{ DecodeMIMEWordsToUTF8
283 =head2 DecodeMIMEWordsToUTF8 $raw
285 A utility method which mimics MIME::Words::decode_mimewords, but with only
286 limited functionality. This function returns a utf-8 string.
288 It returns the decoded string, or the original string if it's not
289 encoded. Since the subroutine converts specified string into utf-8
290 charset, it should not alter a subject written in English.
292 Why not use MIME::Words directly? Because it fails in RT when I
293 tried. Maybe it's ok now.
# Decodes MIME encoded-words in a string to UTF-8 by delegating to
# DecodeMIMEWordsToEncoding with a fixed target of 'utf-8'.
# NOTE(review): the line that unpacks $str from the argument list is not
# visible in this excerpt.
297 sub DecodeMIMEWordsToUTF8 {
299 DecodeMIMEWordsToEncoding($str, 'utf-8');
# Decodes every MIME encoded-word ("=?charset?Q|B?payload?=") found in $str
# and converts the decoded text to the encoding $enc.
# NOTE(review): the lines that unpack $str and $enc, the loop header, some
# closing braces and the tail of the sub are not visible in this excerpt.
302 sub DecodeMIMEWordsToEncoding {
# In list context with /g the match returns all captures flattened, five per
# encoded word: text before it, charset, encoding letter, payload and
# trailing text up to the next '='. The result replaces @_.
# /c keeps pos($str) after the final failed attempt; /s lets '.' match
# newlines.
306 @_ = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;
# No encoded words at all: hand the input back unchanged.
307 return ($str) unless (@_);
309 # add everything that hasn't matched to the end of the latest
310 # string in array this happen when we have 'key="=?encoded?="; key="plain"'
311 $_[-1] .= substr($str, pos $str);
# Take the next five-element group off @_; the encoding letter is lowercased
# so that the comparisons below are case-insensitive.
315 my ($prefix, $charset, $encoding, $enc_str, $trailing) =
316 (shift, shift, lc shift, shift, shift);
318 $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
320 if ( $encoding eq 'q' ) {
# "use" takes effect at compile time, wherever it appears in the source.
321 use MIME::QuotedPrint;
# In Q-encoding an underscore stands for a space.
322 $enc_str =~ tr/_/ /; # Observed from Outlook Express
323 $enc_str = decode_qp($enc_str);
324 } elsif ( $encoding eq 'b' ) {
# NOTE(review): decode_base64 is presumably imported on a line not visible
# here.
326 $enc_str = decode_base64($enc_str);
328 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
329 ."only Q(uoted-printable) and B(ase64) are supported");
332 # now we have got a decoded subject, try to convert into the encoding
# NOTE(review): this comparison is case-sensitive, so e.g. "UTF-8" and
# 'utf-8' count as different and still go through from_to.
333 unless ($charset eq $enc) {
334 eval { Encode::from_to($enc_str, $charset, $enc) };
# NOTE(review): presumably runs only when the eval above failed (the test on
# $@ is not visible): guess the charset from the bytes and retry. The retry
# is not wrapped in eval in the visible lines.
336 $charset = _GuessCharset( $enc_str );
337 Encode::from_to($enc_str, $charset, $enc);
341 # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
342 # We _should_ be preserving them encoded until after parsing is completed and
343 # THEN undo the mime-encoding.
345 # This routine should be translating the existing mimeencoding to utf8 but leaving
348 # It's legal for headers to contain mime-encoded commas and semicolons which
349 # should not be treated as address separators. (Encoding == quoting here)
351 # until this is fixed, we must escape any string containing a comma or semicolon
352 # this is only a bandaid
# Wrap the decoded text in double quotes when it contains ',' or ';'.
354 $enc_str = qq{"$enc_str"} if ($enc_str =~ /[,;]/);
# Append this group's pieces to $str.
# NOTE(review): $str is presumably reset before the loop on a line not
# visible here.
355 $str .= $prefix . $enc_str . $trailing;
358 # We might have \n without trailing whitespace, which will result in
367 # {{{ _FindOrGuessCharset
369 =head2 _FindOrGuessCharset MIME::Entity, $head_only
371 When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
373 If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's.
# Determines the charset of a MIME entity: first from its declared
# Content-Type charset attribute, otherwise by guessing from the bytes.
# NOTE(review): the line that unpacks $entity, and the body of the first
# "if" (presumably returning the declared charset), are not visible in this
# excerpt.
377 sub _FindOrGuessCharset {
# When true, the body is never inspected; the guess uses the header text.
379 my $head_only = shift;
380 my $head = $entity->head;
# A declared charset takes precedence over any guessing.
382 if ( my $charset = $head->mime_attr("content-type.charset") ) {
# Only text/* bodies are inspected, and only when not in head-only mode.
386 if ( !$head_only and $head->mime_type =~ m{^text/}) {
# Returns nothing when a text entity has no body handle.
387 my $body = $entity->bodyhandle or return;
388 return _GuessCharset( $body->as_string );
391 # potentially binary data -- don't guess the body
392 return _GuessCharset( $head->as_string );
400 =head2 _GuessCharset STRING
402 Uses Encode::Guess to try to figure out the string's encoding.
# Body of the charset guesser: returns an encoding name for the string in
# $_[0], falling back to 'iso-8859-1' whenever no usable guess is produced.
# NOTE(review): the sub header, the declaration of $charset and several
# else/closing-brace lines are not visible in this excerpt.
407 my $fallback = 'iso-8859-1';
# Guess only when suspect encodings are configured and Encode::Guess can be
# loaded; the trailing 1 makes the eval true on a successful require.
410 if ( @RT::EmailInputEncodings and eval { require Encode::Guess; 1 } ) {
411 Encode::Guess->set_suspects(@RT::EmailInputEncodings);
# The argument is read straight from $_[0] rather than copied.
412 my $decoder = Encode::Guess->guess( $_[0] );
414 if ( defined($decoder) ) {
# A reference result is an object with a name method: a definite guess.
415 if ( ref $decoder ) {
416 $charset = $decoder->name;
417 $RT::Logger->debug("Guessed encoding: $charset");
# A plain string of the form "A or B ..." means the guess was ambiguous.
420 elsif ($decoder =~ /(\S+ or .+)/) {
421 my %matched = map { $_ => 1 } split(/ or /, $1);
# UTF-8 wins whenever it is among the candidates.
422 return 'utf-8' if $matched{'utf8'}; # one and only normalization
# Otherwise candidates are considered in the configured order of
# @RT::EmailInputEncodings.
# NOTE(review): the statement that acts on the chosen suspect is not visible.
424 foreach my $suspect (@RT::EmailInputEncodings) {
425 next unless $matched{$suspect};
426 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
# Any other string result is reported as a failure message.
432 $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
436 $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
# No suspects configured, or Encode::Guess could not be loaded.
440 $RT::Logger->warning("Cannot Encode::Guess; fallback to $fallback");
# A false $charset (unset or empty) yields the fallback.
443 return($charset || $fallback);
448 # {{{ SetMIMEHeadToEncoding
450 =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
452 Converts a MIME Head from one encoding to another. This totally violates the RFC.
453 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
# Converts every header value of a MIME head from $charset to $enc and,
# unless $preserve_words is true, decodes MIME encoded-words into $enc.
#
# Arguments:
#   $head           - object providing tags, get_all and add methods
#   $charset        - encoding the header values are currently in
#   $enc            - target encoding
#   $preserve_words - when true, encoded-words are left as they are
#
# NOTE(review): the eval wrapper, the "if ($@)" tests and the closing braces
# are not visible in this excerpt.
459 sub SetMIMEHeadToEncoding {
460 my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
# NOTE(review): only the exact lowercase string 'utf8' is normalised here,
# whereas SetMIMEEntityToEncoding uses /^utf-?8$/i. Values such as "UTF-8"
# or "UTF8" will not compare equal below -- confirm whether that is intended.
462 $charset = 'utf-8' if $charset eq 'utf8';
463 $enc = 'utf-8' if $enc eq 'utf8';
# Nothing to do when no transcoding is needed and words must stay encoded.
465 return if $charset eq $enc and $preserve_words;
467 foreach my $tag ( $head->tags ) {
468 next unless $tag; # seen in wild: headers with no name
469 my @values = $head->get_all($tag);
# NOTE(review): values are re-added at the bottom of this loop; the removal
# of the existing values is not visible in this excerpt -- confirm.
471 foreach my $value (@values) {
472 if ( $charset ne $enc ) {
# Clear the UTF-8 flag, then transcode the octets in place.
475 Encode::_utf8_off($value);
476 Encode::from_to( $value, $charset => $enc );
# NOTE(review): presumably reached when the conversion above died; the
# message says "UTF-8" although the actual target is $enc.
479 $RT::Logger->error( "Encoding error: " . $@
480 . " defaulting to ISO-8859-1 -> UTF-8" );
# Second attempt: treat the value as ISO-8859-1.
481 eval { Encode::from_to( $value, 'iso-8859-1' => $enc ) };
483 $RT::Logger->crit( "Totally failed to convert to utf-8: " . $@ . " I give up" );
# Encoded-words are decoded even when $charset equals $enc.
487 $value = DecodeMIMEWordsToEncoding( $value, $enc ) unless $preserve_words;
488 $head->add( $tag, $value );
# Load the optional site overlays, vendor first and then local. Each overlay
# is used only if it is installed: a "Can't locate <overlay file>" failure for
# the overlay itself is ignored, while any other load error (a syntax error
# inside the overlay, or a missing module that the overlay depends on) is
# re-thrown so that it is not silently lost.
foreach my $overlay (qw(RT/I18N_Vendor.pm RT/I18N_Local.pm)) {
    # Block eval instead of string eval: nothing here is dynamic code, and the
    # trailing 1 makes success detectable without consulting $@.
    next if eval { require $overlay; 1 };

    # Copy $@ straight away; it is a global that any later eval can clobber.
    my $error = $@;

    # \Q...\E matches the file name literally. The previous pattern left the
    # "." before "pm" unescaped, so it matched any character in that position.
    die $error if $error && $error !~ m{^Can't locate \Q$overlay\E};
}