RT 4.2.11, ticket#13852

author: Ivan Kohler <ivan@freeside.biz> 2015-07-09 22:18:55 -0700
committer: Ivan Kohler <ivan@freeside.biz> 2015-07-09 22:18:55 -0700
commit: 1c538bfabc2cd31f27067505f0c3d1a46cba6ef0 (patch)
tree: 96922ad4459eda1e649327fd391d60c58d454c53 /rt/devel/tools/extract-message-catalog
parent: 4f5619288413a185e9933088d9dd8c5afbc55dfa (diff)
1 files changed, 197 insertions, 168 deletions
diff --git a/rt/devel/tools/extract-message-catalog b/rt/devel/tools/extract-message-catalog
index cba84d274..5dd89b87e 100644
--- a/rt/devel/tools/extract-message-catalog
+++ b/rt/devel/tools/extract-message-catalog
@@ -51,10 +51,14 @@
 use strict;
 use warnings;
 
+use open qw/ :std :encoding(UTF-8) /;
+
 use File::Find;
 use File::Copy;
 use Regexp::Common;
 use Carp;
+use Locale::PO;
+$| = 1;
 
 # po dir is for extensions
 @ARGV = (<share/po/*.po>, <share/po/*.pot>, <po/*.po>, <po/*.pot>) unless @ARGV;
@@ -67,17 +71,37 @@ File::Find::find( { wanted => \&extract_strings_from_code, follow => 1 }, qw(bin
 
 # ensure proper escaping and [_1] => %1 transformation
 foreach my $str ( sort keys %FILECAT ) {
-    my $entry = $FILECAT{$str};
-    my $oldstr = $str;
+    my $entry = delete $FILECAT{$str};
+    next unless @{$entry};
+
+    my ($filename, $line) = @{ $entry->[0] };
+    my $location = "$filename line $line" . (@{$entry} > 1 ? " (and ".(@{$entry}-1)." other places)" : "");
+
+    if ($str =~ /^\s/m || $str =~ /\s$/m || $str =~ /\\n$/m) {
+        warn "Extraneous whitespace in '$str' at $location\n";
+    }
+    if (grep {$_->[3]} @{$entry} and $str =~ /([\$\@]\w+)/) {
+        warn "Interpolated variable '$1' in '$str' at $location\n";
+    }
 
-    $str =~ s/\\/\\\\/g;
-    $str =~ s/\"/\\"/g;
+    my $escape = sub { $_ = shift; s/\b_(\d+)/%$1/; $_ };
     $str =~ s/((?<!~)(?:~~)*)\[_(\d+)\]/$1%$2/g;
-    $str =~ s/((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]/"$1%$2(".escape($3).")"/eg;
+    $str =~ s/((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]/"$1%$2(".$escape->($3).")"/eg;
     $str =~ s/~([\[\]])/$1/g;
 
-    delete $FILECAT{$oldstr};
-    $FILECAT{$str} = $entry;
+    my $po = Locale::PO->new(-msgid => $str, -msgstr => "");
+    $po->reference( join ( ' ', sort map $_->[0].":".$_->[1], @{ $entry } ) );
+    my %seen;
+    my @vars;
+    foreach my $find ( sort { $a->[2] cmp $b->[2] } grep { $_->[2] } @{ $entry } ) {
+        my ( $file, $line, $var ) = @{$find};
+        $var =~ s/^\s*,\s*//;
+        $var =~ s/\s*$//;
+        push @vars, "($var)" unless $seen{$var}++;
+    }
+    $po->automatic( join( "\n", @vars) );
+
+    $FILECAT{$po->msgid} = $po;
 }
 
 # update all language dictionaries
@@ -93,28 +117,6 @@ foreach my $dict (@ARGV) {
     update($lang, $dict);
 }
 
-# warn about various red flags in loc strings
-foreach my $str ( sort keys %FILECAT ) {
-    my $entry = $FILECAT{$str};
-    my $entry_count = @$entry;
-
-    # doesn't exist in the current codebase, ignore for now
-    next if $entry_count == 0;
-
-    my ($filename, $line) = @{ $entry->[0] };
-
-    my $location = "$filename line $line" . ($entry_count > 1 ? " (and ".($entry_count-1)." other places)" : "");
-
-    if ($str =~ /^\s/m || $str =~ /\s$/m || $str =~ /\\n$/m) {
-        warn "Extraneous whitespace in '$str' at $location\n";
-    }
-
-    if ($str =~ /([\$\@]\w+)/) {
-        warn "Interpolated variable '$1' in '$str' at $location\n";
-    }
-}
-
-
 sub extract_strings_from_code {
     my $file = $_;
 
@@ -129,19 +131,22 @@ sub extract_strings_from_code {
     return if ( /^[\.#]/ );
     return if ( -f "$_.in" );
 
-    print "Looking at $File::Find::name\n";
+    print "Looking at $File::Find::name";
     my $filename = $File::Find::name;
     $filename =~ s'^\./'';
     $filename =~ s'\.in$'';
 
     unless (open _, '<', $file) {
-        print "Cannot open $file for reading ($!), skipping.\n";
+        print "\n  Cannot open $file for reading ($!), skipping.\n\n";
         return;
     }
 
+    my $errors = 0;
+
     my $re_space_wo_nl = qr{(?!\n)\s};
     my $re_loc_suffix = qr{$re_space_wo_nl* \# $re_space_wo_nl* loc $re_space_wo_nl* $}mx;
     my $re_loc_qw_suffix = qr{$re_space_wo_nl* \# $re_space_wo_nl* loc_qw $re_space_wo_nl* $}mx;
+    my $re_loc_paren_suffix = qr{$re_space_wo_nl* \# $re_space_wo_nl* loc \(\) $re_space_wo_nl* $}mx;
     my $re_loc_pair_suffix = qr{$re_space_wo_nl* \# $re_space_wo_nl* loc_pair $re_space_wo_nl* $}mx;
     my $re_loc_left_pair_suffix = qr{$re_space_wo_nl* \# $re_space_wo_nl* loc_left_pair $re_space_wo_nl* $}mx;
     my $re_delim = $RE{delimited}{-delim=>q{'"}}{-keep};
@@ -154,8 +159,7 @@ sub extract_strings_from_code {
         my ( $all, $vars, $str ) = ( $1, $2, $3 );
         $vars =~ s/[\n\r]//g;
         $line += ( $all =~ tr/\n/\n/ );
-        $str =~ s/\\'/\'/g;
-        #print "STR IS $str\n";
+        $str =~ s/\\(['"\\])/$1/g;
         push @{ $FILECAT{$str} }, [ $filename, $line, $vars ];
     }
 
@@ -167,45 +171,73 @@ sub extract_strings_from_code {
         $line += ( $all =~ tr/\n/\n/ );
 
         my ( $vars, $str );
-        if ( $match =~
-                /\(\s*($re_delim)(.*?)\s*\)$/so ) {
+        next unless ( $match =~ /\(\s*($re_delim)(.*?)\s*\)$/so );
 
-            $str = substr( $1, 1, -1 );       # $str comes before $vars now
-            $vars = $9;
-        }
-        else {
-            next;
-        }
+        my $interp = (substr($1,0,1) eq '"' ? 1 : 0);
+        $str = substr( $1, 1, -1 );       # $str comes before $vars now
+        $vars = $9;
 
         $vars =~ s/[\n\r]//g;
-        $str  =~ s/\\'/\'/g;
+        $str  =~ s/\\(['"\\])/$1/g;
 
-        push @{ $FILECAT{$str} }, [ $filename, $line, $vars ];
+        push @{ $FILECAT{$str} }, [ $filename, $line, $vars, $interp ];
     }
 
+    my %seen;
     # Comment-based mark: "..." # loc
     $line = 1;
     pos($_) = 0;
-    while (m/\G(.*?($re_delim)[\}\)\],;]*$re_loc_suffix)/smgo) {
+    while (m/\G(.*?($re_delim)[ \{\}\)\],;]*$re_loc_suffix)/smgo) {
         my ( $all, $str ) = ( $1, $2 );
         $line += ( $all =~ tr/\n/\n/ );
+        $seen{$line}++;
         unless ( defined $str ) {
-            warn "Couldn't process loc at $filename:$line";
+            print "\n" unless $errors++;
+            print "  Couldn't process loc at $filename:$line:\n  $str\n";
             next;
         }
+        my $interp = (substr($str,0,1) eq '"' ? 1 : 0);
         $str = substr($str, 1, -1);
-	$str =~ s/\\'/\'/g;
-	push @{ $FILECAT{$str} }, [ $filename, $line, '' ];
+        $str =~ s/\\(['"\\])/$1/g;
+        push @{ $FILECAT{$str} }, [ $filename, $line, '', $interp ];
+    }
+
+    # Comment-based mark for list to loc():  ("...", $foo, $bar)  # loc()
+    $line = 1;
+    pos($_) = 0;
+    while (m/\G(.*? $RE{balanced}{-parens=>'()'}{-keep} [ \{\}\)\],;]* $re_loc_paren_suffix)/sgx) {
+        my ( $all, $match ) = ( $1, $2 );
+        $line += ( $all =~ tr/\n/\n/ );
+
+        my ( $vars, $str );
+        unless ( $match =~
+                /\(\s*($re_delim)(.*?)\s*\)$/so ) {
+            print "\n" unless $errors++;
+            print "  Failed to match delimited against $match, line $line";
+            next;
+        }
+
+        my $interp = (substr($1,0,1) eq '"' ? 1 : 0);
+        $str = substr( $1, 1, -1 );       # $str comes before $vars now
+        $vars = $9;
+        $seen{$line}++;
+
+        $vars =~ s/[\n\r]//g;
+        $str  =~ s/\\(['"\\])/$1/g;
+
+        push @{ $FILECAT{$str} }, [ $filename, $line, $vars, $interp ];
     }
 
     # Comment-based qw mark: "qw(...)" # loc_qw
     $line = 1;
     pos($_) = 0;
-    while (m/\G(.*?(?:qw\(([^)]+)\)\s*[\{\}\)\],; ]*)?$re_loc_qw_suffix)/smgo) {
+    while (m/\G(.*?(?:qw\(([^)]+)\)[ \{\}\)\],;]*)?$re_loc_qw_suffix)/smgo) {
         my ( $all, $str ) = ( $1, $2 );
         $line += ( $all =~ tr/\n/\n/ );
+        $seen{$line}++;
         unless ( defined $str ) {
-            warn "Couldn't process loc_qw at $filename:$line";
+            print "\n" unless $errors++;
+            print "  Couldn't process loc_qw at $filename:$line:\n  $str\n";
             next;
         }
         foreach my $value (split ' ', $str) {
@@ -219,167 +251,164 @@ sub extract_strings_from_code {
     while (m/\G(.*?(?:(\w+|$re_delim)\s*=>[^#\n]+?)?$re_loc_left_pair_suffix)/smgo) {
         my ( $all, $key ) = ( $1, $2 );
         $line += ( $all =~ tr/\n/\n/ );
+        $seen{$line}++;
         unless ( defined $key ) {
-            warn "Couldn't process loc_left_pair at $filename:$line";
+            print "\n" unless $errors++;
+            print "  Couldn't process loc_left_pair at $filename:$line:\n  $key\n";
             next;
         }
-	$key  =~ s/\\'/\'/g;
-	push @{ $FILECAT{$key} }, [ $filename, $line, '' ];
+        my $interp = (substr($key,0,1) eq '"' ? 1 : 0);
+        $key =~ s/\\(['"\\])/$1/g if $key =~ s/^(['"])(.*)\1$/$2/g; # dequote potentially quoted string
+        push @{ $FILECAT{$key} }, [ $filename, $line, '', $interp ];
     }
 
     # Comment-based pair mark: "..." => "..." # loc_pair
     $line = 1;
     pos($_) = 0;
-    while (m/\G(.*?(?:(\w+)\s*=>\s*($re_delim)[\}\)\],;]*)?$re_loc_pair_suffix)/smgo) {
-        my ( $all, $key, $val ) = ( $1, $2, $3 );
+    while (m/\G(.*?(?:(\w+|$re_delim)\s*=>\s*($re_delim)[ \{\}\)\],;]*)?$re_loc_pair_suffix)/smgo) {
+        my ( $all, $key, $val ) = ( $1, $2, $10 );
+        $line += ( $all =~ tr/\n/\n/ );
+        $seen{$line}++;
+        unless ( defined $key && defined $val ) {
+            print "\n" unless $errors++;
+            print "  Couldn't process loc_pair at $filename:$line:\n  $key\n  $val\n";
+            next;
+        }
+        my $interp_key = (substr($key,0,1) eq '"' ? 1 : 0);
+        $key =~ s/\\(['"\\])/$1/g if $key =~ s/^(['"])(.*)\1$/$2/g; # dequote potentially quoted string
+        push @{ $FILECAT{$key} }, [ $filename, $line, '', $interp_key ];
+
+        my $interp_val = (substr($val,0,1) eq '"' ? 1 : 0);
+        $val = substr($val, 1, -1);    # dequote always quoted string
+        $val  =~ s/\\(['"\\])/$1/g;
+        push @{ $FILECAT{$val} }, [ $filename, $line, '', $interp_val ];
+    }
+
+    # Specific key  foo => "...", #loc{foo}
+    $line = 1;
+    pos($_) = 0;
+    while (m/\G(.*?(\w+|$re_delim)\s*=>\s*($re_delim)(?-s:.*?)\#$re_space_wo_nl*loc\{\2\}$re_space_wo_nl*)$/smgo) {
+        my ( $all, $key, $val ) = ( $1, $2, $10 );
         $line += ( $all =~ tr/\n/\n/ );
+        $seen{$line}++;
         unless ( defined $key && defined $val ) {
-            warn "Couldn't process loc_pair at $filename:$line";
+            warn "Couldn't process loc_pair at $filename:$line:\n  $key\n  $val\n";
             next;
         }
-	$val = substr($val, 1, -1);
-	$key  =~ s/\\'/\'/g;
-	$val  =~ s/\\'/\'/g;
-	push @{ $FILECAT{$key} }, [ $filename, $line, '' ];
-	push @{ $FILECAT{$val} }, [ $filename, $line, '' ];
+        $val = substr($val, 1, -1);    # dequote always quoted string
+        $val  =~ s/\\(['"])/$1/g;
+        push @{ $FILECAT{$val} }, [ $filename, $line, '' ];
+    }
+
+    # Check for ones we missed
+    $line = 1;
+    pos($_) = 0;
+    while (m/\G(.*? \# $re_space_wo_nl* (loc (_\w+|\(\)|{$re_delim})?) $re_space_wo_nl* $)/smgox) {
+        my ($all, $loc_type) = ($1, $2);
+        $line += ( $all =~ tr/\n/\n/ );
+        next if $seen{$line};
+        print "\n" unless $errors++;
+        print "  $loc_type that did not match, line $line of $filename\n";
+    }
+
+    if ($errors) {
+        print "\n"
+    } else {
+        print "\r", " " x 100, "\r";
     }
 
     close (_);
 }
 
+sub uniq {
+    my %seen;
+    return grep { !$seen{$_}++ } @_;
+}
+
 sub update {
     my $lang = shift;
     my $file = shift;
-    my ( %Lexicon, %Header);
-    my $out = '';
 
     unless (!-e $file or -w $file) {
-	warn "Can't write to $lang, skipping...\n";
-	return;
+        warn "Can't write to $lang, skipping...\n";
+        return;
     }
 
-    print "Updating $lang...\n";
-
-    my @lines;
-    @lines = (<LEXICON>) if open LEXICON, '<', $file;
-    @lines = grep { !/^(#(:|\.)\s*|$)/ } @lines;
-    while (@lines) {
-        my $msghdr = "";
-        $msghdr .= shift @lines while ( $lines[0] && $lines[0] !~ /^(#~ )?msgid/ );
-        
-        my $msgid  = "";
-
-# '#~ ' is the prefix of launchpad for msg that's not found the the source
-# we'll remove the prefix later so we can still show them with our own mark
-
-        $msgid .= shift @lines while ( $lines[0] && $lines[0] =~ /^(#~ )?(msgid|")/ );
-        my $msgstr = "";
-        $msgstr .= shift @lines while ( $lines[0] && $lines[0] =~ /^(#~ )?(msgstr|")/ );
-
-        last unless $msgid;
+    my $is_english = ( $lang =~ /^en(?:[^A-Za-z]|$)/ );
 
-        chomp $msgid;
-        chomp $msgstr;
+    print "Updating $lang";
+    my $lexicon = Locale::PO->load_file_ashash( $file, "utf-8" );
 
-        $msgid  =~ s/^#~ //mg;
-        $msgstr =~ s/^#~ //mg;
+    # Default to the empty string for new ones
+    $lexicon->{$_->msgid} ||= $_
+        for values %FILECAT;
 
-        $msgid  =~ s/^msgid "(.*)"\s*?$/$1/m    or warn "$msgid in $file";
+    my $errors = 0;
+    for my $msgid ( keys %{$lexicon} ) {
+        my $entry = $lexicon->{$msgid};
 
-        if ( $msgid eq '' ) {
-            # null msgid, msgstr will have head info
-            $msgstr =~ s/^msgstr "(.*)"\s*?$/$1/ms or warn "$msgstr  in $file";
-        }
-        else {
-            $msgstr =~ s/^msgstr "(.*)"\s*?$/$1/m or warn "$msgstr  in $file";
+        # Don't output empty translations for english
+        if (not length $entry->dequote($entry->msgstr) and $is_english) {
+            delete $lexicon->{$msgid};
+            next;
         }
 
-        if ( $msgid ne ''  ) {
-            for my $msg ( \$msgid, \$msgstr ) {
-                if ( $$msg =~ /\n/ ) {
-                    my @lines = split /\n/, $$msg;
-                    $$msg =
-                      shift @lines;   # first line don't need to handle any more
-                    for (@lines) {
-                        if (/^"(.*)"\s*$/) {
-                            $$msg .= $1;
-                        }
-                    }
-                }
-
-                # convert \\n back to \n
-                $$msg =~ s/(?!\\)\\n/\n/g;
-            }
-        }
+        # The PO properties at the top are always fine to leave as-is
+        next if not length $entry->dequote($msgid);
 
-        $Lexicon{$msgid} = $msgstr;
-        $Header{$msgid}  = $msghdr;
-    }
-
-    my $is_english = ( $lang =~ /^en(?:[^A-Za-z]|$)/ );
-
-    foreach my $str ( keys %FILECAT ) {
-        $Lexicon{$str} ||= '';
-    }
-    foreach ( sort keys %Lexicon ) {
-        my $f = join ( ' ', sort map $_->[0].":".$_->[1], @{ $FILECAT{$_} } );
-        my $nospace = $_;
-        $nospace =~ s/ +$//;
-
-        if ( !$Lexicon{$_} and $Lexicon{$nospace} ) {
-            $Lexicon{$_} =
-              $Lexicon{$nospace} . ( ' ' x ( length($_) - length($nospace) ) );
+        # Not found in source?  Drop it
+        my $source = $FILECAT{$msgid};
+        if (not $source) {
+            delete $lexicon->{$msgid};
+            next;
         }
 
-        next if !length( $Lexicon{$_} ) and $is_english;
-
-        my %seen;
-        $out .= $Header{$_} if exists $Header{$_};
+        # Pull in the properties from the source
+        $entry->reference( $source->reference );
+        $entry->automatic( $source->automatic );
 
+        my $fail = validate_msgstr($lang,
+                                   map {$entry->dequote($_)}
+                                       $entry->msgid, $entry->msgstr);
+        next unless $fail;
+        print "\n" unless $errors++;
+        print $fail;
+    }
 
+    my @order = map {$_->[0]}
+                sort {$a->[1] cmp $b->[1]}
+                map {[$_, $_->dequote($_->msgid)]}
+                values %{$lexicon};
 
-        next if (!$f && $_ && !$Lexicon{$_});
-        if ( $f && $f !~ /^\s+$/ ) {
+    Locale::PO->save_file_fromarray($file, \@order, "utf-8")
+          or die "Couldn't update '$file': $!";
 
-            $out .= "#: $f\n";
-        }
-        elsif ($_) {
-            $out .= "#: NOT FOUND IN SOURCE\n";
-        }
-        foreach my $entry ( sort { $a->[2] cmp $b->[2] } grep { $_->[2] } @{ $FILECAT{$_} } ) {
-            my ( $file, $line, $var ) = @{$entry};
-            $var =~ s/^\s*,\s*//;
-            $var =~ s/\s*$//;
-            $out .= "#. ($var)\n" unless $seen{$var}++;
-        }
-        $out .= 'msgid ' . fmt($_) . "msgstr \"$Lexicon{$_}\"\n\n";
+    if ($errors) {
+        print "\n";
+    } else {
+        print "\r", " "x100, "\r";
     }
-
-    open PO, '>', $file or die "Couldn't open '$file' for writing: $!";
-    print PO $out;
-    close PO;
-
     return 1;
 }
 
-sub escape {
-    my $text = shift;
-    $text =~ s/\b_(\d+)/%$1/;
-    return $text;
-}
+sub validate_msgstr {
+    my $lang   = shift;
+    my $msgid  = shift;
+    my $msgstr = shift;
 
-sub fmt {
-    my $str = shift;
-    return "\"$str\"\n" unless $str =~ /\n/;
+    return if not defined $msgstr or $msgstr eq ''; # no translation for this string
 
-    my $multi_line = ($str =~ /\n(?!\z)/);
-    $str =~ s/\n/\\n"\n"/g;
+    # we uniq because a string can use a placeholder more than once
+    # (eg %1 %quant(%1, ...) like in our czech localization
+    my @expected_variables = uniq($msgid =~ /%\d+/g);
+    my @got_variables = uniq($msgstr =~ /%\d+/g);
 
-    if ($str =~ /\n"$/) {
-        chop $str;
-    }
-    else {
-        $str .= "\"\n";
-    }
-    return $multi_line ? qq(""\n"$str) : qq("$str);
+    # this catches the case where expected uses %1,%2 and got uses %1,%3
+    # unlike a simple @expected_variables == @got_variables
+    my $expected = join ", ", sort @expected_variables;
+    my $got      = join ", ", sort @got_variables;
+    return if $expected eq $got;
+
+    return "  expected (" . $expected . ") in msgid: $msgid\n" .
+           "       got (" . $got      . ") in msgstr: $msgstr\n";
 }
author	Ivan Kohler <ivan@freeside.biz>	2015-07-09 22:18:55 -0700
committer	Ivan Kohler <ivan@freeside.biz>	2015-07-09 22:18:55 -0700
commit	1c538bfabc2cd31f27067505f0c3d1a46cba6ef0 (patch)
tree	96922ad4459eda1e649327fd391d60c58d454c53 /rt/devel/tools/extract-message-catalog
parent	4f5619288413a185e9933088d9dd8c5afbc55dfa (diff)