diff options
-rw-r--r-- | gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm | 382 |
1 files changed, 217 insertions, 165 deletions
diff --git a/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm b/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm index 9e9cca5344d..8a1551a1640 100644 --- a/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm +++ b/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm @@ -1,145 +1,158 @@ package Encode::Alias; use strict; +use warnings; +no warnings 'redefine'; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; -our $DEBUG = 0; +our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +sub DEBUG () { 0 } use base qw(Exporter); # Public, encouraged API is exported by default -our @EXPORT = - qw ( - define_alias - find_alias - ); +our @EXPORT = + qw ( + define_alias + find_alias +); -our @Alias; # ordered matching list -our %Alias; # cached known aliases +our @Alias; # ordered matching list +our %Alias; # cached known aliases -sub find_alias -{ +sub find_alias { my $class = shift; - local $_ = shift; - unless (exists $Alias{$_}) - { - $Alias{$_} = undef; # Recursion guard - for (my $i=0; $i < @Alias; $i += 2) - { - my $alias = $Alias[$i]; - my $val = $Alias[$i+1]; - my $new; - if (ref($alias) eq 'Regexp' && $_ =~ $alias) - { - $DEBUG and warn "eval $val"; - $new = eval $val; - # $@ and warn "$val, $@"; - } - elsif (ref($alias) eq 'CODE') - { - $DEBUG and warn "$alias", "->", "($val)"; - $new = $alias->($val); - } - elsif (lc($_) eq lc($alias)) - { - $new = $val; - } - if (defined($new)) - { - next if $new eq $_; # avoid (direct) recursion on bugs - $DEBUG and warn "$alias, $new"; - my $enc = (ref($new)) ? $new : Encode::find_encoding($new); - if ($enc) - { - $Alias{$_} = $enc; - last; - } - } - } + my $find = shift; + unless ( exists $Alias{$find} ) { + $Alias{$find} = undef; # Recursion guard + for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { + my $alias = $Alias[$i]; + my $val = $Alias[ $i + 1 ]; + my $new; + if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { + DEBUG and warn "eval $val"; + $new = eval $val; + DEBUG and $@ and warn "$val, $@"; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "$alias", "->", "($find)"; + $new = $alias->($find); + } + elsif ( lc($find) eq lc($alias) ) { + $new = $val; + } + if ( defined($new) ) { + next if $new eq $find; # avoid (direct) recursion on bugs + DEBUG and warn "$alias, $new"; + my $enc = + ( ref($new) ) ? $new : Encode::find_encoding($new); + if ($enc) { + $Alias{$find} = $enc; + last; + } + } + } + + # case insensitive search when canonical is not in all lowercase + # RT ticket #7835 + unless ( $Alias{$find} ) { + my $lcfind = lc($find); + for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) + { + $lcfind eq lc($name) or next; + $Alias{$find} = Encode::find_encoding($name); + DEBUG and warn "$find => $name"; + } + } } - if ($DEBUG){ - my $name; - if (my $e = $Alias{$_}){ - $name = $e->name; - }else{ - $name = ""; - } - warn "find_alias($class, $_)->name = $name"; + if (DEBUG) { + my $name; + if ( my $e = $Alias{$find} ) { + $name = $e->name; + } + else { + $name = ""; + } + warn "find_alias($class, $find)->name = $name"; } - return $Alias{$_}; + return $Alias{$find}; } -sub define_alias -{ - while (@_) - { - my ($alias,$name) = splice(@_,0,2); - unshift(@Alias, $alias => $name); # newer one has precedence - # clear %Alias cache to allow overrides - if (ref($alias)){ - my @a = keys %Alias; - for my $k (@a){ - if (ref($alias) eq 'Regexp' && $k =~ $alias) - { - $DEBUG and warn "delete \$Alias\{$k\}"; - delete $Alias{$k}; - } - elsif (ref($alias) eq 'CODE') - { - $DEBUG and warn "delete \$Alias\{$k\}"; - delete $Alias{$alias->($name)}; - } - } - }else{ - $DEBUG and warn "delete \$Alias\{$alias\}"; - delete $Alias{$alias}; - } +sub define_alias { + while (@_) { + my ( $alias, $name ) = splice( @_, 0, 2 ); + unshift( @Alias, $alias => $name ); # newer one has precedence + if ( ref($alias) ) { + + # clear %Alias cache to allow overrides + my @a = keys %Alias; + for my $k (@a) { + if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{$k}; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{ $alias->($name) }; + } + } + } + else { + DEBUG and warn "delete \$Alias\{$alias\}"; + delete $Alias{$alias}; + } } } # Allow latin-1 style names as well - # 0 1 2 3 4 5 6 7 8 9 10 +# 0 1 2 3 4 5 6 7 8 9 10 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); + # Allow winlatin1 style names as well -our %Winlatin2cp = ( - 'latin1' => 1252, - 'latin2' => 1250, - 'cyrillic' => 1251, - 'greek' => 1253, - 'turkish' => 1254, - 'hebrew' => 1255, - 'arabic' => 1256, - 'baltic' => 1257, - 'vietnamese' => 1258, - ); +our %Winlatin2cp = ( + 'latin1' => 1252, + 'latin2' => 1250, + 'cyrillic' => 1251, + 'greek' => 1253, + 'turkish' => 1254, + 'hebrew' => 1255, + 'arabic' => 1256, + 'baltic' => 1257, + 'vietnamese' => 1258, +); init_aliases(); -sub undef_aliases{ +sub undef_aliases { @Alias = (); %Alias = (); } -sub init_aliases -{ +sub init_aliases { undef_aliases(); # Try all-lower-case version should all else fails define_alias( qr/^(.*)$/ => '"\L$1"' ); # UTF/UCS stuff - define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); - define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', - qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', - qr/^iso-10646-1$/i => '"UCS-2BE"' ); - define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"', - qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"', - qr/^UTF(16|32)$/i => '"UTF-$1"', - ); + define_alias( qr/^UTF-?7$/i => '"UTF-7"' ); + define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); + define_alias( + qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', + qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', + qr/^iso-10646-1$/i => '"UCS-2BE"' + ); + define_alias( + qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', + qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', + qr/^UTF-?(16|32)$/i => '"UTF-$1"', + ); + # ASCII - define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); - define_alias('C' => 'ascii'); - define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); + define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); + define_alias( 'C' => 'ascii' ); + define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' ); + define_alias( '646' => 'ascii' ); + # Allow variants of iso-8859-1 etc. define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); @@ -147,7 +160,9 @@ sub init_aliases define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); # More HP stuff. - define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); + define_alias( + qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => + '"${1}8"' ); # The Official name of ASCII. define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); @@ -157,27 +172,33 @@ sub init_aliases # has been redefined as the euro symbol.) define_alias( qr/^(.+)\@euro$/i => '"$1"' ); - define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i - => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); - - define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| - hebrew|arabic|baltic|vietnamese)$/ix => - '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); - - # Common names for non-latin prefered MIME names - define_alias( 'ascii' => 'US-ascii', - 'cyrillic' => 'iso-8859-5', - 'arabic' => 'iso-8859-6', - 'greek' => 'iso-8859-7', - 'hebrew' => 'iso-8859-8', - 'thai' => 'iso-8859-11', - 'tis620' => 'iso-8859-11', - ); + define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => +'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' + ); + + define_alias( + qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| + hebrew|arabic|baltic|vietnamese)$/ix => + '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' + ); + + # Common names for non-latin preferred MIME names + define_alias( + 'ascii' => 'US-ascii', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + ); + # RT #20781 + define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. # And Microsoft has their own naming (again, surprisingly). - # And windows-* is registered in IANA! - define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); + # And windows-* is registered in IANA! + define_alias( + qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); # Sometimes seen with a leading zero. # define_alias( qr/\bcp037\b/i => '"cp37"'); @@ -185,47 +206,56 @@ sub init_aliases # Mac Mappings # predefined in *.ucm; unneeded # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); - define_alias( qr/^mac_(.*)$/i => '"mac$1"'); + define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); + # Ououououou. gone. They are differente! # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); - + # Standardize on the dashed versions. - # define_alias( qr/\butf8$/i => 'utf-8' ); - define_alias( qr/\bkoi8r$/i => 'koi8-r' ); - define_alias( qr/\bkoi8u$/i => 'koi8-u' ); + define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); + + unless ($Encode::ON_EBCDIC) { - unless ($Encode::ON_EBCDIC){ # for Encode::CN - define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); - define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); - # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) - # CP936 doesn't have vendor-addon for GBK, so they're identical. - define_alias( qr/^gbk$/i => '"cp936"'); - # This fixes gb2312 vs. euc-cn confusion, practically - define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' ); - # for Encode::JP - define_alias( qr/\bjis$/i => '"7bit-jis"' ); - define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); - define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); - define_alias( qr/\bujis$/i => '"euc-jp"' ); - define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); - define_alias( qr/\bsjis$/i => '"shiftjis"' ); + define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); + define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); + + # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) + # CP936 doesn't have vendor-addon for GBK, so they're identical. + define_alias( qr/^gbk$/i => '"cp936"' ); + + # This fixes gb2312 vs. euc-cn confusion, practically + define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); + + # for Encode::JP + define_alias( qr/\bjis$/i => '"7bit-jis"' ); + define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); + define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); + define_alias( qr/\bujis$/i => '"euc-jp"' ); + define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); + define_alias( qr/\bsjis$/i => '"shiftjis"' ); + define_alias( qr/\bwindows-31j$/i => '"cp932"' ); + # for Encode::KR - define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); - define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); - # This fixes ksc5601 vs. euc-kr confusion, practically - define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); - define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); - define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); + define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); + define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); + + # This fixes ksc5601 vs. euc-kr confusion, practically + define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); + define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); + define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); + # for Encode::TW - define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); - define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' ); - define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); - define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); - define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); + define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); + define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); + define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); } + # utf8 is blessed :) - define_alias( qr/^UTF-8$/i => '"utf8"',); + define_alias( qr/^UTF-8$/i => '"utf-8-strict"' ); + # At last, Map white space and _ to '-' define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); } @@ -277,25 +307,47 @@ in order to allow C<$1> etc. to be substituted. The example is one way to alias names as used in X11 fonts to the MIME names for the iso-8859-* family. Note the double quotes inside the single quotes. +(or, you don't have to do this yourself because this example is predefined) + If you are using a regex here, you have to use the quotes as shown or it won't work. Also note that regex handling is tricky even for the -experienced. Use it with caution. +experienced. Use this feature with caution. =item As a code reference, e.g.: - define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , ''); + define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); -In this case, C<$_> will be set to the name that is being looked up and -I<ENCODING> is passed to the sub as its first argument. The example -is another way to alias names as used in X11 fonts to the MIME names -for the iso-8859-* family. +The same effect as the example above in a different way. The coderef +takes the alias name as an argument and returns a canonical name on +success or undef if not. Note the second argument is not required. +Use this with even more caution than the regex version. =back +=head3 Changes in code reference aliasing + +As of Encode 1.87, the older form + + define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); + +no longer works. + +Encode up to 1.86 internally used "local $_" to implement ths older +form. But consider the code below; + + use Encode; + $_ = "eeeee" ; + while (/(e)/g) { + my $utf = decode('aliased-encoding-name', $1); + print "position:",pos,"\n"; + } + +Prior to Encode 1.86 this fails because of "local $_". + =head2 Alias overloading You can override predefined aliases by simply applying define_alias(). -The new alias is always evaluated first, and when neccessary, +The new alias is always evaluated first, and when necessary, define_alias() flushes the internal cache to make the new definition available. |