summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm382
1 files changed, 217 insertions, 165 deletions
diff --git a/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm b/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm
index 9e9cca5344d..8a1551a1640 100644
--- a/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm
+++ b/gnu/usr.bin/perl/ext/Encode/lib/Encode/Alias.pm
@@ -1,145 +1,158 @@
package Encode::Alias;
use strict;
+use warnings;
+no warnings 'redefine';
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-our $DEBUG = 0;
+our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
+sub DEBUG () { 0 }
use base qw(Exporter);
# Public, encouraged API is exported by default
-our @EXPORT =
- qw (
- define_alias
- find_alias
- );
+our @EXPORT =
+ qw (
+ define_alias
+ find_alias
+);
-our @Alias; # ordered matching list
-our %Alias; # cached known aliases
+our @Alias; # ordered matching list
+our %Alias; # cached known aliases
-sub find_alias
-{
+sub find_alias {
my $class = shift;
- local $_ = shift;
- unless (exists $Alias{$_})
- {
- $Alias{$_} = undef; # Recursion guard
- for (my $i=0; $i < @Alias; $i += 2)
- {
- my $alias = $Alias[$i];
- my $val = $Alias[$i+1];
- my $new;
- if (ref($alias) eq 'Regexp' && $_ =~ $alias)
- {
- $DEBUG and warn "eval $val";
- $new = eval $val;
- # $@ and warn "$val, $@";
- }
- elsif (ref($alias) eq 'CODE')
- {
- $DEBUG and warn "$alias", "->", "($val)";
- $new = $alias->($val);
- }
- elsif (lc($_) eq lc($alias))
- {
- $new = $val;
- }
- if (defined($new))
- {
- next if $new eq $_; # avoid (direct) recursion on bugs
- $DEBUG and warn "$alias, $new";
- my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
- if ($enc)
- {
- $Alias{$_} = $enc;
- last;
- }
- }
- }
+ my $find = shift;
+ unless ( exists $Alias{$find} ) {
+ $Alias{$find} = undef; # Recursion guard
+ for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
+ my $alias = $Alias[$i];
+ my $val = $Alias[ $i + 1 ];
+ my $new;
+ if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
+ DEBUG and warn "eval $val";
+ $new = eval $val;
+ DEBUG and $@ and warn "$val, $@";
+ }
+ elsif ( ref($alias) eq 'CODE' ) {
+ DEBUG and warn "$alias", "->", "($find)";
+ $new = $alias->($find);
+ }
+ elsif ( lc($find) eq lc($alias) ) {
+ $new = $val;
+ }
+ if ( defined($new) ) {
+ next if $new eq $find; # avoid (direct) recursion on bugs
+ DEBUG and warn "$alias, $new";
+ my $enc =
+ ( ref($new) ) ? $new : Encode::find_encoding($new);
+ if ($enc) {
+ $Alias{$find} = $enc;
+ last;
+ }
+ }
+ }
+
+ # case insensitive search when canonical is not in all lowercase
+ # RT ticket #7835
+ unless ( $Alias{$find} ) {
+ my $lcfind = lc($find);
+ for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
+ {
+ $lcfind eq lc($name) or next;
+ $Alias{$find} = Encode::find_encoding($name);
+ DEBUG and warn "$find => $name";
+ }
+ }
}
- if ($DEBUG){
- my $name;
- if (my $e = $Alias{$_}){
- $name = $e->name;
- }else{
- $name = "";
- }
- warn "find_alias($class, $_)->name = $name";
+ if (DEBUG) {
+ my $name;
+ if ( my $e = $Alias{$find} ) {
+ $name = $e->name;
+ }
+ else {
+ $name = "";
+ }
+ warn "find_alias($class, $find)->name = $name";
}
- return $Alias{$_};
+ return $Alias{$find};
}
-sub define_alias
-{
- while (@_)
- {
- my ($alias,$name) = splice(@_,0,2);
- unshift(@Alias, $alias => $name); # newer one has precedence
- # clear %Alias cache to allow overrides
- if (ref($alias)){
- my @a = keys %Alias;
- for my $k (@a){
- if (ref($alias) eq 'Regexp' && $k =~ $alias)
- {
- $DEBUG and warn "delete \$Alias\{$k\}";
- delete $Alias{$k};
- }
- elsif (ref($alias) eq 'CODE')
- {
- $DEBUG and warn "delete \$Alias\{$k\}";
- delete $Alias{$alias->($name)};
- }
- }
- }else{
- $DEBUG and warn "delete \$Alias\{$alias\}";
- delete $Alias{$alias};
- }
+sub define_alias {
+ while (@_) {
+ my ( $alias, $name ) = splice( @_, 0, 2 );
+ unshift( @Alias, $alias => $name ); # newer one has precedence
+ if ( ref($alias) ) {
+
+ # clear %Alias cache to allow overrides
+ my @a = keys %Alias;
+ for my $k (@a) {
+ if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
+ DEBUG and warn "delete \$Alias\{$k\}";
+ delete $Alias{$k};
+ }
+ elsif ( ref($alias) eq 'CODE' ) {
+ DEBUG and warn "delete \$Alias\{$k\}";
+ delete $Alias{ $alias->($name) };
+ }
+ }
+ }
+ else {
+ DEBUG and warn "delete \$Alias\{$alias\}";
+ delete $Alias{$alias};
+ }
}
}
# Allow latin-1 style names as well
- # 0 1 2 3 4 5 6 7 8 9 10
+# 0 1 2 3 4 5 6 7 8 9 10
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
+
# Allow winlatin1 style names as well
-our %Winlatin2cp = (
- 'latin1' => 1252,
- 'latin2' => 1250,
- 'cyrillic' => 1251,
- 'greek' => 1253,
- 'turkish' => 1254,
- 'hebrew' => 1255,
- 'arabic' => 1256,
- 'baltic' => 1257,
- 'vietnamese' => 1258,
- );
+our %Winlatin2cp = (
+ 'latin1' => 1252,
+ 'latin2' => 1250,
+ 'cyrillic' => 1251,
+ 'greek' => 1253,
+ 'turkish' => 1254,
+ 'hebrew' => 1255,
+ 'arabic' => 1256,
+ 'baltic' => 1257,
+ 'vietnamese' => 1258,
+);
init_aliases();
-sub undef_aliases{
+sub undef_aliases {
@Alias = ();
%Alias = ();
}
-sub init_aliases
-{
+sub init_aliases {
undef_aliases();
# Try all-lower-case version should all else fails
define_alias( qr/^(.*)$/ => '"\L$1"' );
# UTF/UCS stuff
- define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
- define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
- qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
- qr/^iso-10646-1$/i => '"UCS-2BE"' );
- define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"',
- qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"',
- qr/^UTF(16|32)$/i => '"UTF-$1"',
- );
+ define_alias( qr/^UTF-?7$/i => '"UTF-7"' );
+ define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
+ define_alias(
+ qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
+ qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
+ qr/^iso-10646-1$/i => '"UCS-2BE"'
+ );
+ define_alias(
+ qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
+ qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
+ qr/^UTF-?(16|32)$/i => '"UTF-$1"',
+ );
+
# ASCII
- define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
- define_alias('C' => 'ascii');
- define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
+ define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
+ define_alias( 'C' => 'ascii' );
+ define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );
+ define_alias( '646' => 'ascii' );
+
# Allow variants of iso-8859-1 etc.
define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
@@ -147,7 +160,9 @@ sub init_aliases
define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
# More HP stuff.
- define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
+ define_alias(
+ qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
+ '"${1}8"' );
# The Official name of ASCII.
define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
@@ -157,27 +172,33 @@ sub init_aliases
# has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );
- define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
- => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
-
- define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
- hebrew|arabic|baltic|vietnamese)$/ix =>
- '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
-
- # Common names for non-latin prefered MIME names
- define_alias( 'ascii' => 'US-ascii',
- 'cyrillic' => 'iso-8859-5',
- 'arabic' => 'iso-8859-6',
- 'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8',
- 'thai' => 'iso-8859-11',
- 'tis620' => 'iso-8859-11',
- );
+ define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
+'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
+ );
+
+ define_alias(
+ qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
+ hebrew|arabic|baltic|vietnamese)$/ix =>
+ '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
+ );
+
+ # Common names for non-latin preferred MIME names
+ define_alias(
+ 'ascii' => 'US-ascii',
+ 'cyrillic' => 'iso-8859-5',
+ 'arabic' => 'iso-8859-6',
+ 'greek' => 'iso-8859-7',
+ 'hebrew' => 'iso-8859-8',
+ 'thai' => 'iso-8859-11',
+ );
+ # RT #20781
+ define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
- # And windows-* is registered in IANA!
- define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
+ # And windows-* is registered in IANA!
+ define_alias(
+ qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
# Sometimes seen with a leading zero.
# define_alias( qr/\bcp037\b/i => '"cp37"');
@@ -185,47 +206,56 @@ sub init_aliases
# Mac Mappings
# predefined in *.ucm; unneeded
# define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
- define_alias( qr/^mac_(.*)$/i => '"mac$1"');
+ define_alias( qr/^mac_(.*)$/i => '"mac$1"' );
+
# Ououououou. gone. They are differente!
# define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
-
+
# Standardize on the dashed versions.
- # define_alias( qr/\butf8$/i => 'utf-8' );
- define_alias( qr/\bkoi8r$/i => 'koi8-r' );
- define_alias( qr/\bkoi8u$/i => 'koi8-u' );
+ define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
+
+ unless ($Encode::ON_EBCDIC) {
- unless ($Encode::ON_EBCDIC){
# for Encode::CN
- define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
- define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
- # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
- # CP936 doesn't have vendor-addon for GBK, so they're identical.
- define_alias( qr/^gbk$/i => '"cp936"');
- # This fixes gb2312 vs. euc-cn confusion, practically
- define_alias( qr/\bGB[-_ ]?2312(?:\D.*$|$)/i => '"euc-cn"' );
- # for Encode::JP
- define_alias( qr/\bjis$/i => '"7bit-jis"' );
- define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
- define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
- define_alias( qr/\bujis$/i => '"euc-jp"' );
- define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
- define_alias( qr/\bsjis$/i => '"shiftjis"' );
+ define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
+ define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
+
+ # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
+ # CP936 doesn't have vendor-addon for GBK, so they're identical.
+ define_alias( qr/^gbk$/i => '"cp936"' );
+
+ # This fixes gb2312 vs. euc-cn confusion, practically
+ define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
+
+ # for Encode::JP
+ define_alias( qr/\bjis$/i => '"7bit-jis"' );
+ define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
+ define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
+ define_alias( qr/\bujis$/i => '"euc-jp"' );
+ define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
+ define_alias( qr/\bsjis$/i => '"shiftjis"' );
+ define_alias( qr/\bwindows-31j$/i => '"cp932"' );
+
# for Encode::KR
- define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
- define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
- # This fixes ksc5601 vs. euc-kr confusion, practically
- define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
- define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
- define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
+ define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
+ define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
+
+ # This fixes ksc5601 vs. euc-kr confusion, practically
+ define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
+ define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
+ define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
+
# for Encode::TW
- define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
- define_alias( qr/\bbig5-?et(?:en)$/i => '"big5-eten"' );
- define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
- define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
- define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
+ define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
+ define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
+ define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
+ define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
+ define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
}
+
# utf8 is blessed :)
- define_alias( qr/^UTF-8$/i => '"utf8"',);
+ define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );
+
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
}
@@ -277,25 +307,47 @@ in order to allow C<$1> etc. to be substituted. The example is one
way to alias names as used in X11 fonts to the MIME names for the
iso-8859-* family. Note the double quotes inside the single quotes.
+(or, you don't have to do this yourself because this example is predefined)
+
If you are using a regex here, you have to use the quotes as shown or
it won't work. Also note that regex handling is tricky even for the
-experienced. Use it with caution.
+experienced. Use this feature with caution.
=item As a code reference, e.g.:
- define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
+ define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
-In this case, C<$_> will be set to the name that is being looked up and
-I<ENCODING> is passed to the sub as its first argument. The example
-is another way to alias names as used in X11 fonts to the MIME names
-for the iso-8859-* family.
+The same effect as the example above in a different way. The coderef
+takes the alias name as an argument and returns a canonical name on
+success or undef if not. Note the second argument is not required.
+Use this with even more caution than the regex version.
=back
+=head3 Changes in code reference aliasing
+
+As of Encode 1.87, the older form
+
+ define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
+
+no longer works.
+
+Encode up to 1.86 internally used "local $_" to implement ths older
+form. But consider the code below;
+
+ use Encode;
+ $_ = "eeeee" ;
+ while (/(e)/g) {
+ my $utf = decode('aliased-encoding-name', $1);
+ print "position:",pos,"\n";
+ }
+
+Prior to Encode 1.86 this fails because of "local $_".
+
=head2 Alias overloading
You can override predefined aliases by simply applying define_alias().
-The new alias is always evaluated first, and when neccessary,
+The new alias is always evaluated first, and when necessary,
define_alias() flushes the internal cache to make the new definition
available.