diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2002-10-27 22:15:15 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2002-10-27 22:15:15 +0000 |
commit | 74cfb115ac810480c0000dc742b20383c1578bac (patch) | |
tree | 316d96e5123617976f1637b143570c309a662045 /gnu/usr.bin/perl/lib/I18N | |
parent | 453ade492b8e06c619009d6cd52a85cb04e8cf17 (diff) |
stock perl 5.8.0 from CPAN
Diffstat (limited to 'gnu/usr.bin/perl/lib/I18N')
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/Collate.t | 44 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/LangTags.pm | 800 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/LangTags/ChangeLog | 107 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/LangTags/List.pm | 1622 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/LangTags/README | 78 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/I18N/LangTags/test.pl | 79 |
6 files changed, 2730 insertions, 0 deletions
diff --git a/gnu/usr.bin/perl/lib/I18N/Collate.t b/gnu/usr.bin/perl/lib/I18N/Collate.t new file mode 100644 index 00000000000..bf3ba20b6aa --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/Collate.t @@ -0,0 +1,44 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require Config; import Config; + if (!$Config{d_setlocale} || $Config{ccflags} =~ /\bD?NO_LOCALE\b/) { + print "1..0\n"; + exit; + } +} + +print "1..7\n"; + +use I18N::Collate; + +print "ok 1\n"; + +$a = I18N::Collate->new("foo"); + +print "ok 2\n"; + +{ + use warnings; + local $SIG{__WARN__} = sub { $@ = $_[0] }; + $b = I18N::Collate->new("foo"); + print "not " unless $@ =~ /\bHAS BEEN DEPRECATED\b/; + print "ok 3\n"; + $@ = ''; +} + +print "not " unless $a eq $b; +print "ok 4\n"; + +$b = I18N::Collate->new("bar"); +print "not " if $@ =~ /\bHAS BEEN DEPRECATED\b/; +print "ok 5\n"; + +print "not " if $a eq $b; +print "ok 6\n"; + +print "not " if $a lt $b == $a gt $b; +print "ok 7\n"; + diff --git a/gnu/usr.bin/perl/lib/I18N/LangTags.pm b/gnu/usr.bin/perl/lib/I18N/LangTags.pm new file mode 100644 index 00000000000..ab5ef38245e --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/LangTags.pm @@ -0,0 +1,800 @@ + +# Time-stamp: "2002-02-02 20:43:03 MST" +# Sean M. 
Burke <sburke@cpan.org> + +require 5.000; +package I18N::LangTags; +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag panic_languages + ); +%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); + +$VERSION = "0.27"; + +=head1 NAME + +I18N::LangTags - functions for dealing with RFC3066-style language tags + +=head1 SYNOPSIS + + use I18N::LangTags qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag panic_languages + ); + +...or whatever of those functions you want to import. Those are +all the exportable functions -- you're free to import only some, +or none at all. By default, none are imported. If you say: + + use I18N::LangTags qw(:ALL) + +...then all are exported. (This saves you from having to use +something less obvious like C<use I18N::LangTags qw(/./)>.) + +If you don't import any of these functions, assume a C<&I18N::LangTags::> +in front of all the function names in the following examples. + +=head1 DESCRIPTION + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags as they are needed in a variety of protocols and applications. + +Please see the "See Also" references for a thorough explanation +of how to correctly use language tags. + +=over + +=cut + +########################################################################### + +=item * the function is_language_tag($lang1) + +Returns true iff $lang1 is a formally valid language tag. 
+ + is_language_tag("fr") is TRUE + is_language_tag("x-jicarilla") is FALSE + (Subtags can be 8 chars long at most -- 'jicarilla' is 9) + + is_language_tag("sgn-US") is TRUE + (That's American Sign Language) + + is_language_tag("i-Klikitat") is TRUE + (True without regard to the fact noone has actually + registered Klikitat -- it's a formally valid tag) + + is_language_tag("fr-patois") is TRUE + (Formally valid -- altho descriptively weak!) + + is_language_tag("Spanish") is FALSE + is_language_tag("french-patois") is FALSE + (No good -- first subtag has to match + /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) + + is_language_tag("x-borg-prot2532") is TRUE + (Yes, subtags can contain digits, as of RFC3066) + +=cut + +sub is_language_tag { + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = lc($_[0]); + + return 0 if $tag eq "i" or $tag eq "x"; + # Bad degenerate cases that the following + # regexp would erroneously let pass + + return $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs ? 1 : 0; +} + +########################################################################### + +=item * the function extract_language_tags($whatever) + +Returns a list of whatever looks like formally valid language tags +in $whatever. Not very smart, so don't get too creative with +what you want to feed it. + + extract_language_tags("fr, fr-ca, i-mingo") + returns: ('fr', 'fr-ca', 'i-mingo') + + extract_language_tags("It's like this: I'm in fr -- French!") + returns: ('It', 'in', 'fr') + (So don't just feed it any old thing.) + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub extract_language_tags { + + ## Changes in the language tagging standards may have to be reflected here. + + my($text) = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? 
$1 : '' + ; + + return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags + $text =~ + m/ + \b + (?: # First subtag + [iIxX] | [a-zA-Z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-zA-Z0-9]{1,8} # subtag + )* + \b + /xsg + ); +} + +########################################################################### + +=item * the function same_language_tag($lang1, $lang2) + +Returns true iff $lang1 and $lang2 are acceptable variant tags +representing the same language-form. + + same_language_tag('x-kadara', 'i-kadara') is TRUE + (The x/i- alternation doesn't matter) + same_language_tag('X-KADARA', 'i-kadara') is TRUE + (...and neither does case) + same_language_tag('en', 'en-US') is FALSE + (all-English is not the SAME as US English) + same_language_tag('x-kadara', 'x-kadar') is FALSE + (these are totally unrelated tags) + same_language_tag('no-bok', 'nb') is TRUE + (no-bok is a legacy tag for nb (Norwegian Bokmal)) + +C<same_language_tag> works by just seeing whether +C<encode_language_tag($lang1)> is the same as +C<encode_language_tag($lang2)>. + +(Yes, I know this function is named a bit oddly. Call it historic +reasons.) + +=cut + +sub same_language_tag { + my $el1 = &encode_language_tag($_[0]); + return 0 unless defined $el1; + # this avoids the problem of + # encode_language_tag($lang1) eq and encode_language_tag($lang2) + # being true if $lang1 and $lang2 are both undef + + return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; +} + +########################################################################### + +=item * the function similarity_language_tag($lang1, $lang2) + +Returns an integer representing the degree of similarity between +tags $lang1 and $lang2 (the order of which does not matter), where +similarity is the number of common elements on the left, +without regard to case and to x/i- alternation. 
+ + similarity_language_tag('fr', 'fr-ca') is 1 + (one element in common) + similarity_language_tag('fr-ca', 'fr-FR') is 1 + (one element in common) + + similarity_language_tag('fr-CA-joual', + 'fr-CA-PEI') is 2 + similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 + (two elements in common) + + similarity_language_tag('x-kadara', 'i-kadara') is 1 + (x/i- doesn't matter) + + similarity_language_tag('en', 'x-kadar') is 0 + similarity_language_tag('x-kadara', 'x-kadar') is 0 + (unrelated tags -- no similarity) + + similarity_language_tag('i-cree-syllabic', + 'i-cherokee-syllabic') is 0 + (no B<leftmost> elements in common!) + +=cut + +sub similarity_language_tag { + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + # And encode_language_tag takes care of the whole + # no-nyn==nn, i-hakka==zh-hakka, etc, things + + # NB: (i-sil-...)? (i-sgn-...)? + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + my @l1_subtags = split('-', $lang1); + my @l2_subtags = split('-', $lang2); + my $similarity = 0; + + while(@l1_subtags and @l2_subtags) { + if(shift(@l1_subtags) eq shift(@l2_subtags)) { + ++$similarity; + } else { + last; + } + } + return $similarity; +} + +########################################################################### + +=item * the function is_dialect_of($lang1, $lang2) + +Returns true iff language tag $lang1 represents a subform of +language tag $lang2. + +B<Get the order right! 
It doesn't work the other way around!> + + is_dialect_of('en-US', 'en') is TRUE + (American English IS a dialect of all-English) + + is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE + is_dialect_of('fr-CA-joual', 'fr') is TRUE + (Joual is a dialect of (a dialect of) French) + + is_dialect_of('en', 'en-US') is FALSE + (all-English is a NOT dialect of American English) + + is_dialect_of('fr', 'en-CA') is FALSE + + is_dialect_of('en', 'en' ) is TRUE + is_dialect_of('en-US', 'en-US') is TRUE + (B<Note:> these are degenerate cases) + + is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE + (the x/i thing doesn't matter, nor does case) + + is_dialect_of('nn', 'no') is TRUE + (because 'nn' (New Norse) is aliased to 'no-nyn', + as a special legacy case, and 'no-nyn' is a + subform of 'no' (Norwegian)) + +=cut + +sub is_dialect_of { + + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + return 1 if $lang1 eq $lang2; + return 0 if length($lang1) < length($lang2); + + $lang1 .= '-'; + $lang2 .= '-'; + return + (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; +} + +########################################################################### + +=item * the function super_languages($lang1) + +Returns a list of language tags that are superordinate tags to $lang1 +-- it gets this by removing subtags from the end of $lang1 until +nothing (or just "i" or "x") is left. + + super_languages("fr-CA-joual") is ("fr-CA", "fr") + + super_languages("en-AU") is ("en") + + super_languages("en") is empty-list, () + + super_languages("i-cherokee") is empty-list, () + ...not ("i"), which would be illegal as well as pointless. + +If $lang1 is not a valid language tag, returns empty-list in +a list context, undef in a scalar context. 
+ +A notable and rather unavoidable problem with this method: +"x-mingo-tom" has an "x" because the whole tag isn't an +IANA-registered tag -- but super_languages('x-mingo-tom') is +('x-mingo') -- which isn't really right, since 'i-mingo' is +registered. But this module has no way of knowing that. (But note +that same_language_tag('x-mingo', 'i-mingo') is TRUE.) + +More importantly, you assume I<at your peril> that superordinates of +$lang1 are mutually intelligible with $lang1. Consider this +carefully. + +=cut + +sub super_languages { + my $lang1 = $_[0]; + return() unless defined($lang1) && &is_language_tag($lang1); + + # a hack for those annoying new (2001) tags: + $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards + $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards + $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way + # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark + + my @l1_subtags = split('-', $lang1); + + ## Changes in the language tagging standards may have to be reflected here. + + # NB: (i-sil-...)? + + my @supers = (); + foreach my $bit (@l1_subtags) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + pop @supers if @supers; + shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; + return reverse @supers; +} + +########################################################################### + +=item * the function locale2language_tag($locale_identifier) + +This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") +and maps it to a language tag. If it's not mappable (as with, +notably, "C" and "POSIX"), this returns empty-list in a list context, +or undef in a scalar context. 
+ + locale2language_tag("en") is "en" + + locale2language_tag("en_US") is "en-US" + + locale2language_tag("en_US.ISO8859-1") is "en-US" + + locale2language_tag("C") is undef or () + + locale2language_tag("POSIX") is undef or () + + locale2language_tag("POSIX") is undef or () + +I'm not totally sure that locale names map satisfactorily to language +tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub locale2language_tag { + my $lang = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return $lang if &is_language_tag($lang); # like "en" + + $lang =~ tr<_><->; # "en_US" -> en-US + $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US + + return $lang if &is_language_tag($lang); + + return; +} + +########################################################################### + +=item * the function encode_language_tag($lang1) + +This function, if given a language tag, returns an encoding of it such +that: + +* tags representing different languages never get the same encoding. + +* tags representing the same language always get the same encoding. + +* an encoding of a formally valid language tag always is a string +value that is defined, has length, and is true if considered as a +boolean. + +Note that the encoding itself is B<not> a formally valid language tag. +Note also that you cannot, currently, go from an encoding back to a +language tag that it's an encoding of. + +Note also that you B<must> consider the encoded value as atomic; i.e., +you should not consider it as anything but an opaque, unanalysable +string value. (The internals of the encoding method may change in +future versions, as the language tagging standard changes over time.) + +C<encode_language_tag> returns undef if given anything other than a +formally valid language tag. 
+ +The reason C<encode_language_tag> exists is because different language +tags may represent the same language; this is normally treatable with +C<same_language_tag>, but consider this situation: + +You have a data file that expresses greetings in different languages. +Its format is "[language tag]=[how to say 'Hello']", like: + + en-US=Hiho + fr=Bonjour + i-mingo=Hau' + +And suppose you write a program that reads that file and then runs as +a daemon, answering client requests that specify a language tag and +then expect the string that says how to greet in that language. So an +interaction looks like: + + greeting-client asks: fr + greeting-server answers: Bonjour + +So far so good. But suppose the way you're implementing this is: + + my %greetings; + die unless open(IN, "<in.dat"); + while(<IN>) { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{$lang} = $expr; + } + close(IN); + +at which point %greetings has the contents: + + "en-US" => "Hiho" + "fr" => "Bonjour" + "i-mingo" => "Hau'" + +And suppose then that you answer client requests for language $wanted +by just looking up $greetings{$wanted}. + +If the client asks for "fr", that will look up successfully in +%greetings, to the value "Bonjour". And if the client asks for +"i-mingo", that will look up successfully in %greetings, to the value +"Hau'". + +But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the +lookup in %greetings fails. That's the Wrong Thing. + +You could instead do lookups on $wanted with: + + use I18N::LangTags qw(same_language_tag); + my $response = ''; + foreach my $l2 (keys %greetings) { + if(same_language_tag($wanted, $l2)) { + $response = $greetings{$l2}; + last; + } + } + +But that's rather inefficient. 
A better way to do it is to start your +program with: + + use I18N::LangTags qw(encode_language_tag); + my %greetings; + die unless open(IN, "<in.dat"); + while(<IN>) { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{ + encode_language_tag($lang) + } = $expr; + } + close(IN); + +and then just answer client requests for language $wanted by just +looking up + + $greetings{encode_language_tag($wanted)} + +And that does the Right Thing. + +=cut + +sub encode_language_tag { + # Only similarity_language_tag() is allowed to analyse encodings! + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = $_[0] || return undef; + return undef unless &is_language_tag($tag); + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + $tag =~ s/^iw\b/he/i; # Hebrew + $tag =~ s/^in\b/id/i; # Indonesian + $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger + $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo + $tag =~ s/^ji\b/yi/i; # Yiddish + # + # These go FROM the simplex to complex form, to get + # similarity-comparison right. And that's okay, since + # similarity_language_tag is the only thing that + # analyzes our output. + $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka + $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal + $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk + + $tag =~ s/^[xiXI]-//s; + # Just lop off any leading "x/i-" + + return "~" . uc($tag); +} + +#-------------------------------------------------------------------------- + +=item * the function alternate_language_tags($lang1) + +This function, if given a language tag, returns all language tags that +are alternate forms of this language tag. (I.e., tags which refer to +the same language.) This is meant to handle legacy tags caused by +the minor changes in language tag standards over the years; and +the x-/i- alternation is also dealt with. 
+ +Note that this function does I<not> try to equate new (and never-used, +and unusable) +ISO639-2 three-letter tags to old (and still in use) ISO639-1 +two-letter equivalents -- like "ara" -> "ar" -- because +"ara" has I<never> been in use as an Internet language tag, +and RFC 3066 stipulates that it never should be, since a shorter +tag ("ar") exists. + +Examples: + + alternate_language_tags('no-bok') is ('nb') + alternate_language_tags('nb') is ('no-bok') + alternate_language_tags('he') is ('iw') + alternate_language_tags('iw') is ('he') + alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') + alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') + alternate_language_tags('en') is () + alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') + alternate_language_tags('x-klikitat') is ('i-klikitat') + alternate_language_tags('i-klikitat') is ('x-klikitat') + +This function returns empty-list if given anything other than a formally +valid language tag. + +=cut + +my %alt = qw( i x x i I X X I ); +sub alternate_language_tags { + my $tag = $_[0]; + return() unless &is_language_tag($tag); + + my @em; # push 'em real goood! + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. 
+ + if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; + } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; + + } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; + } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; + + } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; + } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; + + } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; + } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; + + } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; + } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; + + } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; + } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; + + } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; + } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; + + } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; + } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; + } + + push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; + return @em; +} + +########################################################################### + +{ + # Init %Panic... + + my @panic = ( # MUST all be lowercase! + # Only large ("national") languages make it in this list. + # If you, as a user, are so bizarre that the /only/ language + # you claim to accept is Galician, then no, we won't do you + # the favor of providing Catalan as a panic-fallback for + # you. Because if I start trying to add "little languages" in + # here, I'll just go crazy. + + # Scandinavian lgs. All based on opinion and hearsay. + 'sv' => [qw(nb no da nn)], + 'da' => [qw(nb no sv nn)], # I guess + [qw(no nn nb)], [qw(no nn nb sv da)], + 'is' => [qw(da sv no nb nn)], + 'fo' => [qw(da is no nb nn sv)], # I guess + + # I think this is about the extent of tolerable intelligibility + # among large modern Romance languages. 
+ 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French + 'ca' => [qw(es pt it fr)], + 'es' => [qw(ca it fr pt)], + 'it' => [qw(es fr ca pt)], + 'fr' => [qw(es it ca pt)], + + # Also assume that speakers of the main Indian languages prefer + # to read/hear Hindi over English + [qw( + as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur + )] => 'hi', + # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, + # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, + # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. + 'hi' => [qw(bn pa as or)], + # I welcome finer data for the other Indian languages. + # E.g., what should Oriya's list be, besides just Hindi? + + # And the panic languages for English is, of course, nil! + + # My guesses at Slavic intelligibility: + ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian + 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat + 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak + + 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian + + 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish + + #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai + + ); + my($k,$v); + while(@panic) { + ($k,$v) = splice(@panic,0,2); + foreach my $k (ref($k) ? @$k : $k) { + foreach my $v (ref($v) ? @$v : $v) { + push @{$Panic{$k} ||= []}, $v unless $k eq $v; + } + } + } +} + +=item * the function @langs = panic_languages(@accept_languages) + +This function takes a list of 0 or more language +tags that constitute a given user's Accept-Language list, and +returns a list of tags for I<other> (non-super) +languages that are probably acceptable to the user, to be +used I<if all else fails>. + +For example, if a user accepts only 'ca' (Catalan) and +'es' (Spanish), and the documents/interfaces you have +available are just in German, Italian, and Chinese, then +the user will most likely want the Italian one (and not +the Chinese or German one!), instead of getting +nothing. 
So C<panic_languages('ca', 'es')> returns +a list containing 'it' (Italian). + +English ('en') is I<always> in the return list, but +whether it's at the very end or not depends +on the input languages. This function works by consulting +an internal table that stipulates what common +languages are "close" to each other. + +A useful construct you might consider using is: + + @fallbacks = super_languages(@accept_languages); + push @fallbacks, panic_languages( + @accept_languages, @fallbacks, + ); + +=cut + +sub panic_languages { + # When in panic or in doubt, run in circles, scream, and shout! + my(@out, %seen); + foreach my $t (@_) { + next unless $t; + next if $seen{$t}++; # so we don't return it or hit it again + # push @out, super_languages($t); # nah, keep that separate + push @out, @{ $Panic{lc $t} || next }; + } + return grep !$seen{$_}++, @out, 'en'; +} + +########################################################################### +1; +__END__ + +=back + +=head1 ABOUT LOWERCASING + +I've considered making all the above functions that output language +tags return all those tags strictly in lowercase. Having all your +language tags in lowercase does make some things easier. But you +might as well just lowercase as you like, or call +C<encode_language_tag($lang1)> where appropriate. + +=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS + +In some future version of I18N::LangTags, I plan to include support +for RFC2482-style language tags -- which are basically just normal +language tags with their ASCII characters shifted into Plane 14. + +=head1 SEE ALSO + +* L<I18N::LangTags::List|I18N::LangTags::List> + +* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the +Identification of Languages". (Obsoletes RFC 1766) + +* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on +Character Sets and Languages". 
+ +* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter +Value and Encoded Word Extensions: Character Sets, Languages, and +Continuations". + +* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, +"Language Tagging in Unicode Plain Text". + +* Locale::Codes, in +C<http://www.perl.com/CPAN/modules/by-module/Locale/> + +* ISO 639, "Code for the representation of names of languages", +C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html> + +* ISO 639-2, "Codes for the representation of names of languages", +including three-letter codes, +C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html> + +* The IANA list of registered languages (hopefully up-to-date), +C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/> + +=head1 COPYRIGHT + +Copyright (c) 1998-2001 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +=head1 AUTHOR + +Sean M. Burke C<sburke@cpan.org> + +=cut + diff --git a/gnu/usr.bin/perl/lib/I18N/LangTags/ChangeLog b/gnu/usr.bin/perl/lib/I18N/LangTags/ChangeLog new file mode 100644 index 00000000000..f3608f7125e --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/LangTags/ChangeLog @@ -0,0 +1,107 @@ +Revision history for Perl module I18N::LangTags. + Time-stamp: "2002-02-02 20:45:47 MST" + +2002-02-02 Sean M. Burke sburke@cpan.org + + * Release 0.27 -- minor mods to ::List: + Fixing its entries for sv-se and sv-fi. + Typo-fixes and rewordings in the incidental Pod text elsewhere. + +2001-06-21 Sean M. Burke sburke@cpan.org + + * Release 0.26 -- just making cosmetic changes + to test.pl, at Jarkko's request. + +2001-06-20 Sean M. 
Burke sburke@cpan.org + + * Release 0.25 -- just tweaking panic_languages behavior + for Scandinavian languages. Much better now. + Slight tweak to ::List's entries for Greek. + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.24 + + * I18N::LangTags -- some elaborate hacks to make us + recognize legacy aliases like no-nyn == nn. + Added panic_languages(). + Added :ALL export tag. + Minor docs fixes, and spiffing up test.pl. + + * I18N::LangTags::List -- minor corrections; added + a few aliases. + +2001-05-29 Sean M. Burke sburke@cpan.org + + * Release 0.23 + + * I18N::LangTags::List -- minor corrections. And is now + a module, not just documentation. + +2001-05-27 Sean M. Burke sburke@cpan.org + + * Release 0.22 + + * Now bundling I18N::LangTags::List, a reference for lang tags, + replacing generate_language_table.plx and language_codes.txt + +2001-05-25 Sean M. Burke sburke@cpan.org + + * Release 0.21 + + * extract_language_tags and locale2language_tag now + return untainted output. Useful if you feed tainted + things, like $ENV{'LANG'}. + +2001-03-13 Sean M. Burke sburke@cpan.org + + * Release 0.20 + + * Added support for RFC 3066 tags: allowing three-letter primary + tags ("nav"), and allowing digits in subtags ("x-borg-prot3252"). + + * Changed all references from RFC 1766 to RFC 3066. + + * Now bundling fulltext of RFC 3066 in the dist. + + * Now bundling generate_language_table.plx and language_codes.txt + + * Added some nice tests to test.pl + + * Inverting order of listings in this ChangeLog file. + +2000-05-13 Sean M. Burke sburke@cpan.org + + * Release 0.13 + + * Just noting my new email address. + +1999-03-06 Sean M. Burke sburke@netadventure.net + + * Release 0.11 + + * Added functions + similarity_language_tag, is_dialect_of, + locale2language_tag, alternate_language_tags, and + encode_language_tag + +1998-12-14 Sean M. Burke sburke@netadventure.net + + * Release 0.09 + + * Added function super_languages() + +1998-10-31 Sean M. 
Burke sburke@netadventure.net + + * Release 0.08 + + * Just changes in the docs and bundle -- no change + in functionality. + +1998-04-02 Sean M. Burke sburke@netadventure.net + + * Release 0.07 + + * First public release. + +[END OF CHANGELOG] diff --git a/gnu/usr.bin/perl/lib/I18N/LangTags/List.pm b/gnu/usr.bin/perl/lib/I18N/LangTags/List.pm new file mode 100644 index 00000000000..2dbd19a5d78 --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/LangTags/List.pm @@ -0,0 +1,1622 @@ + +require 5; +package I18N::LangTags::List; +# Time-stamp: "2002-02-02 20:13:58 MST" +use strict; +use vars qw(%Name $Debug $VERSION); +$VERSION = '0.25'; +# POD at the end. + +#---------------------------------------------------------------------- +{ +# read the table out of our own POD! + my $seeking = 1; + my $count = 0; + my($tag,$name); + while(<I18N::LangTags::List::DATA>) { + if($seeking) { + $seeking = 0 if m/=for woohah/; + } else { + next unless ($tag, $name) = + m/\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/; + $name =~ s/\s*[;\.]*\s*$//g; + next unless $name; + ++$count; + print "<$tag> <$name>\n" if $Debug; + $Name{$tag} = $name; + } + } + die "No tags read??" unless $count; +} +#---------------------------------------------------------------------- + +sub name { + my $tag = lc($_[0] || return); + $tag =~ s/^\s+//s; + $tag =~ s/\s+$//s; + + my $alt; + if($tag =~ m/^x-(.+)/) { + $alt = "i-$1"; + } elsif($tag =~ m/^i-(.+)/) { + $alt = "x-$1"; + } else { + $alt = ''; + } + + my $subform = ''; + my $name = ''; + print "Input: {$tag}\n" if $Debug; + while(length $tag) { + last if $name = $Name{$tag}; + last if $name = $Name{$alt}; + if($tag =~ s/(-[a-z0-9]+)$//s) { + print "Shaving off: $1 leaving $tag\n" if $Debug; + $subform = "$1$subform"; + # and loop around again + + $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; + } else { + # we're trying to pull a subform off a primary tag. TILT! 
+ print "Aborting on: {$name}{$subform}\n" if $Debug; + last; + } + } + print "Output: {$name}{$subform}\n" if $Debug; + + return unless $name; # Failure + return $name unless $subform; # Exact match + $subform =~ s/^-//s; + $subform =~ s/-$//s; + return "$name (Subform \"$subform\")"; +} + +1; + +__DATA__ + +=head1 NAME + +I18N::LangTags::List -- tags and names for human languages + +=head1 SYNOPSIS + + use I18N::LangTags::List; + print "Parlez-vous... ", join(', ', + I18N::LangTags::List::name('elx') || 'unknown_language', + I18N::LangTags::List::name('ar-Kw') || 'unknown_language', + I18N::LangTags::List::name('en') || 'unknown_language', + I18N::LangTags::List::name('en-CA') || 'unknown_language', + ), "?\n"; + +prints: + + Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English? + +=head1 DESCRIPTION + +This module provides a function +C<I18N::LangTags::List::name( I<langtag> ) > that takes +a language tag (see L<I18N::LangTags|I18N::LangTags>) +and returns the best attempt at an English name for it, or +undef if it can't make sense of the tag. + +The function I18N::LangTags::List::name(...) is not exported. + +The map of tags-to-names that it uses is accessable as +%I18N::LangTags::List::Name, and it's the same as the list +that follows in this documentation, which should be useful +to you even if you don't use this module. + +=head1 ABOUT LANGUAGE TAGS + +Internet language tags, as defined in RFC 3066, are a formalism +for denoting human languages. The two-letter ISO 639-1 language +codes are well known (as "en" for English), as are their forms +when qualified by a country code ("en-US"). Less well-known are the +arbitrary-length non-ISO codes (like "i-mingo"), and the +recently (in 2001) introduced three-letter ISO-639-2 codes. + +Remember these important facts: + +=over + +=item * + +Language tags are not locale IDs. A locale ID is written with a "_" +instead of a "-", (almost?) 
always matches C<m/^\w\w_\w\w\b/>, and +I<means> something different than a language tag. A language tag +denotes a language. A locale ID denotes a language I<as used in> +a particular place, in combination with non-linguistic +location-specific information such as what currency is used +there. Locales I<also> often denote character set information, +as in "en_US.ISO8859-1". + +=item * + +Language tags are not for computer languages. + +=item * + +"Dialect" is not a useful term, since there is no objective +criterion for establishing when two language-forms are +dialects of each other, or are separate languages. + +=item * + +Language tags are not case-sensitive. en-US, en-us, En-Us, etc., +are all the same tag, and denote the same language. + +=item * + +Not every language tag really refers to a single language. Some +language tags refer to conditions: i-default (system-message text +in English plus maybe other languages), und (undetermined +language). Others (notably lots of the three-letter codes) are +bibliographic tags that classify whole groups of languages, as +with cus "Cushitic (Other)" (i.e., a +language that has been classed as Cushitic, but which has no more +specific code) or the even less linguistically coherent +sai for "South American Indian (Other)". Though useful in +bibliography, B<SUCH TAGS ARE NOT +FOR GENERAL USE>. For further guidance, email me. + +=item * + +Language tags are not country codes. In fact, they are often +distinct codes, as with language tag ja for Japanese, and +ISO 3166 country code C<.jp> for Japan. + +=back + +=head1 LIST OF LANGUAGES + +The first part of each item is the language tag, between +{...}. It +is followed by an English name for the language or language-group. +Language tags that I judge to be not for general use, are bracketed. + +This list is in alphabetical order by English name of the language. + +=for reminder + The name in the =item line MUST NOT have E<...>'s in it!! 
+ +=for woohah START + +=over + +=item {ab} : Abkhazian + +eq Abkhaz + +=item {ace} : Achinese + +=item {ach} : Acoli + +=item {ada} : Adangme + +=item {aa} : Afar + +=item {afh} : Afrihili + +(Artificial) + +=item {af} : Afrikaans + +=item [{afa} : Afro-Asiatic (Other)] + +=item {aka} : Akan + +=item {akk} : Akkadian + +(Historical) + +=item {sq} : Albanian + +=item {ale} : Aleut + +=item [{alg} : Algonquian languages] + +NOT Algonquin! + +=item [{tut} : Altaic (Other)] + +=item {am} : Amharic + +NOT Aramaic! + +=item {i-ami} : Ami + +eq Amis. eq 'Amis. eq Pangca. + +=item [{apa} : Apache languages] + +=item {ar} : Arabic + +Many forms are mutually un-intelligible in spoken media. +Notable forms: +{ar-ae} UAE Arabic; +{ar-bh} Bahrain Arabic; +{ar-dz} Algerian Arabic; +{ar-eg} Egyptian Arabic; +{ar-iq} Iraqi Arabic; +{ar-jo} Jordanian Arabic; +{ar-kw} Kuwait Arabic; +{ar-lb} Lebanese Arabic; +{ar-ly} Libyan Arabic; +{ar-ma} Moroccan Arabic; +{ar-om} Omani Arabic; +{ar-qa} Qatari Arabic; +{ar-sa} Saudi Arabic; +{ar-sy} Syrian Arabic; +{ar-tn} Tunisian Arabic; +{ar-ye} Yemen Arabic. + +=item {arc} : Aramaic + +NOT Amharic! NOT Samaritan Aramaic! + +=item {arp} : Arapaho + +=item {arn} : Araucanian + +=item {arw} : Arawak + +=item {hy} : Armenian + +=item [{art} : Artificial (Other)] + +=item {as} : Assamese + +=item [{ath} : Athapascan languages] + +eq Athabaskan. eq Athapaskan. eq Athabascan. 
+ +=item [{aus} : Australian languages] + +=item [{map} : Austronesian (Other)] + +=item {ava} : Avaric + +=item {ae} : Avestan + +eq Zend + +=item {awa} : Awadhi + +=item {ay} : Aymara + +=item {az} : Azerbaijani + +eq Azeri + +=item {ban} : Balinese + +=item [{bat} : Baltic (Other)] + +=item {bal} : Baluchi + +=item {bam} : Bambara + +=item [{bai} : Bamileke languages] + +=item {bad} : Banda + +=item [{bnt} : Bantu (Other)] + +=item {bas} : Basa + +=item {ba} : Bashkir + +=item {eu} : Basque + +=item {btk} : Batak (Indonesia) + +=item {bej} : Beja + +=item {be} : Belarusian + +eq Belarussian. eq Byelarussian. +eq Belorussian. eq Byelorussian. +eq White Russian. eq White Ruthenian. +NOT Ruthenian! + +=item {bem} : Bemba + +=item {bn} : Bengali + +eq Bangla. + +=item [{ber} : Berber (Other)] + +=item {bho} : Bhojpuri + +=item {bh} : Bihari + +=item {bik} : Bikol + +=item {bin} : Bini + +=item {bi} : Bislama + +eq Bichelamar. + +=item {bs} : Bosnian + +=item {bra} : Braj + +=item {br} : Breton + +=item {bug} : Buginese + +=item {bg} : Bulgarian + +=item {i-bnn} : Bunun + +=item {bua} : Buriat + +=item {my} : Burmese + +=item {cad} : Caddo + +=item {car} : Carib + +=item {ca} : Catalan + +eq CatalE<aacute>n. eq Catalonian. + +=item [{cau} : Caucasian (Other)] + +=item {ceb} : Cebuano + +=item [{cel} : Celtic (Other)] + +Notable forms: +{cel-gaulish} Gaulish (Historical) + +=item [{cai} : Central American Indian (Other)] + +=item {chg} : Chagatai + +(Historical?) + +=item [{cmc} : Chamic languages] + +=item {ch} : Chamorro + +=item {ce} : Chechen + +=item {chr} : Cherokee + +eq Tsalagi + +=item {chy} : Cheyenne + +=item {chb} : Chibcha + +(Historical) NOT Chibchan (which is a language family). + +=item {ny} : Chichewa + +eq Nyanja. eq Chinyanja. + +=item {zh} : Chinese + +Many forms are mutually un-intelligible in spoken media. 
+Notable subforms: +{zh-cn} PRC Chinese; +{zh-hk} Hong Kong Chinese; +{zh-mo} Macau Chinese; +{zh-sg} Singapore Chinese; +{zh-tw} Taiwan Chinese; +{zh-guoyu} Mandarin [Putonghua/Guoyu]; +{zh-hakka} Hakka [formerly i-hakka]; +{zh-min} Hokkien; +{zh-min-nan} Southern Hokkien; +{zh-wuu} Shanghaiese; +{zh-xiang} Hunanese; +{zh-gan} Gan; +{zh-yue} Cantonese. + +=for etc +{i-hakka} Hakka (old tag) + +=item {chn} : Chinook Jargon + +eq Chinook Wawa. + +=item {chp} : Chipewyan + +=item {cho} : Choctaw + +=item {cu} : Church Slavic + +eq Old Church Slavonic. + +=item {chk} : Chuukese + +eq Trukese. eq Chuuk. eq Truk. eq Ruk. + +=item {cv} : Chuvash + +=item {cop} : Coptic + +=item {kw} : Cornish + +=item {co} : Corsican + +eq Corse. + +=item {cre} : Cree + +NOT Creek! + +=item {mus} : Creek + +NOT Cree! + +=item [{cpe} : English-based Creoles and pidgins (Other)] + +=item [{cpf} : French-based Creoles and pidgins (Other)] + +=item [{cpp} : Portuguese-based Creoles and pidgins (Other)] + +=item [{crp} : Creoles and pidgins (Other)] + +=item {hr} : Croatian + +eq Croat. + +=item [{cus} : Cushitic (Other)] + +=item {cs} : Czech + +=item {dak} : Dakota + +eq Nakota. eq Lakota. + +=item {da} : Danish + +=item {day} : Dayak + +=item {i-default} : Default (Fallthru) Language + +Defined in RFC 2277, this is for tagging text +(which must include English text, and might/should include text +in other appropriate languages) that is emitted in a context +where language-negotiation wasn't possible -- in SMTP mail failure +messages, for example. + +=item {del} : Delaware + +=item {din} : Dinka + +=item {div} : Divehi + +=item {doi} : Dogri + +NOT Dogrib! + +=item {dgr} : Dogrib + +NOT Dogri! + +=item [{dra} : Dravidian (Other)] + +=item {dua} : Duala + +=item {nl} : Dutch + +eq Netherlander. Notable forms: +{nl-nl} Netherlands Dutch; +{nl-be} Belgian Dutch. 
+ +=item {dum} : Middle Dutch (ca.1050-1350) + +(Historical) + +=item {dyu} : Dyula + +=item {dz} : Dzongkha + +=item {efi} : Efik + +=item {egy} : Ancient Egyptian + +(Historical) + +=item {eka} : Ekajuk + +=item {elx} : Elamite + +(Historical) + +=item {en} : English + +Notable forms: +{en-au} Australian English; +{en-bz} Belize English; +{en-ca} Canadian English; +{en-gb} UK English; +{en-ie} Irish English; +{en-jm} Jamaican English; +{en-nz} New Zealand English; +{en-ph} Philippine English; +{en-tt} Trinidad English; +{en-us} US English; +{en-za} South African English; +{en-zw} Zimbabwe English. + +=item {enm} : Middle English (1100-1500) + +(Historical) + +=item {ang} : Old English (ca.450-1100) + +eq Anglo-Saxon. (Historical) + +=item {eo} : Esperanto + +(Artificial) + +=item {et} : Estonian + +=item {ewe} : Ewe + +=item {ewo} : Ewondo + +=item {fan} : Fang + +=item {fat} : Fanti + +=item {fo} : Faroese + +=item {fj} : Fijian + +=item {fi} : Finnish + +=item [{fiu} : Finno-Ugrian (Other)] + +eq Finno-Ugric. NOT Ugaritic! + +=item {fon} : Fon + +=item {fr} : French + +Notable forms: +{fr-fr} France French; +{fr-be} Belgian French; +{fr-ca} Canadian French; +{fr-ch} Swiss French; +{fr-lu} Luxembourg French; +{fr-mc} Monaco French. + +=item {frm} : Middle French (ca.1400-1600) + +(Historical) + +=item {fro} : Old French (842-ca.1400) + +(Historical) + +=item {fy} : Frisian + +=item {fur} : Friulian + +=item {ful} : Fulah + +=item {gaa} : Ga + +=item {gd} : Scots Gaelic + +NOT Scots! + +=item {gl} : Gallegan + +eq Galician + +=item {lug} : Ganda + +=item {gay} : Gayo + +=item {gba} : Gbaya + +=item {gez} : Geez + +eq Ge'ez + +=item {ka} : Georgian + +=item {de} : German + +Notable forms: +{de-at} Austrian German; +{de-be} Belgian German; +{de-ch} Swiss German; +{de-de} Germany German; +{de-li} Liechtenstein German; +{de-lu} Luxembourg German. 
+ +=item {gmh} : Middle High German (ca.1050-1500) + +(Historical) + +=item {goh} : Old High German (ca.750-1050) + +(Historical) + +=item [{gem} : Germanic (Other)] + +=item {gil} : Gilbertese + +=item {gon} : Gondi + +=item {gor} : Gorontalo + +=item {got} : Gothic + +(Historical) + +=item {grb} : Grebo + +=item {grc} : Ancient Greek + +(Historical) (Until 15th century or so.) + +=item {el} : Modern Greek + +(Since 15th century or so.) + +=item {gn} : Guarani + +GuaranE<iacute> + +=item {gu} : Gujarati + +=item {gwi} : Gwich'in + +eq Gwichin + +=item {hai} : Haida + +=item {ha} : Hausa + +=item {haw} : Hawaiian + +Hawai'ian + +=item {he} : Hebrew + +(Formerly "iw".) + +=for etc +{iw} Hebrew (old tag) + +=item {hz} : Herero + +=item {hil} : Hiligaynon + +=item {him} : Himachali + +=item {hi} : Hindi + +=item {ho} : Hiri Motu + +=item {hit} : Hittite + +(Historical) + +=item {hmn} : Hmong + +=item {hu} : Hungarian + +=item {hup} : Hupa + +=item {iba} : Iban + +=item {is} : Icelandic + +=item {ibo} : Igbo + +=item {ijo} : Ijo + +=item {ilo} : Iloko + +=item [{inc} : Indic (Other)] + +=item [{ine} : Indo-European (Other)] + +=item {id} : Indonesian + +(Formerly "in".) + +=for etc +{in} Indonesian (old tag) + +=item {ia} : Interlingua (International Auxiliary Language Association) + +(Artificial) NOT Interlingue! + +=item {ie} : Interlingue + +(Artificial) NOT Interlingua! + +=item {iu} : Inuktitut + +A subform of "Eskimo". + +=item {ik} : Inupiaq + +A subform of "Eskimo". + +=item [{ira} : Iranian (Other)] + +=item {ga} : Irish + +=item {mga} : Middle Irish (900-1200) + +(Historical) + +=item {sga} : Old Irish (to 900) + +(Historical) + +=item [{iro} : Iroquoian languages] + +=item {it} : Italian + +Notable forms: +{it-it} Italy Italian; +{it-ch} Swiss Italian. + +=item {ja} : Japanese + +(NOT "jp"!) 
+ +=item {jw} : Javanese + +=item {jrb} : Judeo-Arabic + +=item {jpr} : Judeo-Persian + +=item {kab} : Kabyle + +=item {kac} : Kachin + +=item {kl} : Kalaallisut + +eq Greenlandic "Eskimo" + +=item {kam} : Kamba + +=item {kn} : Kannada + +eq Kanarese. NOT Canadian! + +=item {kau} : Kanuri + +=item {kaa} : Kara-Kalpak + +=item {kar} : Karen + +=item {ks} : Kashmiri + +=item {kaw} : Kawi + +=item {kk} : Kazakh + +=item {kha} : Khasi + +=item {km} : Khmer + +eq Cambodian. eq Kampuchean. + +=item [{khi} : Khoisan (Other)] + +=item {kho} : Khotanese + +=item {ki} : Kikuyu + +eq Gikuyu. + +=item {kmb} : Kimbundu + +=item {rw} : Kinyarwanda + +=item {ky} : Kirghiz + +=item {i-klingon} : Klingon + +=item {kv} : Komi + +=item {kon} : Kongo + +=item {kok} : Konkani + +=item {ko} : Korean + +=item {kos} : Kosraean + +=item {kpe} : Kpelle + +=item {kro} : Kru + +=item {kj} : Kuanyama + +=item {kum} : Kumyk + +=item {ku} : Kurdish + +=item {kru} : Kurukh + +=item {kut} : Kutenai + +=item {lad} : Ladino + +eq Judeo-Spanish. NOT Ladin (a minority language in Italy). + +=item {lah} : Lahnda + +NOT Lamba! + +=item {lam} : Lamba + +NOT Lahnda! + +=item {lo} : Lao + +eq Laotian. + +=item {la} : Latin + +(Historical) NOT Ladin! NOT Ladino! + +=item {lv} : Latvian + +eq Lettish. + +=item {lb} : Letzeburgesch + +eq Luxemburgian, eq Luxemburger. (Formerly i-lux.) + +=for etc +{i-lux} Letzeburgesch (old tag) + +=item {lez} : Lezghian + +=item {ln} : Lingala + +=item {lt} : Lithuanian + +=item {nds} : Low German + +eq Low Saxon. eq Low German. eq Low Saxon. + +=item {loz} : Lozi + +=item {lub} : Luba-Katanga + +=item {lua} : Luba-Lulua + +=item {lui} : Luiseno + +eq LuiseE<ntilde>o. + +=item {lun} : Lunda + +=item {luo} : Luo (Kenya and Tanzania) + +=item {lus} : Lushai + +=item {mk} : Macedonian + +eq the modern Slavic language spoken in what was Yugoslavia. +NOT the form of Greek spoken in Greek Macedonia! 
+ +=item {mad} : Madurese + +=item {mag} : Magahi + +=item {mai} : Maithili + +=item {mak} : Makasar + +=item {mg} : Malagasy + +=item {ms} : Malay + +NOT Malayalam! + +=item {ml} : Malayalam + +NOT Malay! + +=item {mt} : Maltese + +=item {mnc} : Manchu + +=item {mdr} : Mandar + +NOT Mandarin! + +=item {man} : Mandingo + +=item {mni} : Manipuri + +eq Meithei. + +=item [{mno} : Manobo languages] + +=item {gv} : Manx + +=item {mi} : Maori + +NOT Mari! + +=item {mr} : Marathi + +=item {chm} : Mari + +NOT Maori! + +=item {mh} : Marshall + +eq Marshallese. + +=item {mwr} : Marwari + +=item {mas} : Masai + +=item [{myn} : Mayan languages] + +=item {men} : Mende + +=item {mic} : Micmac + +=item {min} : Minangkabau + +=item {i-mingo} : Mingo + +eq the Iroquoian language West Virginia Seneca. NOT New York Seneca! + +=item [{mis} : Miscellaneous languages] + +Don't use this. + +=item {moh} : Mohawk + +=item {mo} : Moldavian + +eq Moldovan. + +=item [{mkh} : Mon-Khmer (Other)] + +=item {lol} : Mongo + +=item {mn} : Mongolian + +eq Mongol. + +=item {mos} : Mossi + +=item [{mul} : Multiple languages] + +Not for normal use. + +=item [{mun} : Munda languages] + +=item {nah} : Nahuatl + +=item {na} : Nauru + +=item {nv} : Navajo + +eq Navaho. (Formerly i-navajo.) + +=for etc +{i-navajo} Navajo (old tag) + +=item {nd} : North Ndebele + +=item {nr} : South Ndebele + +=item {ng} : Ndonga + +=item {ne} : Nepali + +eq Nepalese. Notable forms: +{ne-np} Nepal Nepali; +{ne-in} India Nepali. + +=item {new} : Newari + +=item {nia} : Nias + +=item [{nic} : Niger-Kordofanian (Other)] + +=item [{ssa} : Nilo-Saharan (Other)] + +=item {niu} : Niuean + +=item {non} : Old Norse + +(Historical) + +=item [{nai} : North American Indian] + +Do not use this. + +=item {se} : Northern Sami + +eq Lappish. eq Lapp. eq (Northern) Saami. + +=item {no} : Norwegian + +Note the two following forms: + +=item {nb} : Norwegian Bokmal + +eq BokmE<aring>l, (A form of Norwegian.) (Formerly no-bok.) 
+ +=for etc +{no-bok} Norwegian Bokmal (old tag) + +=item {nn} : Norwegian Nynorsk + +(A form of Norwegian.) (Formerly no-nyn.) + +=for etc +{no-nyn} Norwegian Nynorsk (old tag) + +=item [{nub} : Nubian languages] + +=item {nym} : Nyamwezi + +=item {nyn} : Nyankole + +=item {nyo} : Nyoro + +=item {nzi} : Nzima + +=item {oc} : Occitan (post 1500) + +eq ProvenE<ccedil>al, eq Provencal + +=item {oji} : Ojibwa + +eq Ojibwe. + +=item {or} : Oriya + +=item {om} : Oromo + +=item {osa} : Osage + +=item {os} : Ossetian; Ossetic + +=item [{oto} : Otomian languages] + +Group of languages collectively called "OtomE<iacute>". + +=item {pal} : Pahlavi + +eq Pahlevi + +=item {i-pwn} : Paiwan + +eq Pariwan + +=item {pau} : Palauan + +=item {pi} : Pali + +(Historical?) + +=item {pam} : Pampanga + +=item {pag} : Pangasinan + +=item {pa} : Panjabi + +eq Punjabi + +=item {pap} : Papiamento + +eq Papiamentu. + +=item [{paa} : Papuan (Other)] + +=item {fa} : Persian + +eq Farsi. eq Iranian. + +=item {peo} : Old Persian (ca.600-400 B.C.) + +=item [{phi} : Philippine (Other)] + +=item {phn} : Phoenician + +(Historical) + +=item {pon} : Pohnpeian + +NOT Pompeiian! + +=item {pl} : Polish + +=item {pt} : Portuguese + +eq Portugese. Notable forms: +{pt-pt} Portugal Portuguese; +{pt-br} Brazilian Portuguese. + +=item [{pra} : Prakrit languages] + +=item {pro} : Old Provencal (to 1500) + +eq Old ProvenE<ccedil>al. (Historical.) + +=item {ps} : Pushto + +eq Pashto. eq Pushtu. + +=item {qu} : Quechua + +eq Quecha. + +=item {rm} : Raeto-Romance + +eq Romansh. + +=item {raj} : Rajasthani + +=item {rap} : Rapanui + +=item {rar} : Rarotongan + +=item [{qaa - qtz} : Reserved for local use.] + +=item [{roa} : Romance (Other)] + +NOT Romanian! NOT Romany! NOT Romansh! + +=item {ro} : Romanian + +eq Rumanian. NOT Romany! + +=item {rom} : Romany + +eq Rom. NOT Romanian! + +=item {rn} : Rundi + +=item {ru} : Russian + +NOT White Russian! NOT Rusyn! 
+ +=item [{sal} : Salishan languages] + +Large language group. + +=item {sam} : Samaritan Aramaic + +NOT Aramaic! + +=item [{smi} : Sami languages (Other)] + +=item {sm} : Samoan + +=item {sad} : Sandawe + +=item {sg} : Sango + +=item {sa} : Sanskrit + +(Historical) + +=item {sat} : Santali + +=item {sc} : Sardinian + +eq Sard. + +=item {sas} : Sasak + +=item {sco} : Scots + +NOT Scots Gaelic! + +=item {sel} : Selkup + +=item [{sem} : Semitic (Other)] + +=item {sr} : Serbian + +eq Serb. NOT Sorbian. + +=item {srr} : Serer + +=item {shn} : Shan + +=item {sn} : Shona + +=item {sid} : Sidamo + +=item {sgn-...} : Sign Languages + +Always use with a subtag. Notable forms: +{sgn-gb} British Sign Language (BSL); +{sgn-ie} Irish Sign Language (ESL); +{sgn-ni} Nicaraguan Sign Language (ISN); +{sgn-us} American Sign Language (ASL). + +=item {bla} : Siksika + +eq Blackfoot. eq Pikanii. + +=item {sd} : Sindhi + +=item {si} : Sinhalese + +eq Sinhala. + +=item [{sit} : Sino-Tibetan (Other)] + +=item [{sio} : Siouan languages] + +=item {den} : Slave (Athapascan) + +("Slavey" is a subform.) + +=item [{sla} : Slavic (Other)] + +=item {sk} : Slovak + +eq Slovakian. + +=item {sl} : Slovenian + +eq Slovene. + +=item {sog} : Sogdian + +=item {so} : Somali + +=item {son} : Songhai + +=item {snk} : Soninke + +=item {wen} : Sorbian languages + +eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! + +=item {nso} : Northern Sotho + +=item {st} : Southern Sotho + +eq Sutu. eq Sesotho. 
+ +=item [{sai} : South American Indian (Other)] + +=item {es} : Spanish + +Notable forms: +{es-ar} Argentine Spanish; +{es-bo} Bolivian Spanish; +{es-cl} Chilean Spanish; +{es-co} Colombian Spanish; +{es-do} Dominican Spanish; +{es-ec} Ecuadorian Spanish; +{es-es} Spain Spanish; +{es-gt} Guatemalan Spanish; +{es-hn} Honduran Spanish; +{es-mx} Mexican Spanish; +{es-pa} Panamanian Spanish; +{es-pe} Peruvian Spanish; +{es-pr} Puerto Rican Spanish; +{es-py} Paraguay Spanish; +{es-sv} Salvadoran Spanish; +{es-us} US Spanish; +{es-uy} Uruguayan Spanish; +{es-ve} Venezuelan Spanish. + +=item {suk} : Sukuma + +=item {sux} : Sumerian + +(Historical) + +=item {su} : Sundanese + +=item {sus} : Susu + +=item {sw} : Swahili + +eq Kiswahili + +=item {ss} : Swati + +=item {sv} : Swedish + +Notable forms: +{sv-se} Sweden Swedish; +{sv-fi} Finland Swedish. + +=item {syr} : Syriac + +=item {tl} : Tagalog + +=item {ty} : Tahitian + +=item [{tai} : Tai (Other)] + +NOT Thai! + +=item {tg} : Tajik + +=item {tmh} : Tamashek + +=item {ta} : Tamil + +=item {i-tao} : Tao + +eq Yami. + +=item {tt} : Tatar + +=item {i-tay} : Tayal + +eq Atayal. eq Atayan. + +=item {te} : Telugu + +=item {ter} : Tereno + +=item {tet} : Tetum + +=item {th} : Thai + +NOT Tai! + +=item {bo} : Tibetan + +=item {tig} : Tigre + +=item {ti} : Tigrinya + +=item {tem} : Timne + +eq Themne. eq Timene. + +=item {tiv} : Tiv + +=item {tli} : Tlingit + +=item {tpi} : Tok Pisin + +=item {tkl} : Tokelau + +=item {tog} : Tonga (Nyasa) + +NOT Tsonga! + +=item {to} : Tonga (Tonga Islands) + +(Pronounced "Tong-a", not "Tong-ga") + +NOT Tsonga! + +=item {tsi} : Tsimshian + +eq Sm'algyax + +=item {ts} : Tsonga + +NOT Tonga! + +=item {i-tsu} : Tsou + +=item {tn} : Tswana + +Same as Setswana. + +=item {tum} : Tumbuka + +=item {tr} : Turkish + +(Typically in Roman script) + +=item {ota} : Ottoman Turkish (1500-1928) + +(Typically in Arabic script) (Historical) + +=item {tk} : Turkmen + +eq Turkmeni. 
+ +=item {tvl} : Tuvalu + +=item {tyv} : Tuvinian + +eq Tuvan. eq Tuvin. + +=item {tw} : Twi + +=item {uga} : Ugaritic + +NOT Ugric! + +=item {ug} : Uighur + +=item {uk} : Ukrainian + +=item {umb} : Umbundu + +=item {und} : Undetermined + +Not a tag for normal use. + +=item {ur} : Urdu + +=item {uz} : Uzbek + +eq E<Ouml>zbek + +=item {vai} : Vai + +=item {ven} : Venda + +NOT Wendish! NOT Wend! NOT Avestan! + +=item {vi} : Vietnamese + +eq Viet. + +=item {vo} : Volapuk + +eq VolapE<uuml>k. (Artificial) + +=item {vot} : Votic + +eq Votian. eq Vod. + +=item [{wak} : Wakashan languages] + +=item {wal} : Walamo + +eq Wolaytta. + +=item {war} : Waray + +Presumably the Philippine language Waray-Waray (SamareE<ntilde>o), +not the smaller Philippine language Waray Sorsogon, nor the extinct +Australian language Waray. + +=item {was} : Washo + +eq Washoe + +=item {cy} : Welsh + +=item {wo} : Wolof + +=item {x-...} : Unregistered (Semi-Private Use) + +"x-" is a prefix for language tags that are not registered with ISO +or IANA. Example, x-double-dutch + +=item {xh} : Xhosa + +=item {sah} : Yakut + +=item {yao} : Yao + +(The Yao in Malawi?) + +=item {yap} : Yapese + +eq Yap + +=item {yi} : Yiddish + +Formerly "ji". Sometimes in Roman script, sometimes in Hebrew script. + +=for etc +{ji} Yiddish (old tag) + +=item {yo} : Yoruba + +=item [{ypk} : Yupik languages] + +Several "Eskimo" languages. + +=item {znd} : Zande + +=item [{zap} : Zapotec] + +(A group of languages.) + +=item {zen} : Zenaga + +NOT Zend. + +=item {za} : Zhuang + +=item {zu} : Zulu + +=item {zun} : Zuni + +eq ZuE<ntilde>i + +=back + +=for woohah END + +=head1 SEE ALSO + +L<I18N::LangTags|I18N::LangTags> and its "See Also" section. + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 2001,2002 Sean M. Burke. All rights reserved. + +You can redistribute and/or +modify this document under the same terms as Perl itself. 
+ +This document is provided in the hope that it will be +useful, but without any warranty; +without even the implied warranty of accuracy, authoritativeness, +completeness, merchantability, or fitness for a particular purpose. + +Email any corrections or questions to me. + +=head1 AUTHOR + +Sean M. Burke, sburkeE<64>cpan.org + +=cut + + +# To generate a list of just the two and three-letter codes: + +#!/usr/local/bin/perl -w + +require 5; # Time-stamp: "2001-03-13 21:53:39 MST" + # Sean M. Burke, sburke@cpan.org + # This program is for generating the language_codes.txt file +use strict; +use LWP::Simple; +use HTML::TreeBuilder 3.10; +my $root = HTML::TreeBuilder->new(); +my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; +$root->parse(get($url) || die "Can't get $url"); +$root->eof(); + +my @codes; + +foreach my $tr ($root->find_by_tag_name('tr')) { + my @f = map $_->as_text(), $tr->content_list(); + #print map("<$_> ", @f), "\n"; + next unless @f == 5; + pop @f; # nix the French name + next if $f[-1] eq 'Language Name (English)'; # it's a header line + my $xx = splice(@f, 2,1); # pull out the two-letter code + $f[-1] =~ s/^\s+//; + $f[-1] =~ s/\s+$//; + if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it + push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; + } else { # print the three-letter codes. 
+ if($f[0] eq $f[1]) { + push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; + } else { # shouldn't happen + push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; + } + } +} + +print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; +print "[ based on $url\n at ", scalar(localtime), "]\n", + "[Note: doesn't include IANA-registered codes.]\n"; +exit; +__END__ + diff --git a/gnu/usr.bin/perl/lib/I18N/LangTags/README b/gnu/usr.bin/perl/lib/I18N/LangTags/README new file mode 100644 index 00000000000..fbae05f43d3 --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/LangTags/README @@ -0,0 +1,78 @@ +README for I18N::LangTags + Time-stamp: "2001-05-29 21:52:15 MDT" + + I18N::LangTags + +I18N::LangTags - functions for dealing with RFC3066-style language +tags + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags (notably the extraction of them, comparing them, and testing the +formal validity of them) as is needed in a variety of protocols and +applications. + + +I18N::LangTags::List -- tags and names for human languages. This +module goes from known language tag names ("fr-CA") to their English +names ("Canadian French"). Its documentation also lists the several +hundred known tags and some common subforms. You may find this useful +as a reference. + + +See the POD for more information. + + +INSTALLATION + +You install I18N::LangTags and I18N::LangTags::List, as you would +install any perl module library, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of I18N::LangTags in your home +directory, then you should try to produce the initial Makefile with +something like this command: + + perl Makefile.PL LIB=~/perl + +See perldoc perlmodinstall for more information on installing modules. 
+ + +DOCUMENTATION + +POD-format documentation is included in LangTags.pm. POD is readable +with the 'perldoc' utility. See ChangeLog for recent changes. + + +SUPPORT + +Questions, bug reports, useful code bits, and suggestions for +I18N::LangTags should just be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of I18N::LangTags is available from the +Comprehensive Perl Archive Network (CPAN). Visit +<http://www.cpan.org/> to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1998-2001, Sean M. Burke <sburke@cpan.org>, all rights +reserved. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. diff --git a/gnu/usr.bin/perl/lib/I18N/LangTags/test.pl b/gnu/usr.bin/perl/lib/I18N/LangTags/test.pl new file mode 100644 index 00000000000..88a7bf66ae8 --- /dev/null +++ b/gnu/usr.bin/perl/lib/I18N/LangTags/test.pl @@ -0,0 +1,79 @@ +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' + +######################### We start with some black magic to print on failure. 
+require 5; + # Time-stamp: "2001-06-21 22:59:38 MDT" +use strict; +use Test; +BEGIN { plan tests => 46 }; +BEGIN { ok 1 } +use I18N::LangTags (':ALL'); + +print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; + +ok !is_language_tag(''); +ok is_language_tag('fr'); +ok is_language_tag('fr-ca'); +ok is_language_tag('fr-CA'); +ok !is_language_tag('fr-CA-'); +ok !is_language_tag('fr_CA'); +ok is_language_tag('fr-ca-joual'); +ok !is_language_tag('frca'); +ok is_language_tag('nav'); +ok is_language_tag('nav-shiprock'); +ok !is_language_tag('nav-ceremonial'); # subtag too long +ok !is_language_tag('x'); +ok !is_language_tag('i'); +ok is_language_tag('i-borg'); # NB: fictitious tag +ok is_language_tag('x-borg'); +ok is_language_tag('x-borg-prot5123'); +ok same_language_tag('x-borg-prot5123', 'i-BORG-Prot5123' ); +ok !same_language_tag('en', 'en-us' ); + +ok 0 == similarity_language_tag('en-ca', 'fr-ca'); +ok 1 == similarity_language_tag('en-ca', 'en-us'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us-western'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us'); + +ok grep $_ eq 'hi', panic_languages('kok'); +ok grep $_ eq 'en', panic_languages('x-woozle-wuzzle'); +ok ! grep $_ eq 'mr', panic_languages('it'); +ok grep $_ eq 'es', panic_languages('it'); +ok grep $_ eq 'it', panic_languages('es'); + + +print "# Now the ::List tests...\n"; +use I18N::LangTags::List; +foreach my $lt (qw( + en + en-us + en-kr + el + elx + i-mingo + i-mingo-tom + x-mingo-tom + it + it-it + it-IT + it-FR + yi + ji + cre-syllabic + cre-syllabic-western + cre-western + cre-latin +)) { + my $name = I18N::LangTags::List::name($lt); + if($name) { + ok(1); + print "# $lt -> $name\n"; + } else { + ok(0); + print "# Failed lookup on $lt\n"; + } +} + +print "# So there!\n"; + |