src - OpenBSD base system

diff options


context:
space:
mode:

author	Todd C. Miller <millert@cvs.openbsd.org>	2002-10-27 22:15:15 +0000
committer	Todd C. Miller <millert@cvs.openbsd.org>	2002-10-27 22:15:15 +0000
commit	74cfb115ac810480c0000dc742b20383c1578bac (patch)
tree	316d96e5123617976f1637b143570c309a662045 /gnu/usr.bin/perl/utf8.h
parent	453ade492b8e06c619009d6cd52a85cb04e8cf17 (diff)

stock perl 5.8.0 from CPAN

Diffstat (limited to 'gnu/usr.bin/perl/utf8.h')

-rw-r--r--

gnu/usr.bin/perl/utf8.h

231

1 files changed, 173 insertions, 58 deletions

diff --git a/gnu/usr.bin/perl/utf8.h b/gnu/usr.bin/perl/utf8.h
index d022e867bed..6885859a3f7 100644
--- a/gnu/usr.bin/perl/utf8.h
+++ b/gnu/usr.bin/perl/utf8.h

@@ -1,12 +1,28 @@

/* utf8.h

* You may distribute under the terms of either the GNU General Public

* License or the Artistic License, as specified in the README file.

+/* Use UTF-8 as the default script encoding?

+ * Turning this on will break scripts having non-UTF8 binary

+ * data (such as Latin-1) in string literals. */

+#ifdef USE_UTF8_SCRIPTS

+# define USE_UTF8_IN_NAMES (!IN_BYTES)

+#else

+# define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8)

+#endif

+#ifdef EBCDIC

+/* The equivalent of these macros but implementing UTF-EBCDIC

+ are in the following header file:

+ */

+#include "utfebcdic.h"

+#else

START_EXTERN_C

#ifdef DOINIT

@@ -26,72 +42,86 @@ EXTCONST unsigned char PL_utf8skip[];

#endif

END_EXTERN_C

-#define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */

-/* #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) */

-#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)

-#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)

-#define UTF8_ALLOW_EMPTY 0x0001

-#define UTF8_ALLOW_CONTINUATION 0x0002

-#define UTF8_ALLOW_NON_CONTINUATION 0x0004

-#define UTF8_ALLOW_FE_FF 0x0008

-#define UTF8_ALLOW_SHORT 0x0010

-#define UTF8_ALLOW_SURROGATE 0x0020

-#define UTF8_ALLOW_BOM 0x0040

-#define UTF8_ALLOW_FFFF 0x0080

-#define UTF8_ALLOW_LONG 0x0100

-#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\

- UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\

- UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)

-#define UTF8_ALLOW_ANY 0x00ff

-#define UTF8_CHECK_ONLY 0x0100

-#define UNICODE_SURROGATE_FIRST 0xd800

-#define UNICODE_SURROGATE_LAST 0xdfff

-#define UNICODE_REPLACEMENT 0xfffd

-#define UNICODE_BYTER_ORDER_MARK 0xfffe

-#define UNICODE_ILLEGAL 0xffff

-#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \

- (c) <= UNICODE_SURROGATE_LAST)

-#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACMENT)

-#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK)

-#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)

#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]

-#define UTF8_QUAD_MAX UINT64_C(0x1000000000)

+/* Native character to iso-8859-1 */

+#define NATIVE_TO_ASCII(ch) (ch)

+#define ASCII_TO_NATIVE(ch) (ch)

+/* Transform after encoding */

+#define NATIVE_TO_UTF(ch) (ch)

+#define UTF_TO_NATIVE(ch) (ch)

+/* Transforms in wide UV chars */

+#define UNI_TO_NATIVE(ch) (ch)

+#define NATIVE_TO_UNI(ch) (ch)

+/* Transforms in invariant space */

+#define NATIVE_TO_NEED(enc,ch) (ch)

+#define ASCII_TO_NEED(enc,ch) (ch)

+/* As there are no translations avoid the function wrapper */

+#define Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni

+#define Perl_uvchr_to_utf8 Perl_uvuni_to_utf8

- The following table is from Unicode 3.1.

+ The following table is from Unicode 3.2.

Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte

- U+0000..U+007F 00..7F

- U+0080..U+07FF C2..DF 80..BF

- U+0800..U+0FFF E0 A0..BF 80..BF

- U+1000..U+FFFF E1..EF 80..BF 80..BF

+ U+0000..U+007F 00..7F

+ U+0080..U+07FF C2..DF 80..BF

+ U+0800..U+0FFF E0 A0..BF 80..BF

+ U+1000..U+CFFF E1..EC 80..BF 80..BF

+ U+D000..U+D7FF ED 80..9F 80..BF

+ U+D800..U+DFFF ******* ill-formed *******

+ U+E000..U+FFFF EE..EF 80..BF 80..BF

U+10000..U+3FFFF F0 90..BF 80..BF 80..BF

U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF

U+100000..U+10FFFF F4 80..8F 80..BF 80..BF

+Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000..U+D7FF,

+the 90..BF in U+10000..U+3FFFF, and the 80..8F in U+100000..U+10FFFF.

+The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:

+it is technically possible to UTF-8-encode a single code point in different

+ways, but that is explicitly forbidden, and the shortest possible encoding

+should always be used (and that is what Perl does).

-#define UTF8_IS_ASCII(c) (((U8)c) < 0x80)

+/*

+ Another way to look at it, as bits:

+ Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte

+ 0aaaaaaa 0aaaaaaa

+ 00000bbbbbaaaaaa 110bbbbb 10aaaaaa

+ ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa

+ 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa

+As you can see, the continuation bytes all begin with C<10>, and the

+leading bits of the start byte tell how many bytes there are in the

+encoded character.

+*/

+#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)

+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))

+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))

#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))

#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))

#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)

-#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) != 0xc0)

+#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)

+#define UTF_START_MARK(len) ((len > 7) ? 0xFF : (0xFE << (7-len)))

+#define UTF_START_MASK(len) ((len >= 7) ? 0x00 : (0x1F >> (len-2)))

-#define UTF8_CONTINUATION_MASK ((U8)0x3f)

-#define UTF8_ACCUMULATION_SHIFT 6

-#define UTF8_ACCUMULATE(old, new) (((old) << UTF8_ACCUMULATION_SHIFT) | (((U8)new) & UTF8_CONTINUATION_MASK))

+#define UTF_CONTINUATION_MARK 0x80

+#define UTF_ACCUMULATION_SHIFT 6

+#define UTF_CONTINUATION_MASK ((U8)0x3f)

+#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))

-#define UTF8_EIGHT_BIT_HI(c) ( (((U8)(c))>>6) |0xc0)

-#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)) )&0x3f)|0x80)

+#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))

+#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)

#ifdef HAS_QUAD

#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \

@@ -100,7 +130,7 @@ END_EXTERN_C

(uv) < 0x200000 ? 4 : \

(uv) < 0x4000000 ? 5 : \

(uv) < 0x80000000 ? 6 : \

- (uv) < UTF8_QUAD_MAX ? 7 : 13 )

+ (uv) < UTF8_QUAD_MAX ? 7 : 13 )

#else

/* No, I'm not even going to *TRY* putting #ifdef inside a #define */

#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \

@@ -111,23 +141,108 @@ END_EXTERN_C

(uv) < 0x80000000 ? 6 : 7 )

#endif

* Note: we try to be careful never to call the isXXX_utf8() functions

* unless we're pretty sure we've seen the beginning of a UTF-8 character

* (that is, the two high bits are set). Otherwise we risk loading in the

* heavy-duty SWASHINIT and SWASHGET routines unnecessarily.

-#ifdef EBCDIC

-#define isIDFIRST_lazy_if(p,c) isIDFIRST(*(p))

-#define isALNUM_lazy_if(p,c) isALNUM(*(p))

-#else

-#define isIDFIRST_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \

+#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((U8*)p) < 0xc0))) \

? isIDFIRST(*(p)) \

: isIDFIRST_utf8((U8*)p))

-#define isALNUM_lazy_if(p,c) ((IN_BYTE || (!c || (*((U8*)p) < 0xc0))) \

+#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || (*((U8*)p) < 0xc0))) \

? isALNUM(*(p)) \

: isALNUM_utf8((U8*)p))

-#endif

+#endif /* EBCDIC vs ASCII */

+/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */

#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)

#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)

+/* how wide can a single UTF8 encoded character become */

+#define UTF8_MAXLEN 13

+/* how wide a character can become when upper/lowercased */

+#define UTF8_MAXLEN_UCLC_MULT 3

+#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT)

+/* how wide a character can become when casefolded */

+#define UTF8_MAXLEN_FOLD_MULT 3

+#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT)

+#define IN_BYTES (PL_curcop->op_private & HINT_BYTES)

+#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)

+#define UTF8_ALLOW_EMPTY 0x0001

+#define UTF8_ALLOW_CONTINUATION 0x0002

+#define UTF8_ALLOW_NON_CONTINUATION 0x0004

+#define UTF8_ALLOW_FE_FF 0x0008

+#define UTF8_ALLOW_SHORT 0x0010

+#define UTF8_ALLOW_SURROGATE 0x0020

+#define UTF8_ALLOW_FFFF 0x0040 /* Allows also FFFE. */

+#define UTF8_ALLOW_LONG 0x0080

+#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\

+ UTF8_ALLOW_SURROGATE|\

+ UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)

+#define UTF8_ALLOW_ANY 0x00FF

+#define UTF8_CHECK_ONLY 0x0200

+#define UNICODE_SURROGATE_FIRST 0xD800

+#define UNICODE_SURROGATE_LAST 0xDFFF

+#define UNICODE_REPLACEMENT 0xFFFD

+#define UNICODE_BYTE_ORDER_MARK 0xFEFF

+#define UNICODE_ILLEGAL 0xFFFF

+/* Though our UTF-8 encoding can go beyond this,

+ * let's be conservative and do as Unicode 3.2 says. */

+#define PERL_UNICODE_MAX 0x10FFFF

+#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */

+#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */

+#define UNICODE_ALLOW_FFFF 0x0004 /* Allow 0xFFF[EF], 0x1FFF[EF], ... */

+#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 0x10FFFF */

+#define UNICODE_ALLOW_ANY 0x000F

+#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \

+ (c) <= UNICODE_SURROGATE_LAST)

+#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT)

+#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTE_ORDER_MARK)

+#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)

+#ifdef HAS_QUAD

+# define UTF8_QUAD_MAX UINT64_C(0x1000000000)

+#endif

+#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)

+#define UNICODE_LATIN_SMALL_LETTER_SHARP_S 0x00DF

+#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3

+#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2

+#define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3

+#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S 0x0059

+#define UNI_DISPLAY_ISPRINT 0x0001

+#define UNI_DISPLAY_BACKSLASH 0x0002

+#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)

+#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)

+#ifdef EBCDIC

+# define ANYOF_FOLD_SHARP_S(node, input, end) \

+ (ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \

+ (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \

+ (ANYOF_FLAGS(node) & ANYOF_FOLD) && \

+ ((end) > (input) + 1) && \

+ toLOWER((input)[0]) == 's' && \

+ toLOWER((input)[1]) == 's')

+#else

+# define ANYOF_FOLD_SHARP_S(node, input, end) \

+ (ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \

+ (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \

+ (ANYOF_FLAGS(node) & ANYOF_FOLD) && \

+ ((end) > (input) + 1) && \

+ toLOWER((input)[0]) == 's' && \

+ toLOWER((input)[1]) == 's')

+#endif

+#define SHARP_S_SKIP 2