diff options
Diffstat (limited to 'gnu/usr.bin/perl/locale.c')
-rw-r--r-- | gnu/usr.bin/perl/locale.c | 8357 |
1 files changed, 5031 insertions, 3326 deletions
diff --git a/gnu/usr.bin/perl/locale.c b/gnu/usr.bin/perl/locale.c index c83076da3f0..92b6c1a6ec1 100644 --- a/gnu/usr.bin/perl/locale.c +++ b/gnu/usr.bin/perl/locale.c @@ -38,14 +38,113 @@ * it would be possible to emulate thread-safe locales, but this likely would * involve a lot of locale switching, and would require XS code changes. * Macros could be written so that the code wouldn't have to know which type of - * system is being used. It's unlikely that we would ever do that, since most - * modern systems support thread-safe locales, but there was code written to - * this end, and is retained, #ifdef'd out. + * system is being used. + * + * Table-driven code is used for simplicity and clarity, as many operations + * differ only in which category is being worked on. However the system + * categories need not be small contiguous integers, so do not lend themselves + * to table lookup. Instead we have created our own equivalent values which + * are all small contiguous non-negative integers, and translation functions + * between the two sets. For category 'LC_foo', the name of our index is + * LC_foo_INDEX_. Various parallel tables, indexed by these, are used. + * + * Many of the macros and functions in this file have one of the suffixes '_c', + * '_r', or '_i'. khw found these useful in remembering what type of locale + * category to use as their parameter. '_r' takes an int category number as + * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that + * the value isn't known until runtime. '_c' also indicates such a category + * number, but its value is known at compile time. These are both converted + * into unsigned indexes into various tables of category information, where the + * real work is generally done. The tables are generated at compile-time based + * on platform characteristics and Configure options. They hide from the code + * many of the vagaries of the different locale implementations out there. You + * may have already guessed that '_i' indicates the parameter is such an + * unsigned index. Converting from '_r' to '_i' requires run-time lookup. + * '_c' is used to get cpp to do this at compile time. To avoid the runtime + * expense, the code is structured to use '_r' at the API level, and once + * converted, everything possible is done using the table indexes. + * + * On unthreaded perls, most operations expand out to just the basic + * setlocale() calls. The same is true on threaded perls on modern Windows + * systems where the same API, after set up, is used for thread-safe locale + * handling. On other systems, there is a completely different API, specified + * in POSIX 2008, to do thread-safe locales. On these systems, our + * emulate_setlocale_i() function is used to hide the different API from the + * outside. This makes it completely transparent to most XS code. + * + * A huge complicating factor is that the LC_NUMERIC category is normally held + * in the C locale, except during those relatively rare times when it needs to + * be in the underlying locale. There is a bunch of code to accomplish this, + * and to allow easy switches from one state to the other. + * + * In addition, the setlocale equivalents have versions for the return context, + * 'void' and 'bool', besides the full return value. This can present + * opportunities for avoiding work. We don't have to necessarily create a safe + * copy to return if no return is desired. + * + * There are 3.5 major implementations here; which one chosen depends on what + * the platform has available, and Configuration options. + * + * 1) Raw my_setlocale(). Here the layer adds nothing. This is used for + * unthreaded perls, and when the API for safe locale threading is identical + * to the unsafe API (Windows, currently). + * + * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a + * per-thread/per-category value. + * + * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling, + * mapping the setlocale() API to them. This automatically makes almost all + * code thread-safe without need for changes. This layer is chosen on + * threaded perls when the platform supports the POSIX 2008 functions, and + * when there is no manual override in Configure. + * + * 3a) is when the platform has a reliable querylocale() function or + * equivalent that is selected to be used. + * 3b) is when we have to emulate that functionality. + * + * z/OS (os390) is an outlier. Locales really don't work under threads when + * either the radix character isn't a dot, or attempts are made to change + * locales after the first thread is created. The reason is that IBM has made + * it thread-safe by refusing to change locales (returning failure if + * attempted) any time after an application has called pthread_create() to + * create another thread. The expectation is that an application will set up + * its locale information before the first fork, and be stable thereafter. But + * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do + * the other toggles, which are less common. */ +/* If the environment says to, we can output debugging information during + * initialization. This is done before option parsing, and before any thread + * creation, so can be a file-level static. (Must come before #including + * perl.h) */ +#ifdef DEBUGGING +static int debug_initialization = 0; +# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) +# define DEBUG_LOCALE_INITIALIZATION_ debug_initialization +/* C standards seem to say that __LINE__ is merely "an integer constant", + * which means it might be either int, long (with L suffix), or long long + * (or their corresponding unsigned type). So, we have to explicitly cast + * __LINE__ to a particular integer type to pass it reliably to variadic + * functions like (PerlIO_)printf, as below: */ +# ifdef USE_LOCALE_THREADS +# define DEBUG_PRE_STMTS \ + dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log,"\n%s: %" LINE_Tf ": %p: ",\ + __FILE__, (line_t)__LINE__, aTHX); +# else +# define DEBUG_PRE_STMTS \ + dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": ", \ + __FILE__, (line_t)__LINE__); +# endif +# define DEBUG_POST_STMTS RESTORE_ERRNO; +#else +# define debug_initialization 0 +# define DEBUG_INITIALIZATION_set(v) +# define DEBUG_PRE_STMTS +# define DEBUG_POST_STMTS +#endif + #include "EXTERN.h" #define PERL_IN_LOCALE_C -#include "perl_langinfo.h" #include "perl.h" #include "reentr.h" @@ -57,27 +156,53 @@ # include <wctype.h> #endif -/* If the environment says to, we can output debugging information during - * initialization. This is done before option parsing, and before any thread - * creation, so can be a file-level static */ -#if ! defined(DEBUGGING) -# define debug_initialization 0 -# define DEBUG_INITIALIZATION_set(v) + /* The main errno that gets used is this one, on platforms that support it */ +#ifdef EINVAL +# define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG) #else -static bool debug_initialization = FALSE; -# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) +# define SET_EINVAL +#endif + +/* If we have any of these library functions, we can reliably determine is a + * locale is a UTF-8 one or not. And if we aren't using locales at all, we act + * as if everything is the C locale, so the answer there is always "No, it + * isn't UTF-8"; this too is reliably accurate */ +#if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) \ + || ! defined(USE_LOCALE) +# define HAS_RELIABLE_UTF8NESS_DETERMINATION #endif +#ifdef USE_LOCALE + +PERL_STATIC_INLINE const char * +S_mortalized_pv_copy(pTHX_ const char * const pv) +{ + PERL_ARGS_ASSERT_MORTALIZED_PV_COPY; + + /* Copies the input pv, and arranges for it to be freed at an unspecified + * later time. */ + + if (pv == NULL) { + return NULL; + } + + const char * copy = savepv(pv); + SAVEFREEPV(copy); + return copy; +} + +#endif /* Returns the Unix errno portion; ignoring any others. This is a macro here * instead of putting it into perl.h, because unclear to khw what should be * done generally. */ #define GET_ERRNO saved_errno -/* strlen() of a literal string constant. We might want this more general, - * but using it in just this file for now. A problem with more generality is - * the compiler warnings about comparing unlike signs */ -#define STRLENs(s) (sizeof("" s "") - 1) +/* Default values come from the C locale */ +#define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually + a single instance, so is a #define */ +static const char C_decimal_point[] = "."; +static const char C_thousands_sep[] = ""; /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the * return of setlocale(), then this is extremely likely to be the C or POSIX @@ -94,85 +219,47 @@ static bool debug_initialization = FALSE; && (( *(name) == 'C' && (*(name + 1)) == '\0') \ || strEQ((name), "POSIX"))) -#ifdef USE_LOCALE - -/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far - * looked up. This is in the form of a C string: */ - -#define UTF8NESS_SEP "\v" -#define UTF8NESS_PREFIX "\f" - -/* So, the string looks like: - * - * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 - * - * where the digit 0 after the \a indicates that the locale starting just - * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ - -STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); -STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); - -#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ - UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" - -/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are - * kept there always. The remining portion of the cache is LRU, with the - * oldest looked-up locale at the tail end */ - -STATIC char * -S_stdize_locale(pTHX_ char *locs) -{ - /* Standardize the locale name from a string returned by 'setlocale', - * possibly modifying that string. - * - * The typical return value of setlocale() is either - * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL - * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL - * (the space-separated values represent the various sublocales, - * in some unspecified order). This is not handled by this function. - * - * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", - * which is harmful for further use of the string in setlocale(). This - * function removes the trailing new line and everything up through the '=' - * */ - - const char * const s = strchr(locs, '='); - bool okay = TRUE; +#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO) +# define HAS_SOME_LANGINFO +#endif - PERL_ARGS_ASSERT_STDIZE_LOCALE; +#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \ + my_langinfo_i(item, category##_INDEX_, locale, retbufp, \ + retbuf_sizep, utf8ness) - if (s) { - const char * const t = strchr(s, '.'); - okay = FALSE; - if (t) { - const char * const u = strchr(t, '\n'); - if (u && (u[1] == 0)) { - const STRLEN len = u - s; - Move(s + 1, locs, len, char); - locs[len] = 0; - okay = TRUE; - } - } - } +#ifdef USE_LOCALE - if (!okay) - Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs); +# ifdef DEBUGGING +# define setlocale_debug_string_i(index, locale, result) \ + my_setlocale_debug_string_i(index, locale, result, __LINE__) +# define setlocale_debug_string_c(category, locale, result) \ + setlocale_debug_string_i(category##_INDEX_, locale, result) +# define setlocale_debug_string_r(category, locale, result) \ + setlocale_debug_string_i(get_category_index(category, locale), \ + locale, result) +# endif - return locs; -} +# define toggle_locale_i(index, locale) \ + S_toggle_locale_i(aTHX_ index, locale, __LINE__) +# define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale) +# define restore_toggled_locale_i(index, locale) \ + S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__) +# define restore_toggled_locale_c(cat, locale) \ + restore_toggled_locale_i(cat##_INDEX_, locale) -/* Two parallel arrays; first the locale categories Perl uses on this system; - * the second array is their names. These arrays are in mostly arbitrary - * order. */ +/* Two parallel arrays indexed by our mapping of category numbers into small + * non-negative indexes; first the locale categories Perl uses on this system, + * used to do the inverse mapping. The second array is their names. These + * arrays are in mostly arbitrary order. */ -const int categories[] = { +STATIC const int categories[] = { -# ifdef USE_LOCALE_NUMERIC - LC_NUMERIC, -# endif # ifdef USE_LOCALE_CTYPE LC_CTYPE, # endif +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC, +# endif # ifdef USE_LOCALE_COLLATE LC_COLLATE, # endif @@ -200,6 +287,9 @@ const int categories[] = { # ifdef USE_LOCALE_TELEPHONE LC_TELEPHONE, # endif +# ifdef USE_LOCALE_NAME + LC_NAME, +# endif # ifdef USE_LOCALE_SYNTAX LC_SYNTAX, # endif @@ -209,21 +299,22 @@ const int categories[] = { # ifdef LC_ALL LC_ALL, # endif - -1 /* Placeholder because C doesn't allow a - trailing comma, and it would get complicated - with all the #ifdef's */ + + /* Placeholder as a precaution if code fails to check the return of + * get_category_index(), which returns this element to indicate an error */ + -1 }; /* The top-most real element is LC_ALL */ -const char * const category_names[] = { +STATIC const char * const category_names[] = { -# ifdef USE_LOCALE_NUMERIC - "LC_NUMERIC", -# endif # ifdef USE_LOCALE_CTYPE "LC_CTYPE", # endif +# ifdef USE_LOCALE_NUMERIC + "LC_NUMERIC", +# endif # ifdef USE_LOCALE_COLLATE "LC_COLLATE", # endif @@ -251,6 +342,9 @@ const char * const category_names[] = { # ifdef USE_LOCALE_TELEPHONE "LC_TELEPHONE", # endif +# ifdef USE_LOCALE_NAME + "LC_NAME", +# endif # ifdef USE_LOCALE_SYNTAX "LC_SYNTAX", # endif @@ -260,211 +354,445 @@ const char * const category_names[] = { # ifdef LC_ALL "LC_ALL", # endif - NULL /* Placeholder */ - }; + + /* Placeholder as a precaution if code fails to check the return of + * get_category_index(), which returns this element to indicate an error */ + NULL +}; + +/* A few categories require additional setup when they are changed. This table + * points to the functions that do that setup */ +STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = { +# ifdef USE_LOCALE_CTYPE + S_new_ctype, +# endif +# ifdef USE_LOCALE_NUMERIC + S_new_numeric, +# endif +# ifdef USE_LOCALE_COLLATE + S_new_collate, +# endif +# ifdef USE_LOCALE_TIME + NULL, +# endif +# ifdef USE_LOCALE_MESSAGES + NULL, +# endif +# ifdef USE_LOCALE_MONETARY + NULL, +# endif +# ifdef USE_LOCALE_ADDRESS + NULL, +# endif +# ifdef USE_LOCALE_IDENTIFICATION + NULL, +# endif +# ifdef USE_LOCALE_MEASUREMENT + NULL, +# endif +# ifdef USE_LOCALE_PAPER + NULL, +# endif +# ifdef USE_LOCALE_TELEPHONE + NULL, +# endif +# ifdef USE_LOCALE_NAME + NULL, +# endif +# ifdef USE_LOCALE_SYNTAX + NULL, +# endif +# ifdef USE_LOCALE_TOD + NULL, +# endif + /* No harm done to have this even without an LC_ALL */ + S_new_LC_ALL, + + /* Placeholder as a precaution if code fails to check the return of + * get_category_index(), which returns this element to indicate an error */ + NULL +}; # ifdef LC_ALL /* On systems with LC_ALL, it is kept in the highest index position. (-2 * to account for the final unused placeholder element.) */ # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2) - # else /* On systems without LC_ALL, we pretend it is there, one beyond the real * top element, hence in the unused placeholder element. */ # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1) - # endif /* Pretending there is an LC_ALL element just above allows us to avoid most * special cases. Most loops through these arrays in the code below are * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work * on either type of system. But the code must be written to not access the - * element at 'LC_ALL_INDEX' except on platforms that have it. This can be - * checked for at compile time by using the #define LC_ALL_INDEX which is only + * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be + * checked for at compile time by using the #define LC_ALL_INDEX_ which is only * defined if we do have LC_ALL. */ -STATIC const char * -S_category_name(const int category) +STATIC int +S_get_category_index_nowarn(const int category) { - unsigned int i; - -#ifdef LC_ALL + /* Given a category, return the equivalent internal index we generally use + * instead, or negative if not found. + * + * Some sort of hash could be used instead of this loop, but the number of + * elements is so far at most 12 */ - if (category == LC_ALL) { - return "LC_ALL"; - } + unsigned int i; -#endif + PERL_ARGS_ASSERT_GET_CATEGORY_INDEX; - for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { +# ifdef LC_ALL + for (i = 0; i <= LC_ALL_INDEX_; i++) +# else + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) +# endif + { if (category == categories[i]) { - return category_names[i]; + dTHX_DEBUGGING; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "index of category %d (%s) is %d\n", + category, category_names[i], i)); + return i; } } - { - const char suffix[] = " (unknown)"; - int temp = category; - Size_t length = sizeof(suffix) + 1; - char * unknown; - dTHX; - - if (temp < 0) { - length++; - temp = - temp; - } + return -1; +} - /* Calculate the number of digits */ - while (temp >= 10) { - temp /= 10; - length++; - } +STATIC unsigned int +S_get_category_index(const int category, const char * locale) +{ + /* Given a category, return the equivalent internal index we generally use + * instead. + * + * 'locale' is for use in any generated diagnostics, and may be NULL + */ + + const char * conditional_warn_text = "; can't set it to "; + const int index = get_category_index_nowarn(category); + + if (index >= 0) { + return index; + } + + /* Here, we don't know about this category, so can't handle it. */ - Newx(unknown, length, char); - my_snprintf(unknown, length, "%d%s", category, suffix); - SAVEFREEPV(unknown); - return unknown; + if (! locale) { + locale = ""; + conditional_warn_text = ""; } + + /* diag_listed_as: Unknown locale category %d; can't set it to %s */ + Perl_warner_nocontext(packWARN(WARN_LOCALE), + "Unknown locale category %d%s%s", + category, conditional_warn_text, locale); + + SET_EINVAL; + + /* Return an out-of-bounds value */ + return NOMINAL_LC_ALL_INDEX + 1; } -/* Now create LC_foo_INDEX #defines for just those categories on this system */ -# ifdef USE_LOCALE_NUMERIC -# define LC_NUMERIC_INDEX 0 -# define _DUMMY_NUMERIC LC_NUMERIC_INDEX -# else -# define _DUMMY_NUMERIC -1 -# endif -# ifdef USE_LOCALE_CTYPE -# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1 -# define _DUMMY_CTYPE LC_CTYPE_INDEX -# else -# define _DUMMY_CTYPE _DUMMY_NUMERIC -# endif -# ifdef USE_LOCALE_COLLATE -# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 -# define _DUMMY_COLLATE LC_COLLATE_INDEX -# else -# define _DUMMY_COLLATE _DUMMY_CTYPE -# endif -# ifdef USE_LOCALE_TIME -# define LC_TIME_INDEX _DUMMY_COLLATE + 1 -# define _DUMMY_TIME LC_TIME_INDEX -# else -# define _DUMMY_TIME _DUMMY_COLLATE -# endif -# ifdef USE_LOCALE_MESSAGES -# define LC_MESSAGES_INDEX _DUMMY_TIME + 1 -# define _DUMMY_MESSAGES LC_MESSAGES_INDEX -# else -# define _DUMMY_MESSAGES _DUMMY_TIME -# endif -# ifdef USE_LOCALE_MONETARY -# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1 -# define _DUMMY_MONETARY LC_MONETARY_INDEX -# else -# define _DUMMY_MONETARY _DUMMY_MESSAGES -# endif -# ifdef USE_LOCALE_ADDRESS -# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1 -# define _DUMMY_ADDRESS LC_ADDRESS_INDEX -# else -# define _DUMMY_ADDRESS _DUMMY_MONETARY -# endif -# ifdef USE_LOCALE_IDENTIFICATION -# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1 -# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX -# else -# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS -# endif -# ifdef USE_LOCALE_MEASUREMENT -# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1 -# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX -# else -# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION -# endif -# ifdef USE_LOCALE_PAPER -# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1 -# define _DUMMY_PAPER LC_PAPER_INDEX -# else -# define _DUMMY_PAPER _DUMMY_MEASUREMENT -# endif -# ifdef USE_LOCALE_TELEPHONE -# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1 -# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX -# else -# define _DUMMY_TELEPHONE _DUMMY_PAPER -# endif -# ifdef USE_LOCALE_SYNTAX -# define LC_SYNTAX_INDEX _DUMMY_TELEPHONE + 1 -# define _DUMMY_SYNTAX LC_SYNTAX_INDEX -# else -# define _DUMMY_SYNTAX _DUMMY_TELEPHONE -# endif -# ifdef USE_LOCALE_TOD -# define LC_TOD_INDEX _DUMMY_SYNTAX + 1 -# define _DUMMY_TOD LC_TOD_INDEX -# else -# define _DUMMY_TOD _DUMMY_SYNTAX -# endif -# ifdef LC_ALL -# define LC_ALL_INDEX _DUMMY_TOD + 1 -# endif #endif /* ifdef USE_LOCALE */ -/* Windows requres a customized base-level setlocale() */ +void +Perl_force_locale_unlock() +{ + +#if defined(USE_LOCALE_THREADS) + + dTHX; + + /* If recursively locked, clear all at once */ + if (PL_locale_mutex_depth > 1) { + PL_locale_mutex_depth = 1; + } + + if (PL_locale_mutex_depth > 0) { + LOCALE_UNLOCK_; + } + +#endif + +} + +#ifdef USE_POSIX_2008_LOCALE + +STATIC locale_t +S_use_curlocale_scratch(pTHX) +{ + /* This function is used to hide from the caller the case where the current + * locale_t object in POSIX 2008 is the global one, which is illegal in + * many of the P2008 API calls. This checks for that and, if necessary + * creates a proper P2008 object. Any prior object is deleted, as is any + * remaining object during global destruction. */ + + locale_t cur = uselocale((locale_t) 0); + + if (cur != LC_GLOBAL_LOCALE) { + return cur; + } + + if (PL_scratch_locale_obj) { + freelocale(PL_scratch_locale_obj); + } + + PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE); + return PL_scratch_locale_obj; +} + +#endif + +void +Perl_locale_panic(const char * msg, + const char * file_name, + const line_t line, + const int errnum) +{ + dTHX; + + PERL_ARGS_ASSERT_LOCALE_PANIC; + + force_locale_unlock(); + +#ifdef USE_C_BACKTRACE + dump_c_backtrace(Perl_debug_log, 20, 1); +#endif + + /* diag_listed_as: panic: %s */ + Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s; errno=%d\n", + file_name, line, msg, errnum); +} + +#define setlocale_failure_panic_c( \ + cat, current, failed, caller_0_line, caller_1_line) \ + setlocale_failure_panic_i(cat##_INDEX_, current, failed, \ + caller_0_line, caller_1_line) + +/* posix_setlocale() presents a consistent POSIX-compliant interface to + * setlocale(). Windows requres a customized base-level setlocale(). Any + * necessary mutex locking needs to be done at a higher level */ #ifdef WIN32 -# define my_setlocale(cat, locale) win32_setlocale(cat, locale) +# define posix_setlocale(cat, locale) win32_setlocale(cat, locale) #else -# define my_setlocale(cat, locale) setlocale(cat, locale) +# define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale)) #endif -#ifndef USE_POSIX_2008_LOCALE +/* The next layer up is to catch vagaries and bugs in the libc setlocale return + * value. Again, any necessary mutex locking needs to be done at a higher + * level */ +#ifdef stdize_locale +# define stdized_setlocale(cat, locale) \ + stdize_locale(cat, posix_setlocale(cat, locale), \ + &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__) +#else +# define stdized_setlocale(cat, locale) posix_setlocale(cat, locale) +#endif -/* "do_setlocale_c" is intended to be called when the category is a constant - * known at compile time; "do_setlocale_r", not known until run time */ -# define do_setlocale_c(cat, locale) my_setlocale(cat, locale) -# define do_setlocale_r(cat, locale) my_setlocale(cat, locale) -# define FIX_GLIBC_LC_MESSAGES_BUG(i) +/* The next many lines form a layer above the close-to-the-metal 'posix' + * and 'stdized' macros. They are used to present a uniform API to the rest of + * the code in this file in spite of the disparate underlying implementations. + * */ -#else /* Below uses POSIX 2008 */ +#if (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE)) \ + || ( defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE)) -/* We emulate setlocale with our own function. LC_foo is not valid for the - * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array - * lookup to convert to. At compile time we have defined LC_foo_INDEX as the - * proper offset into the array 'category_masks[]'. At runtime, we have to - * search through the array (as the actual numbers may not be small contiguous - * positive integers which would lend themselves to array lookup). */ -# define do_setlocale_c(cat, locale) \ - emulate_setlocale(cat, locale, cat ## _INDEX, TRUE) -# define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE) +/* For non-threaded perls, the added layer just expands to the base-level + * functions, except if we are supposed to use the POSIX 2008 interface anyway. + * On perls where threading is invisible to us, the base-level functions are + * used regardless of threading. Currently this is only on later Windows + * versions. + * + * See the introductory comments in this file for the meaning of the suffixes + * '_c', '_r', '_i'. */ + +# define setlocale_r(cat, locale) stdized_setlocale(cat, locale) +# define setlocale_i(i, locale) setlocale_r(categories[i], locale) +# define setlocale_c(cat, locale) setlocale_r(cat, locale) + +# define void_setlocale_i(i, locale) \ + STMT_START { \ + if (! posix_setlocale(categories[i], locale)) { \ + setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \ + NOT_REACHED; /* NOTREACHED */ \ + } \ + } STMT_END +# define void_setlocale_c(cat, locale) \ + void_setlocale_i(cat##_INDEX_, locale) +# define void_setlocale_r(cat, locale) \ + void_setlocale_i(get_category_index(cat, locale), locale) + +# define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale)) +# define bool_setlocale_i(i, locale) \ + bool_setlocale_c(categories[i], locale) +# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale) + +/* All the querylocale...() forms return a mortalized copy. If you need + * something stable across calls, you need to savepv() the result yourself */ + +# define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL)) +# define querylocale_c(cat) querylocale_r(cat) +# define querylocale_i(i) querylocale_c(categories[i]) + +#elif defined(USE_LOCALE_THREADS) \ + && ! defined(USE_THREAD_SAFE_LOCALE) + + /* Here, there are threads, and there is no support for thread-safe + * operation. This is a dangerous situation, which perl is documented as + * not supporting, but it arises in practice. We can do a modicum of + * automatic mitigation by making sure there is a per-thread return from + * setlocale(), and that a mutex protects it from races */ +STATIC const char * +S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale) +{ + const char * retval; -# if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES) + PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R; -# define FIX_GLIBC_LC_MESSAGES_BUG(i) + POSIX_SETLOCALE_LOCK; -# else /* Invalidate glibc cache of loaded translations, see [perl #134264] */ + retval = stdized_setlocale(category, locale); -# include <libintl.h> -# define FIX_GLIBC_LC_MESSAGES_BUG(i) \ - STMT_START { \ - if ((i) == LC_MESSAGES_INDEX) { \ - textdomain(textdomain(NULL)); \ - } \ - } STMT_END + /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may + * already have used it, in which case we don't have to do anything further + * */ + retval = save_to_buffer(retval, + &PL_stdize_locale_buf, &PL_stdize_locale_bufsize); + + POSIX_SETLOCALE_UNLOCK; + + return retval; +} + +# define setlocale_r(cat, locale) less_dicey_setlocale_r(cat, locale) +# define setlocale_c(cat, locale) setlocale_r(cat, locale) +# define setlocale_i(i, locale) setlocale_r(categories[i], locale) + +# define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL)) +# define querylocale_c(cat) querylocale_r(cat) +# define querylocale_i(i) querylocale_r(categories[i]) + +STATIC void +S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index, + const char * locale, + const line_t line) +{ + PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I; + + POSIX_SETLOCALE_LOCK; + if (! posix_setlocale(categories[cat_index], locale)) { + POSIX_SETLOCALE_UNLOCK; + setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line); + } + POSIX_SETLOCALE_UNLOCK; +} + +# define void_setlocale_i(i, locale) \ + less_dicey_void_setlocale_i(i, locale, __LINE__) +# define void_setlocale_c(cat, locale) \ + void_setlocale_i(cat##_INDEX_, locale) +# define void_setlocale_r(cat, locale) \ + void_setlocale_i(get_category_index(cat, locale), locale) + +# if 0 /* Not currently used */ + +STATIC bool +S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale) +{ + bool retval; + + PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R; + + POSIX_SETLOCALE_LOCK; + retval = cBOOL(posix_setlocale(cat, locale)); + POSIX_SETLOCALE_UNLOCK; + + return retval; +} # endif +# define bool_setlocale_r(cat, locale) \ + less_dicey_bool_setlocale_r(cat, locale) +# define bool_setlocale_i(i, locale) \ + bool_setlocale_r(categories[i], locale) +# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale) +#else -/* A third array, parallel to the ones above to map from category to its - * equivalent mask */ -const int category_masks[] = { -# ifdef USE_LOCALE_NUMERIC - LC_NUMERIC_MASK, +/* Here, there is a completely different API to get thread-safe locales. We + * emulate the setlocale() API with our own function(s). setlocale categories, + * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there + * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to + * by using get_category_index() followed by table lookup. */ + +# define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line) \ + emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line) + + /* A wrapper for the macros below. */ +# define common_emulate_setlocale(i, locale) \ + emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__) + +# define setlocale_i(i, locale) \ + save_to_buffer(common_emulate_setlocale(i, locale), \ + &PL_stdize_locale_buf, \ + &PL_stdize_locale_bufsize) +# define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale) +# define setlocale_r(cat, locale) \ + setlocale_i(get_category_index(cat, locale), locale) + +# define void_setlocale_i(i, locale) \ + ((void) common_emulate_setlocale(i, locale)) +# define void_setlocale_c(cat, locale) \ + void_setlocale_i(cat##_INDEX_, locale) +# define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale)) + +# define bool_setlocale_i(i, locale) \ + cBOOL(common_emulate_setlocale(i, locale)) +# define bool_setlocale_c(cat, locale) \ + bool_setlocale_i(cat##_INDEX_, locale) +# define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale)) + +# define querylocale_i(i) mortalized_pv_copy(my_querylocale_i(i)) +# define querylocale_c(cat) querylocale_i(cat##_INDEX_) +# define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL)) + +# ifdef USE_QUERYLOCALE +# define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask) + + /* This code used to think querylocale() was valid on LC_ALL. Make sure + * all instances of that have been removed */ +# define QUERYLOCALE_ASSERT(index) \ + __ASSERT_(isSINGLE_BIT_SET(category_masks[index])) +# if ! defined(HAS_QUERYLOCALE) && ( defined(_NL_LOCALE_NAME) \ + && defined(HAS_NL_LANGINFO_L)) +# define querylocale_l(index, locale_obj) \ + (QUERYLOCALE_ASSERT(index) \ + mortalized_pv_copy(nl_langinfo_l( \ + _NL_LOCALE_NAME(categories[index]), locale_obj))) +# else +# define querylocale_l(index, locale_obj) \ + (QUERYLOCALE_ASSERT(index) \ + mortalized_pv_copy(querylocale(category_masks[index], locale_obj))) +# endif # endif +# if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES) +# define HAS_GLIBC_LC_MESSAGES_BUG +# include <libintl.h> +# endif + +/* A fourth array, parallel to the ones above to map from category to its + * equivalent mask */ +STATIC const int category_masks[] = { # ifdef USE_LOCALE_CTYPE LC_CTYPE_MASK, # endif +# ifdef USE_LOCALE_NUMERIC + LC_NUMERIC_MASK, +# endif # ifdef USE_LOCALE_COLLATE LC_COLLATE_MASK, # endif @@ -492,6 +820,9 @@ const int category_masks[] = { # ifdef USE_LOCALE_TELEPHONE LC_TELEPHONE_MASK, # endif +# ifdef USE_LOCALE_NAME + LC_NAME_MASK, +# endif # ifdef USE_LOCALE_SYNTAX LC_SYNTAX_MASK, # endif @@ -503,781 +834,1031 @@ const int category_masks[] = { * here, so compile it in unconditionally. * This could catch some glitches at compile * time */ - LC_ALL_MASK - }; + LC_ALL_MASK, + + /* Placeholder as a precaution if code fails to check the return of + * get_category_index(), which returns this element to indicate an error */ + 0 +}; + +# define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_) STATIC const char * -S_emulate_setlocale(const int category, - const char * locale, - unsigned int index, - const bool is_index_valid - ) +S_my_querylocale_i(pTHX_ const unsigned int index) { - /* This function effectively performs a setlocale() on just the current - * thread; thus it is thread-safe. It does this by using the POSIX 2008 - * locale functions to emulate the behavior of setlocale(). Similar to - * regular setlocale(), the return from this function points to memory that - * can be overwritten by other system calls, so needs to be copied - * immediately if you need to retain it. The difference here is that - * system calls besides another setlocale() can overwrite it. - * - * By doing this, most locale-sensitive functions become thread-safe. The - * exceptions are mostly those that return a pointer to static memory. - * - * This function takes the same parameters, 'category' and 'locale', that - * the regular setlocale() function does, but it also takes two additional - * ones. This is because the 2008 functions don't use a category; instead - * they use a corresponding mask. Because this function operates in both - * worlds, it may need one or the other or both. This function can - * calculate the mask from the input category, but to avoid this - * calculation, if the caller knows at compile time what the mask is, it - * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask - * parameter is ignored. + /* This function returns the name of the locale category given by the input + * index into our parallel tables of them. * * POSIX 2008, for some sick reason, chose not to provide a method to find - * the category name of a locale. Some vendors have created a - * querylocale() function to do just that. This function is a lot simpler - * to implement on systems that have this. Otherwise, we have to keep - * track of what the locale has been set to, so that we can return its - * name to emulate setlocale(). It's also possible for C code in some - * library to change the locale without us knowing it, though as of + * the category name of a locale, discarding a basic linguistic tenet that + * for any object, people will create a name for it. Some vendors have + * created a querylocale() function to do just that. This function is a + * lot simpler to implement on systems that have this. Otherwise, we have + * to keep track of what the locale has been set to, so that we can return + * its name so as to emulate setlocale(). It's also possible for C code in + * some library to change the locale without us knowing it, though as of * September 2017, there are no occurrences in CPAN of uselocale(). Some * libraries do use setlocale(), but that changes the global locale, and - * threads using per-thread locales will just ignore those changes. - * Another problem is that without querylocale(), we have to guess at what - * was meant by setting a locale of "". We handle this by not actually - * ever setting to "" (unless querylocale exists), but to emulate what we - * think should happen for "". - */ - - int mask; - locale_t old_obj; - locale_t new_obj; - const char * safelocale = locale ? locale : "(null)"; - dTHX; + * threads using per-thread locales will just ignore those changes. */ -# ifdef DEBUGGING + int category; + const locale_t cur_obj = uselocale((locale_t) 0); + const char * retval; - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), safelocale, index, is_index_valid); - } + PERL_ARGS_ASSERT_MY_QUERYLOCALE_I; + assert(index <= NOMINAL_LC_ALL_INDEX); -# endif + category = categories[index]; - /* If the input mask might be incorrect, calculate the correct one */ - if (! is_index_valid) { - unsigned int i; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n", + category_names[index], cur_obj)); + if (cur_obj == LC_GLOBAL_LOCALE) { + POSIX_SETLOCALE_LOCK; + retval = posix_setlocale(category, NULL); + POSIX_SETLOCALE_UNLOCK; + } + else { -# ifdef DEBUGGING +# ifdef USE_QUERYLOCALE - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category)); - } + /* We don't currently keep records when there is querylocale(), so have + * to get it anew each time */ + retval = (index == LC_ALL_INDEX_) + ? calculate_LC_ALL(cur_obj) + : querylocale_l(index, cur_obj); -# endif +# else - for (i = 0; i <= LC_ALL_INDEX; i++) { - if (category == categories[i]) { - index = i; - goto found_index; - } + /* But we do have up-to-date values when we keep our own records + * (except some times in initialization, where we get the value from + * the system. */ + const char ** which = (index == LC_ALL_INDEX_) + ? &PL_cur_LC_ALL + : &PL_curlocales[index]; + if (*which == NULL) { + retval = stdized_setlocale(category, NULL); + *which = savepv(retval); } - - /* Here, we don't know about this category, so can't handle it. - * Fallback to the early POSIX usages */ - Perl_warner(aTHX_ packWARN(WARN_LOCALE), - "Unknown locale category %d; can't set it to %s\n", - category, safelocale); - return NULL; - - found_index: ; - -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category)); + else { + retval = *which; } # endif } - mask = category_masks[index]; - -# ifdef DEBUGGING + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "my_querylocale_i(%s) returning '%s'\n", + category_names[index], retval)); + assert(strNE(retval, "")); + return retval; +} - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask); - } +# ifdef USE_PL_CURLOCALES -# endif +STATIC const char * +S_update_PL_curlocales_i(pTHX_ + const unsigned int index, + const char * new_locale, + recalc_lc_all_t recalc_LC_ALL) +{ + /* This is a helper function for emulate_setlocale_i(), mostly used to + * make that function easier to read. */ - /* If just querying what the existing locale is ... */ - if (locale == NULL) { - locale_t cur_obj = uselocale((locale_t) 0); + PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I; + assert(index <= NOMINAL_LC_ALL_INDEX); -# ifdef DEBUGGING + if (index == LC_ALL_INDEX_) { + unsigned int i; - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj); + /* For LC_ALL, we change all individual categories to correspond */ + /* PL_curlocales is a parallel array, so has same + * length as 'categories' */ + for (i = 0; i < LC_ALL_INDEX_; i++) { + Safefree(PL_curlocales[i]); + PL_curlocales[i] = savepv(new_locale); } -# endif + Safefree(PL_cur_LC_ALL); + PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales)); + return PL_cur_LC_ALL; + } - if (cur_obj == LC_GLOBAL_LOCALE) { - return my_setlocale(category, NULL); - } + /* Update the single category's record */ + Safefree(PL_curlocales[index]); + PL_curlocales[index] = savepv(new_locale); -# ifdef HAS_QUERYLOCALE + /* And also LC_ALL if the input says to, including if this is the final + * iteration of a loop updating all sub-categories */ + if ( recalc_LC_ALL == YES_RECALC_LC_ALL + || ( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION + && index == NOMINAL_LC_ALL_INDEX - 1)) + { + Safefree(PL_cur_LC_ALL); + PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales)); + } - return (char *) querylocale(mask, cur_obj); + return PL_curlocales[index]; +} -# else +# endif /* Need PL_curlocales[] */ - /* If this assert fails, adjust the size of curlocales in intrpvar.h */ - STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX); +STATIC const char * +S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line) +{ + /* This function parses the value of the LC_ALL locale, assuming glibc + * syntax, and sets each individual category on the system to the proper + * value. + * + * This is likely to only ever be called from one place, so exists to make + * the calling function easier to read by moving this ancillary code out of + * the main line. + * + * The locale for each category is independent of the other categories. + * Often, they are all the same, but certainly not always. Perl, in fact, + * usually keeps LC_NUMERIC in the C locale, regardless of the underlying + * locale. LC_ALL has to be able to represent the case of when there are + * varying locales. Platforms have differing ways of representing this. + * Because of this, the code in this file goes to lengths to avoid the + * issue, generally looping over the component categories instead of + * referring to them in the aggregate, wherever possible. However, there + * are cases where we have to parse our own constructed aggregates, which use + * the glibc syntax. */ + + const char * locale_on_entry = querylocale_c(LC_ALL); + + PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL; + + /* If the string that gives what to set doesn't include all categories, + * the omitted ones get set to "C". To get this behavior, first set + * all the individual categories to "C", and override the furnished + * ones below. FALSE => No need to recalculate LC_ALL, as this is a + * temporary state */ + if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) { + setlocale_failure_panic_c(LC_ALL, locale_on_entry, + "C", __LINE__, line); + NOT_REACHED; /* NOTREACHED */ + } -# if defined(_NL_LOCALE_NAME) \ - && defined(DEBUGGING) \ - /* On systems that accept any locale name, the real underlying \ - * locale is often returned by this internal function, so we \ - * can't use it */ \ - && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME) - { - /* Internal glibc for querylocale(), but doesn't handle - * empty-string ("") locale properly; who knows what other - * glitches. Check for it now, under debug. */ - - char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category), - uselocale((locale_t) 0)); - /* - PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL"); - PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index); - PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]); - */ - if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) { - if ( strNE(PL_curlocales[index], temp_name) - && ! ( isNAME_C_OR_POSIX(temp_name) - && isNAME_C_OR_POSIX(PL_curlocales[index]))) { - -# ifdef USE_C_BACKTRACE - - dump_c_backtrace(Perl_debug_log, 20, 1); + const char * s = locale; + const char * e = locale + strlen(locale); + while (s < e) { + const char * p = s; -# endif + /* Parse through the category */ + while (isWORDCHAR(*p)) { + p++; + } - Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is" - " (%s) and what internal glibc thinks" - " (%s)\n", category_names[index], - PL_curlocales[index], temp_name); - } + const char * category_end = p; - return temp_name; - } + if (*p++ != '=') { + locale_panic_(Perl_form(aTHX_ + "Unexpected character in locale category name '%s" + "<-- HERE", + get_displayable_string(s, p - 1, 0))); } -# endif + /* Parse through the locale name */ + const char * name_start = p; + while (p < e && *p != ';') { + p++; + } + if (UNLIKELY( p < e && *p != ';')) { + locale_panic_(Perl_form(aTHX_ + "Unexpected character in locale name '%s<-- HERE", + get_displayable_string(s, p, 0))); + } - /* Without querylocale(), we have to use our record-keeping we've - * done. */ + const char * name_end = p; - if (category != LC_ALL) { + /* Space past the semi-colon */ + if (p < e) { + p++; + } -# ifdef DEBUGGING + /* Find the index of the category name in our lists */ + for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) { - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]); + /* Keep going if this index doesn't point to the category being + * parsed. The strnNE() avoids a Perl_form(), but would fail if + * ever a category name could be a substring of another one, e.g., + * if there were a "LC_TIME_DATE" */ + if strnNE(s, category_names[i], category_end - s) { + continue; } -# endif - - return PL_curlocales[index]; - } - else { /* For LC_ALL */ - unsigned int i; - Size_t names_len = 0; - char * all_string; - bool are_all_categories_the_same_locale = TRUE; + /* Here i points to the category being parsed. Now isolate the + * locale it is being changed to */ + const char * individ_locale = Perl_form(aTHX_ "%.*s", + (int) (name_end - name_start), name_start); - /* If we have a valid LC_ALL value, just return it */ - if (PL_curlocales[LC_ALL_INDEX]) { - -# ifdef DEBUGGING + /* And do the change. Don't recalculate LC_ALL; we'll do it + * ourselves after the loop */ + if (! emulate_setlocale_i(i, individ_locale, + DONT_RECALC_LC_ALL, line)) + { - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]); + /* But if we have to back out, do fix up LC_ALL */ + if (! emulate_setlocale_c(LC_ALL, locale_on_entry, + YES_RECALC_LC_ALL, line)) + { + setlocale_failure_panic_i(i, individ_locale, + locale, __LINE__, line); + NOT_REACHED; /* NOTREACHED */ } -# endif - - return PL_curlocales[LC_ALL_INDEX]; + /* Reverting to the entry value succeeded, but the operation + * failed to go to the requested locale. */ + return NULL; } - /* Otherwise, we need to construct a string of name=value pairs. - * We use the glibc syntax, like - * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... - * First calculate the needed size. Along the way, check if all - * the locale names are the same */ - for (i = 0; i < LC_ALL_INDEX; i++) { - -# ifdef DEBUGGING + /* Found and handled the desired category. Quit the inner loop to + * try the next category */ + break; + } - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); - } + /* Finished with this category; iterate to the next one in the input */ + s = p; + } -# endif +# ifdef USE_PL_CURLOCALES - names_len += strlen(category_names[i]) - + 1 /* '=' */ - + strlen(PL_curlocales[i]) - + 1; /* ';' */ + /* Here we have set all the individual categories. Update the LC_ALL entry + * as well. We can't just use the input 'locale' as the value may omit + * categories whose locale is 'C'. khw thinks it's better to store a + * complete LC_ALL. So calculate it. */ + const char * retval = savepv(calculate_LC_ALL(PL_curlocales)); + Safefree(PL_cur_LC_ALL); + PL_cur_LC_ALL = retval; - if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) { - are_all_categories_the_same_locale = FALSE; - } - } +# else - /* If they are the same, we don't actually have to construct the - * string; we just make the entry in LC_ALL_INDEX valid, and be - * that single name */ - if (are_all_categories_the_same_locale) { - PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]); - return PL_curlocales[LC_ALL_INDEX]; - } + const char * retval = querylocale_c(LC_ALL); - names_len++; /* Trailing '\0' */ - SAVEFREEPV(Newx(all_string, names_len, char)); - *all_string = '\0'; +# endif - /* Then fill in the string */ - for (i = 0; i < LC_ALL_INDEX; i++) { + return retval; +} -# ifdef DEBUGGING +STATIC const char * +S_emulate_setlocale_i(pTHX_ - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); - } + /* Our internal index of the 'category' setlocale is called with */ + const unsigned int index, -# endif + const char * new_locale, /* The locale to set the category to */ + const recalc_lc_all_t recalc_LC_ALL, /* Explained below */ + const line_t line /* Called from this line number */ + ) +{ + PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I; + assert(index <= NOMINAL_LC_ALL_INDEX); - my_strlcat(all_string, category_names[i], names_len); - my_strlcat(all_string, "=", names_len); - my_strlcat(all_string, PL_curlocales[i], names_len); - my_strlcat(all_string, ";", names_len); - } + /* Otherwise could have undefined behavior, as the return of this function + * may be copied to this buffer, which this function could change in the + * middle of its work */ + assert(new_locale != PL_stdize_locale_buf); -# ifdef DEBUGGING + /* This function effectively performs a setlocale() on just the current + * thread; thus it is thread-safe. It does this by using the POSIX 2008 + * locale functions to emulate the behavior of setlocale(). Similar to + * regular setlocale(), the return from this function points to memory that + * can be overwritten by other system calls, so needs to be copied + * immediately if you need to retain it. The difference here is that + * system calls besides another setlocale() can overwrite it. + * + * By doing this, most locale-sensitive functions become thread-safe. The + * exceptions are mostly those that return a pointer to static memory. + * + * This function may be called in a tight loop that iterates over all + * categories. Because LC_ALL is not a "real" category, but merely the sum + * of all the other ones, such loops don't include LC_ALL. On systems that + * have querylocale() or similar, the current LC_ALL value is immediately + * retrievable; on systems lacking that feature, we have to keep track of + * LC_ALL ourselves. We could do that on each iteration, only to throw it + * away on the next, but the calculation is more than a trivial amount of + * work. Instead, the 'recalc_LC_ALL' parameter is set to + * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once. + * This function calls itself recursively in such a loop. + * + * When not in such a loop, the parameter is set to the other enum values + * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */ - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string); - } + int mask = category_masks[index]; + const locale_t entry_obj = uselocale((locale_t) 0); + const char * locale_on_entry = querylocale_i(index); - #endif + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "emulate_setlocale_i input=%d (%s), mask=0x%x," + " new locale=\"%s\", current locale=\"%s\"," + "index=%d, object=%p\n", + categories[index], category_names[index], mask, + ((new_locale == NULL) ? "(nil)" : new_locale), + locale_on_entry, index, entry_obj)); + + /* Return the already-calculated info if just querying what the existing + * locale is */ + if (new_locale == NULL) { + return locale_on_entry; + } - return all_string; + /* Here, trying to change the locale, but it is a no-op if the new boss is + * the same as the old boss. Except this routine is called when converting + * from the global locale, so in that case we will create a per-thread + * locale below (with the current values). It also seemed that newlocale() + * could free up the basis locale memory if we called it with the new and + * old being the same, but khw now thinks that this was due to some other + * bug, since fixed, as there are other places where newlocale() gets + * similarly called without problems. */ + if ( entry_obj != LC_GLOBAL_LOCALE + && locale_on_entry + && strEQ(new_locale, locale_on_entry)) + { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): emulate_setlocale_i" + " no-op to change to what it already was\n", + line)); + +# ifdef USE_PL_CURLOCALES + + /* On the final iteration of a loop that needs to recalculate LC_ALL, do + * so. If no iteration changed anything, LC_ALL also doesn't change, + * but khw believes the complexity needed to keep track of that isn't + * worth it. */ + if (UNLIKELY( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION + && index == NOMINAL_LC_ALL_INDEX - 1)) + { + Safefree(PL_cur_LC_ALL); + PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales)); } -# ifdef EINVAL +# endif - SETERRNO(EINVAL, LIB_INVARG); + return locale_on_entry; + } -# endif +# ifndef USE_QUERYLOCALE - return NULL; + /* Without a querylocale() mechanism, we have to figure out ourselves what + * happens with setting a locale to "" */ + if (strEQ(new_locale, "")) { + new_locale = find_locale_from_environment(index); + } # endif - } /* End of this being setlocale(LC_foo, NULL) */ + /* So far, it has worked that a semi-colon in the locale name means that + * the category is LC_ALL and it subsumes categories which don't all have + * the same locale. This is the glibc syntax. */ + if (strchr(new_locale, ';')) { + assert(index == LC_ALL_INDEX_); + return setlocale_from_aggregate_LC_ALL(new_locale, line); + } - /* Here, we are switching locales. */ +# ifdef HAS_GLIBC_LC_MESSAGES_BUG -# ifndef HAS_QUERYLOCALE + /* For this bug, if the LC_MESSAGES locale changes, we have to do an + * expensive workaround. Save the current value so we can later determine + * if it changed. */ + const char * old_messages_locale = NULL; + if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_) + && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT)) + { + old_messages_locale = querylocale_c(LC_MESSAGES); + } - if (strEQ(locale, "")) { +# endif - /* For non-querylocale() systems, we do the setting of "" ourselves to - * be sure that we really know what's going on. We follow the Linux - * documented behavior (but if that differs from the actual behavior, - * this won't work exactly as the OS implements). We go out and - * examine the environment based on our understanding of how the system - * works, and use that to figure things out */ + assert(PL_C_locale_obj); - const char * const lc_all = PerlEnv_getenv("LC_ALL"); + /* Now ready to switch to the input 'new_locale' */ - /* Use any "LC_ALL" environment variable, as it overrides everything - * else. */ - if (lc_all && strNE(lc_all, "")) { - locale = lc_all; - } - else { + /* Switching locales generally entails freeing the current one's space (at + * the C library's discretion), hence we can't be using that locale at the + * time of the switch (this wasn't obvious to khw from the man pages). So + * switch to a known locale object that we don't otherwise mess with. */ + if (! uselocale(PL_C_locale_obj)) { + + /* Not being able to change to the C locale is severe; don't keep + * going. */ + setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line); + NOT_REACHED; /* NOTREACHED */ + } - /* Otherwise, we need to dig deeper. Unless overridden, the - * default is the LANG environment variable; if it doesn't exist, - * then "C" */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): emulate_setlocale_i now using C" + " object=%p\n", line, PL_C_locale_obj)); - const char * default_name; + locale_t new_obj; - default_name = PerlEnv_getenv("LANG"); + /* We created a (never changing) object at start-up for LC_ALL being in the + * C locale. If this call is to switch to LC_ALL=>C, simply use that + * object. But in fact, we already have switched to it just above, in + * preparation for the general case. Since we're already there, no need to + * do further switching. */ + if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):" + " emulate_setlocale_i will stay" + " in C object\n", line)); + new_obj = PL_C_locale_obj; - if (! default_name || strEQ(default_name, "")) { - default_name = "C"; + /* And free the old object if it isn't a special one */ + if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) { + freelocale(entry_obj); + } + } + else { /* Here is the general case, not to LC_ALL=>C */ + locale_t basis_obj = entry_obj; + + /* Specially handle two objects */ + if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) { + + /* For these two objects, we make duplicates to hand to newlocale() + * below. For LC_GLOBAL_LOCALE, this is because newlocale() + * doesn't necessarily accept it as input (the results are + * undefined). For PL_C_locale_obj, it is so that it never gets + * modified, as otherwise newlocale() is free to do so */ + basis_obj = duplocale(basis_obj); + if (! basis_obj) { + locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed", + line)); + NOT_REACHED; /* NOTREACHED */ } - if (category != LC_ALL) { - const char * const name = PerlEnv_getenv(category_names[index]); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): emulate_setlocale_i" + " created %p by duping the input\n", + line, basis_obj)); + } - /* Here we are setting a single category. Assume will have the - * default name */ - locale = default_name; + /* Ready to create a new locale by modification of the existing one. + * + * NOTE: This code may incorrectly show up as a leak under the address + * sanitizer. We do not free this object under normal teardown, however + * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed. + */ + new_obj = newlocale(mask, new_locale, basis_obj); - /* But then look for an overriding environment variable */ - if (name && strNE(name, "")) { - locale = name; - } + if (! new_obj) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + " (%" LINE_Tf "): emulate_setlocale_i" + " creating new object from %p failed:" + " errno=%d\n", + line, basis_obj, GET_ERRNO)); + + /* Failed. Likely this is because the proposed new locale isn't + * valid on this system. But we earlier switched to the LC_ALL=>C + * locale in anticipation of it succeeding, Now have to switch + * back to the state upon entry */ + if (! uselocale(entry_obj)) { + setlocale_failure_panic_i(index, "switching back to", + locale_on_entry, __LINE__, line); + NOT_REACHED; /* NOTREACHED */ } - else { - bool did_override = FALSE; - unsigned int i; - - /* Here, we are getting LC_ALL. Any categories that don't have - * a corresponding environment variable set should be set to - * LANG, or to "C" if there is no LANG. If no individual - * categories differ from this, we can just set LC_ALL. This - * is buggy on systems that have extra categories that we don't - * know about. If there is an environment variable that sets - * that category, we won't know to look for it, and so our use - * of LANG or "C" improperly overrides it. On the other hand, - * if we don't do what is done here, and there is no - * environment variable, the category's locale should be set to - * LANG or "C". So there is no good solution. khw thinks the - * best is to look at systems to see what categories they have, - * and include them, and then to assume that we know the - * complete set */ - - for (i = 0; i < LC_ALL_INDEX; i++) { - const char * const env_override - = PerlEnv_getenv(category_names[i]); - const char * this_locale = ( env_override - && strNE(env_override, "")) - ? env_override - : default_name; - if (! emulate_setlocale(categories[i], this_locale, i, TRUE)) - { - return NULL; - } - if (strNE(this_locale, default_name)) { - did_override = TRUE; - } - } - - /* If all the categories are the same, we can set LC_ALL to - * that */ - if (! did_override) { - locale = default_name; - } - else { - - /* Here, LC_ALL is no longer valid, as some individual - * categories don't match it. We call ourselves - * recursively, as that will execute the code that - * generates the proper locale string for this situation. - * We don't do the remainder of this function, as that is - * to update our records, and we've just done that for the - * individual categories in the loop above, and doing so - * would cause LC_ALL to be done as well */ - return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE); +# ifdef USE_PL_CURLOCALES + + if (entry_obj == LC_GLOBAL_LOCALE) { + + /* Here, we are back in the global locale. We may never have + * set PL_curlocales. If the locale change had succeeded, the + * code would have then set them up, but since it didn't, do so + * here. khw isn't sure if this prevents some issues or not, + * This will calculate LC_ALL's entry only on the final + * iteration */ + POSIX_SETLOCALE_LOCK; + for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + update_PL_curlocales_i(i, + posix_setlocale(categories[i], NULL), + RECALCULATE_LC_ALL_ON_FINAL_INTERATION); } + POSIX_SETLOCALE_UNLOCK; } +# endif + + return NULL; } - } /* End of this being setlocale(LC_foo, "") */ - else if (strchr(locale, ';')) { - /* LC_ALL may actually incude a conglomeration of various categories. - * Without querylocale, this code uses the glibc (as of this writing) - * syntax for representing that, but that is not a stable API, and - * other platforms do it differently, so we have to handle all cases - * ourselves */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): emulate_setlocale_i created %p" + " while freeing %p\n", line, new_obj, basis_obj)); - unsigned int i; - const char * s = locale; - const char * e = locale + strlen(locale); - const char * p = s; - const char * category_end; - const char * name_start; - const char * name_end; - - /* If the string that gives what to set doesn't include all categories, - * the omitted ones get set to "C". To get this behavior, first set - * all the individual categories to "C", and override the furnished - * ones below */ - for (i = 0; i < LC_ALL_INDEX; i++) { - if (! emulate_setlocale(categories[i], "C", i, TRUE)) { - return NULL; - } + /* Here, successfully created an object representing the desired + * locale; now switch into it */ + if (! uselocale(new_obj)) { + freelocale(new_obj); + locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i" + " switching into new locale failed", + line)); } + } - while (s < e) { + /* Here, we are using 'new_obj' which matches the input 'new_locale'. */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): emulate_setlocale_i now using %p\n", + line, new_obj)); - /* Parse through the category */ - while (isWORDCHAR(*p)) { - p++; - } - category_end = p; +#ifdef MULTIPLICITY + PL_cur_locale_obj = new_obj; +#endif - if (*p++ != '=') { - Perl_croak(aTHX_ - "panic: %s: %d: Unexpected character in locale name '%02X", - __FILE__, __LINE__, *(p-1)); - } + /* We are done, except for updating our records (if the system doesn't keep + * them) and in the case of locale "", we don't actually know what the + * locale that got switched to is, as it came from the environment. So + * have to find it */ - /* Parse through the locale name */ - name_start = p; - while (p < e && *p != ';') { - if (! isGRAPH(*p)) { - Perl_croak(aTHX_ - "panic: %s: %d: Unexpected character in locale name '%02X", - __FILE__, __LINE__, *(p-1)); - } - p++; - } - name_end = p; +# ifdef USE_QUERYLOCALE - /* Space past the semi-colon */ - if (p < e) { - p++; - } + if (strEQ(new_locale, "")) { + new_locale = querylocale_i(index); + } - /* Find the index of the category name in our lists */ - for (i = 0; i < LC_ALL_INDEX; i++) { - char * individ_locale; + PERL_UNUSED_ARG(recalc_LC_ALL); - /* Keep going if this isn't the index. The strnNE() avoids a - * Perl_form(), but would fail if ever a category name could be - * a substring of another one, like if there were a - * "LC_TIME_DATE" */ - if strnNE(s, category_names[i], category_end - s) { - continue; - } +# else - /* If this index is for the single category we're changing, we - * have found the locale to set it to. */ - if (category == categories[i]) { - locale = Perl_form(aTHX_ "%.*s", - (int) (name_end - name_start), - name_start); - goto ready_to_set; - } + new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL); - assert(category == LC_ALL); - individ_locale = Perl_form(aTHX_ "%.*s", - (int) (name_end - name_start), name_start); - if (! emulate_setlocale(categories[i], individ_locale, i, TRUE)) - { - return NULL; - } - } +# endif +# ifdef HAS_GLIBC_LC_MESSAGES_BUG - s = p; + /* Invalidate the glibc cache of loaded translations if the locale has + * changed, see [perl #134264] */ + if (old_messages_locale) { + if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) { + textdomain(textdomain(NULL)); } + } - /* Here we have set all the individual categories by recursive calls. - * These collectively should have fixed up LC_ALL, so can just query - * what that now is */ - assert(category == LC_ALL); +# endif - return do_setlocale_c(LC_ALL, NULL); - } /* End of this being setlocale(LC_ALL, - "LC_CTYPE=foo;LC_NUMERIC=bar;...") */ + return new_locale; +} - ready_to_set: ; +#endif /* End of the various implementations of the setlocale and + querylocale macros used in the remainder of this program */ - /* Here at the end of having to deal with the absence of querylocale(). - * Some cases have already been fully handled by recursive calls to this - * function. But at this point, we haven't dealt with those, but are now - * prepared to, knowing what the locale name to set this category to is. - * This would have come for free if this system had had querylocale() */ +#ifdef USE_LOCALE -# endif /* end of ! querylocale */ +/* So far, the locale strings returned by modern 2008-compliant systems have + * been fine */ - assert(PL_C_locale_obj); +STATIC const char * +S_stdize_locale(pTHX_ const int category, + const char *input_locale, + const char **buf, + Size_t *buf_size, + const line_t caller_line) +{ + /* The return value of setlocale() is opaque, but is required to be usable + * as input to a future setlocale() to create the same state. + * Unfortunately not all systems are compliant. But most often they are of + * a very restricted set of forms that this file has been coded to expect. + * + * There are some outliers, though, that this function tries to tame: + * + * 1) A new-line. This function chomps any \n characters + * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is + * stripped. This form is legal for LC_ALL. When found in + * that category group, the function calls itself + * recursively on each possible component category to make + * sure the individual categories are ok. + * + * If no changes to the input were made, it is returned; otherwise the + * changed version is stored into memory at *buf, with *buf_size set to its + * new value, and *buf is returned. + */ - /* Switching locales generally entails freeing the current one's space (at - * the C library's discretion). We need to stop using that locale before - * the switch. So switch to a known locale object that we don't otherwise - * mess with. This returns the locale object in effect at the time of the - * switch. */ - old_obj = uselocale(PL_C_locale_obj); + const char * first_bad; + const char * retval; -# ifdef DEBUGGING + PERL_ARGS_ASSERT_STDIZE_LOCALE; - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj); + if (input_locale == NULL) { + return NULL; } -# endif + first_bad = strpbrk(input_locale, "=\n"); - if (! old_obj) { + /* Most likely, there isn't a problem with the input */ + if (LIKELY(! first_bad)) { + return input_locale; + } -# ifdef DEBUGGING +# ifdef LC_ALL + + /* But if there is, and the category is LC_ALL, we have to look at each + * component category */ + if (category == LC_ALL) { + const char * individ_locales[LC_ALL_INDEX_]; + bool made_changes = FALSE; + unsigned int i; - if (DEBUG_L_TEST || debug_initialization) { - dSAVE_ERRNO; - PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO); - RESTORE_ERRNO; + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + Size_t this_size = 0; + individ_locales[i] = stdize_locale(categories[i], + posix_setlocale(categories[i], + NULL), + &individ_locales[i], + &this_size, + caller_line); + + /* If the size didn't change, it means this category did not have + * to be adjusted, and individ_locales[i] points to the buffer + * returned by posix_setlocale(); we have to copy that before + * it's called again in the next iteration */ + if (this_size == 0) { + individ_locales[i] = savepv(individ_locales[i]); + } + else { + made_changes = TRUE; + } } -# endif + /* If all the individual categories were ok as-is, this was a false + * alarm. We must have seen an '=' which was a legal occurrence in + * this combination locale */ + if (! made_changes) { + retval = input_locale; /* The input can be returned unchanged */ + } + else { + retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size); + } - return NULL; + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + Safefree(individ_locales[i]); + } + + return retval; } -# ifdef DEBUGGING +# else /* else no LC_ALL */ - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: emulate_setlocale now using %p\n", - __FILE__, __LINE__, PL_C_locale_obj); - } + PERL_UNUSED_ARG(category); + PERL_UNUSED_ARG(caller_line); -# endif +# endif - /* If this call is to switch to the LC_ALL C locale, it already exists, and - * in fact, we already have switched to it (in preparation for what - * normally is to come). But since we're already there, continue to use - * it instead of trying to create a new locale */ - if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) { + /* Here, there was a problem in an individual category. This means that at + * least one adjustment will be necessary. Create a modifiable copy */ + retval = save_to_buffer(input_locale, buf, buf_size); -# ifdef DEBUGGING + if (*first_bad != '=') { - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: will stay in C object\n", __FILE__, __LINE__); - } + /* Translate the found position into terms of the copy */ + first_bad = retval + (first_bad - input_locale); + } + else { /* An '=' */ -# endif + /* It is unlikely that the return is so screwed-up that it contains + * multiple equals signs, but handle that case by stripping all of + * them. */ + const char * final_equals = strrchr(retval, '='); - new_obj = PL_C_locale_obj; + /* The length passed here causes the move to include the terminating + * NUL */ + Move(final_equals + 1, retval, strlen(final_equals), char); + + /* See if there are additional problems; if not, we're good to return. + * */ + first_bad = strpbrk(retval, "\n"); - /* We already had switched to the C locale in preparation for freeing - * 'old_obj' */ - if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) { - freelocale(old_obj); + if (! first_bad) { + return retval; } } - else { - /* If we weren't in a thread safe locale, set so that newlocale() below - * which uses 'old_obj', uses an empty one. Same for our reserved C - * object. The latter is defensive coding, so that, even if there is - * some bug, we will never end up trying to modify either of these, as - * if passed to newlocale(), they can be. */ - if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) { - old_obj = (locale_t) 0; - } - /* Ready to create a new locale by modification of the exising one */ - new_obj = newlocale(mask, locale, old_obj); + /* Here, the problem must be a \n. Get rid of it and what follows. + * (Originally, only a trailing \n was stripped. Unsure what to do if not + * trailing) */ + *((char *) first_bad) = '\0'; + return retval; +} - if (! new_obj) { - dSAVE_ERRNO; +#if defined(WIN32) || ( defined(USE_POSIX_2008_LOCALE) \ + && ! defined(USE_QUERYLOCALE)) -# ifdef DEBUGGING +STATIC const char * +S_find_locale_from_environment(pTHX_ const unsigned int index) +{ + /* NB: This function may actually change the locale on Windows. + * + * On Windows systems, the concept of the POSIX ordering of environment + * variables is missing. To increase portability of programs across + * platforms, the POSIX ordering is emulated on Windows. + * + * And on POSIX 2008 systems without querylocale(), it is problematic + * getting the results of the POSIX 2008 equivalent of + * setlocale(category, "") + * (which gets the locale from the environment). + * + * To ensure that we know exactly what those values are, we do the setting + * ourselves, using the documented algorithm (assuming the documentation is + * correct) rather than use "" as the locale. This will lead to results + * that differ from native behavior if the native behavior differs from the + * standard documented value, but khw believes it is better to know what's + * going on, even if different from native, than to just guess. + * + * Another option for the POSIX 2008 case would be, in a critical section, + * to save the global locale's current value, and do a straight + * setlocale(LC_ALL, ""). That would return our desired values, destroying + * the global locale's, which we would then restore. But that could cause + * races with any other thread that is using the global locale and isn't + * using the mutex. And, the only reason someone would have done that is + * because they are calling a library function, like in gtk, that calls + * setlocale(), and which can't be changed to use the mutex. That wouldn't + * be a problem if this were to be done before any threads had switched, + * say during perl construction time. But this code would still be needed + * for the general case. + * + * The Windows and POSIX 2008 differ in that the ultimate fallback is "C" + * in POSIX, and is the system default locale in Windows. To get that + * system default value, we actually have to call setlocale() on Windows. + */ - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: emulate_setlocale creating new object" - " failed: %d\n", __FILE__, __LINE__, GET_ERRNO); - } + /* We rely on PerlEnv_getenv() returning a mortalized copy */ + const char * const lc_all = PerlEnv_getenv("LC_ALL"); -# endif + /* Use any "LC_ALL" environment variable, as it overrides everything + * else. */ + if (lc_all && strNE(lc_all, "")) { + return lc_all; + } - if (! uselocale(old_obj)) { + /* If setting an individual category, use its corresponding value found in + * the environment, if any */ + if (index != LC_ALL_INDEX_) { + const char * const new_value = PerlEnv_getenv(category_names[index]); -# ifdef DEBUGGING + if (new_value && strNE(new_value, "")) { + return new_value; + } - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: switching back failed: %d\n", - __FILE__, __LINE__, GET_ERRNO); - } + /* If no corresponding environment variable, see if LANG exists. If + * so, use it. */ + const char * default_name = PerlEnv_getenv("LANG"); + if (default_name && strNE(default_name, "")) { + return default_name; + } + /* If no LANG, use "C" on POSIX 2008, the system default on Windows */ +# ifndef WIN32 + return "C"; +# else + return wrap_wsetlocale(categories[index], ""); # endif - } - RESTORE_ERRNO; - return NULL; - } + } -# ifdef DEBUGGING + /* Here is LC_ALL, and no LC_ALL environment variable. LANG is used as a + * default, but overridden for individual categories that have + * corresponding environment variables. If no LANG exists, the default is + * "C" on POSIX 2008, or the system default for the category on Windows. */ + const char * default_name = PerlEnv_getenv("LANG"); - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: emulate_setlocale created %p", - __FILE__, __LINE__, new_obj); - if (old_obj) { - PerlIO_printf(Perl_debug_log, - "; should have freed %p", old_obj); - } - PerlIO_printf(Perl_debug_log, "\n"); + /* Convert "" to NULL to save conditionals in the loop below */ + if (default_name != NULL && strEQ(default_name, "")) { + default_name = NULL; + } + + /* Loop through all the individual categories, setting each to any + * corresponding environment variable; or to the default if none exists for + * the category */ + const char * locale_names[LC_ALL_INDEX_]; + for (unsigned i = 0; i < LC_ALL_INDEX_; i++) { + const char * const env_override = PerlEnv_getenv(category_names[i]); + + if (env_override && strNE(env_override, "")) { + locale_names[i] = env_override; + } + else if (default_name) { + locale_names[i] = default_name; } + else { +# ifndef WIN32 + locale_names[i] = "C"; +# else + locale_names[i] = wrap_wsetlocale(categories[index], ""); # endif - /* And switch into it */ - if (! uselocale(new_obj)) { - dSAVE_ERRNO; + } -# ifdef DEBUGGING + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "find_locale_from_environment i=%d, name=%s, locale=%s\n", + i, category_names[i], locale_names[i])); + } - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: emulate_setlocale switching to new object" - " failed\n", __FILE__, __LINE__); - } + return calculate_LC_ALL(locale_names); +} +#endif +#if defined(WIN32) || defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL) + +STATIC +const char * + +# ifdef USE_QUERYLOCALE +S_calculate_LC_ALL(pTHX_ const locale_t cur_obj) +# else +S_calculate_LC_ALL(pTHX_ const char ** individ_locales) # endif - if (! uselocale(old_obj)) { +{ + /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed. + * querylocale(), on systems that have it, doesn't tend to work for LC_ALL. + * So we have to construct the answer ourselves based on the passed in + * data, which is either a locale_t object, for systems with querylocale(), + * or an array we keep updated to the proper values, otherwise. + * + * For Windows, we also may need to construct an LC_ALL when setting the + * locale to the system default. + * + * This function returns a mortalized string containing the locale name(s) + * of LC_ALL. + * + * If all individual categories are the same locale, we can just set LC_ALL + * to that locale. But if not, we have to create an aggregation of all the + * categories on the system. Platforms differ as to the syntax they use + * for these non-uniform locales for LC_ALL. Some use a '/' or other + * delimiter of the locales with a predetermined order of categories; a + * Configure probe would be needed to tell us how to decipher those. glibc + * and Windows use a series of name=value pairs, like + * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... + * This function returns that syntax, which is suitable for input to the + * Windows setlocale(). It could also be suitable for glibc, but because + * the non-Windows code is common to systems that use a different syntax, + * we don't depend on it for glibc. Instead we take care not to use the + * native setlocale() function on whatever non-Windows style is chosen. + * But, it would be possible for someone to call Perl_setlocale() using a + * native style we don't understand. So far no one has complained. + * + * For systems that have categories we don't know about, the algorithm + * below won't know about those missing categories, leading to potential + * bugs for code that looks at them. If there is an environment variable + * that sets that category, we won't know to look for it, and so our use of + * LANG or "C" improperly overrides it. On the other hand, if we don't do + * what is done here, and there is no environment variable, the category's + * locale should be set to LANG or "C". So there is no good solution. khw + * thinks the best is to make sure we have a complete list of possible + * categories, adding new ones as they show up on obscure platforms. + */ + + unsigned int i; + Size_t names_len = 0; + bool are_all_categories_the_same_locale = TRUE; + char * aggregate_locale; + char * previous_start = NULL; + char * this_start = NULL; + Size_t entry_len = 0; -# ifdef DEBUGGING + PERL_ARGS_ASSERT_CALCULATE_LC_ALL; - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: switching back failed: %d\n", - __FILE__, __LINE__, GET_ERRNO); - } + /* First calculate the needed size for the string listing the categories + * and their locales. */ + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { +# ifdef USE_QUERYLOCALE + const char * entry = querylocale_l(i, cur_obj); +# else + const char * entry = individ_locales[i]; # endif - } - freelocale(new_obj); - RESTORE_ERRNO; - return NULL; - } + names_len += strlen(category_names[i]) + + 1 /* '=' */ + + strlen(entry) + + 1; /* ';' */ } -# ifdef DEBUGGING + names_len++; /* Trailing '\0' */ - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: emulate_setlocale now using %p\n", - __FILE__, __LINE__, new_obj); - } + /* Allocate enough space for the aggregated string */ + Newxz(aggregate_locale, names_len, char); + SAVEFREEPV(aggregate_locale); + + /* Then fill it in */ + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + Size_t new_len; +# ifdef USE_QUERYLOCALE + const char * entry = querylocale_l(i, cur_obj); +# else + const char * entry = individ_locales[i]; # endif - /* We are done, except for updating our records (if the system doesn't keep - * them) and in the case of locale "", we don't actually know what the - * locale that got switched to is, as it came from the environment. So - * have to find it */ + new_len = my_strlcat(aggregate_locale, category_names[i], names_len); + assert(new_len <= names_len); + new_len = my_strlcat(aggregate_locale, "=", names_len); + assert(new_len <= names_len); -# ifdef HAS_QUERYLOCALE + this_start = aggregate_locale + strlen(aggregate_locale); + entry_len = strlen(entry); - if (strEQ(locale, "")) { - locale = querylocale(mask, new_obj); + new_len = my_strlcat(aggregate_locale, entry, names_len); + assert(new_len <= names_len); + new_len = my_strlcat(aggregate_locale, ";", names_len); + assert(new_len <= names_len); + PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */ + + if ( i > 0 + && are_all_categories_the_same_locale + && memNE(previous_start, this_start, entry_len + 1)) + { + are_all_categories_the_same_locale = FALSE; + } + else { + previous_start = this_start; + } } -# else + /* If they are all the same, just return any one of them */ + if (are_all_categories_the_same_locale) { + aggregate_locale = this_start; + aggregate_locale[entry_len] = '\0'; + } - /* Here, 'locale' is the return value */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "calculate_LC_ALL returning '%s'\n", + aggregate_locale)); - /* Without querylocale(), we have to update our records */ + return aggregate_locale; +} - if (category == LC_ALL) { - unsigned int i; +#endif +#if defined(USE_LOCALE) && ( defined(DEBUGGING) \ + || defined(USE_PERL_SWITCH_LOCALE_CONTEXT)) - /* For LC_ALL, we change all individual categories to correspond */ - /* PL_curlocales is a parallel array, so has same - * length as 'categories' */ - for (i = 0; i <= LC_ALL_INDEX; i++) { - Safefree(PL_curlocales[i]); - PL_curlocales[i] = savepv(locale); - } +STATIC const char * +S_get_LC_ALL_display(pTHX) +{ - FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX); - } - else { +# ifdef LC_ALL - /* For a single category, if it's not the same as the one in LC_ALL, we - * nullify LC_ALL */ + return querylocale_c(LC_ALL); - if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) { - Safefree(PL_curlocales[LC_ALL_INDEX]); - PL_curlocales[LC_ALL_INDEX] = NULL; - } +# else - /* Then update the category's record */ - Safefree(PL_curlocales[index]); - PL_curlocales[index] = savepv(locale); + const char * curlocales[NOMINAL_LC_ALL_INDEX]; - FIX_GLIBC_LC_MESSAGES_BUG(index); + for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + curlocales[i] = querylocale_i(i); } + return calculate_LC_ALL(curlocales); + # endif - return locale; } -#endif /* USE_POSIX_2008_LOCALE */ - -#ifdef USE_LOCALE +#endif STATIC void -S_set_numeric_radix(pTHX_ const bool use_locale) +S_setlocale_failure_panic_i(pTHX_ + const unsigned int cat_index, + const char * current, + const char * failed, + const line_t caller_0_line, + const line_t caller_1_line) { - /* If 'use_locale' is FALSE, set to use a dot for the radix character. If - * TRUE, use the radix character derived from the current locale */ - -#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ - || defined(HAS_NL_LANGINFO)) + dSAVE_ERRNO; + const int cat = categories[cat_index]; + const char * name = category_names[cat_index]; - const char * radix = (use_locale) - ? my_nl_langinfo(RADIXCHAR, FALSE) - /* FALSE => already in dest locale */ - : "."; + PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I; - sv_setpv(PL_numeric_radix_sv, radix); - - /* If this is valid UTF-8 that isn't totally ASCII, and we are in - * a UTF-8 locale, then mark the radix as being in UTF-8 */ - if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), - SvCUR(PL_numeric_radix_sv)) - && _is_cur_LC_category_utf8(LC_NUMERIC)) - { - SvUTF8_on(PL_numeric_radix_sv); + if (current == NULL) { + current = querylocale_i(cat_index); } -# ifdef DEBUGGING - - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", - SvPVX(PL_numeric_radix_sv), - cBOOL(SvUTF8(PL_numeric_radix_sv))); - } + Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf + "): Can't change locale for %s(%d)" + " from '%s' to '%s'", + caller_1_line, name, cat, + current, failed), + __FILE__, caller_0_line, GET_ERRNO); + NOT_REACHED; /* NOTREACHED */ +} +/* Any of these will allow us to find the RADIX */ +# if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LANGINFO) \ + || defined(HAS_LOCALECONV) \ + || defined(HAS_SNPRINTF)) +# define CAN_CALCULATE_RADIX # endif -#else - - PERL_UNUSED_ARG(use_locale); - -#endif /* USE_LOCALE_NUMERIC and can find the radix char */ - -} +# ifdef USE_LOCALE_NUMERIC STATIC void -S_new_numeric(pTHX_ const char *newnum) +S_new_numeric(pTHX_ const char *newnum, bool force) { - -#ifndef USE_LOCALE_NUMERIC - - PERL_UNUSED_ARG(newnum); - -#else + PERL_ARGS_ASSERT_NEW_NUMERIC; /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell - * core Perl this and that 'newnum' is the name of the new locale. - * It installs this locale as the current underlying default. + * core Perl this and that 'newnum' is the name of the new locale, and we + * are switched into it. It installs this locale as the current underlying + * default, and then switches to the C locale, if necessary, so that the + * code that has traditionally expected the radix character to be a dot may + * continue to do so. * * The default locale and the C locale can be toggled between by use of the * set_numeric_underlying() and set_numeric_standard() functions, which @@ -1285,8 +1866,8 @@ S_new_numeric(pTHX_ const char *newnum) * SET_NUMERIC_STANDARD() in perl.h. * * The toggling is necessary mainly so that a non-dot radix decimal point - * character can be output, while allowing internal calculations to use a - * dot. + * character can be input and output, while allowing internal calculations + * to use a dot. * * This sets several interpreter-level variables: * PL_numeric_name The underlying locale's name: a copy of 'newnum' @@ -1305,101 +1886,155 @@ S_new_numeric(pTHX_ const char *newnum) * variables are true at the same time. (Toggling is a * no-op under these circumstances.) This variable is * used to avoid having to recalculate. + * PL_numeric_radix_sv Contains the string that code should use for the + * decimal point. It is set to either a dot or the + * program's underlying locale's radix character string, + * depending on the situation. + * PL_underlying_radix_sv Contains the program's underlying locale's radix + * character string. This is copied into + * PL_numeric_radix_sv when the situation warrants. It + * exists to avoid having to recalculate it when toggling. + * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object + * with everything set up properly so as to avoid work on + * such platforms. */ - char *save_newnum; + DEBUG_L( PerlIO_printf(Perl_debug_log, + "Called new_numeric with %s, PL_numeric_name=%s\n", + newnum, PL_numeric_name)); + + /* If not forcing this procedure, and there isn't actually a change from + * our records, do nothing. (Our records can be wrong when sync'ing to the + * locale set up by an external library, hence the 'force' parameter) */ + if (! force && strEQ(PL_numeric_name, newnum)) { + return; + } + + Safefree(PL_numeric_name); + PL_numeric_name = savepv(newnum); - if (! newnum) { - Safefree(PL_numeric_name); - PL_numeric_name = NULL; + /* Handle the trivial case. Since this is called at process + * initialization, be aware that this bit can't rely on much being + * available. */ + if (isNAME_C_OR_POSIX(PL_numeric_name)) { PL_numeric_standard = TRUE; - PL_numeric_underlying = TRUE; PL_numeric_underlying_is_standard = TRUE; + PL_numeric_underlying = TRUE; + sv_setpv(PL_numeric_radix_sv, C_decimal_point); + sv_setpv(PL_underlying_radix_sv, C_decimal_point); return; } - save_newnum = stdize_locale(savepv(newnum)); + /* We are in the underlying locale until changed at the end of this + * function */ PL_numeric_underlying = TRUE; - PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); -#ifndef TS_W32_BROKEN_LOCALECONV +# ifdef USE_POSIX_2008_LOCALE - /* If its name isn't C nor POSIX, it could still be indistinguishable from - * them. But on broken Windows systems calling my_nl_langinfo() for - * THOUSEP can currently (but rarely) cause a race, so avoid doing that, - * and just always change the locale if not C nor POSIX on those systems */ - if (! PL_numeric_standard) { - PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR, - FALSE /* Don't toggle locale */ )) - && strEQ("", my_nl_langinfo(THOUSEP, FALSE))); - } + /* We keep a special object for easy switching to. + * + * NOTE: This code may incorrectly show up as a leak under the address + * sanitizer. We do not free this object under normal teardown, however + * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed. + */ + PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, + PL_numeric_name, + PL_underlying_numeric_obj); -#endif +# endif - /* Save the new name if it isn't the same as the previous one, if any */ - if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { - Safefree(PL_numeric_name); - PL_numeric_name = save_newnum; - } - else { - Safefree(save_newnum); - } + const char * radix = NULL; + utf8ness_t utf8ness = UTF8NESS_IMMATERIAL; - PL_numeric_underlying_is_standard = PL_numeric_standard; + /* Find and save this locale's radix character. */ + my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name, + &radix, NULL, &utf8ness); + sv_setpv(PL_underlying_radix_sv, radix); -# ifdef HAS_POSIX_2008_LOCALE + if (utf8ness == UTF8NESS_YES) { + SvUTF8_on(PL_underlying_radix_sv); + } - PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, - PL_numeric_name, - PL_underlying_numeric_obj); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Locale radix is '%s', ?UTF-8=%d\n", + SvPVX(PL_underlying_radix_sv), + cBOOL(SvUTF8(PL_underlying_radix_sv)))); -#endif + /* This locale is indistinguishable from C (for numeric purposes) if both + * the radix character and the thousands separator are the same as C's. + * Start with the radix. */ + PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix); + Safefree(radix); - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); - } +# ifndef TS_W32_BROKEN_LOCALECONV - /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't - * have to worry about the radix being a non-dot. (Core operations that - * need the underlying locale change to it temporarily). */ - if (PL_numeric_standard) { - set_numeric_radix(0); + /* If the radix isn't the same as C's, we know it is distinguishable from + * C; otherwise check the thousands separator too. Only if both are the + * same as C's is the locale indistinguishable from C. + * + * But on earlier Windows versions, there is a potential race. This code + * knows that localeconv() (elsewhere in this file) will be used to extract + * the needed value, and localeconv() was buggy for quite a while, and that + * code in this file hence uses a workaround. And that workaround may have + * an (unlikely) race. Gathering the radix uses a different workaround on + * Windows that doesn't involve a race. It might be possible to do the + * same for this (patches welcome). + * + * Until then khw doesn't think it's worth even the small risk of a race to + * get this value, which doesn't appear to be used in any of the Microsoft + * library routines anyway. */ + + const char * scratch_buffer = NULL; + if (PL_numeric_underlying_is_standard) { + PL_numeric_underlying_is_standard = strEQ(C_thousands_sep, + my_langinfo_c(THOUSEP, LC_NUMERIC, + PL_numeric_name, + &scratch_buffer, + NULL, NULL)); } - else { + Safefree(scratch_buffer); + +# else + PERL_UNUSED_VAR(C_thousands_sep); +# endif + + PL_numeric_standard = PL_numeric_underlying_is_standard; + + /* Keep LC_NUMERIC so that it has the C locale radix and thousands + * separator. This is for XS modules, so they don't have to worry about + * the radix being a non-dot. (Core operations that need the underlying + * locale change to it temporarily). */ + if (! PL_numeric_standard) { set_numeric_standard(); } -#endif /* USE_LOCALE_NUMERIC */ - } +# endif + void Perl_set_numeric_standard(pTHX) { -#ifdef USE_LOCALE_NUMERIC - - /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like - * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The - * macro avoids calling this routine if toggling isn't necessary according - * to our records (which could be wrong if some XS code has changed the - * locale behind our back) */ - -# ifdef DEBUGGING +# ifdef USE_LOCALE_NUMERIC - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "Setting LC_NUMERIC locale to standard C\n"); - } + /* Unconditionally toggle the LC_NUMERIC locale to the C locale + * + * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h + * instead of calling this directly. The macro avoids calling this routine + * if toggling isn't necessary according to our records (which could be + * wrong if some XS code has changed the locale behind our back) */ -# endif + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Setting LC_NUMERIC locale to standard C\n")); - do_setlocale_c(LC_NUMERIC, "C"); + void_setlocale_c(LC_NUMERIC, "C"); PL_numeric_standard = TRUE; + sv_setpv(PL_numeric_radix_sv, C_decimal_point); + PL_numeric_underlying = PL_numeric_underlying_is_standard; - set_numeric_radix(0); -#endif /* USE_LOCALE_NUMERIC */ +# endif /* USE_LOCALE_NUMERIC */ } @@ -1407,65 +2042,50 @@ void Perl_set_numeric_underlying(pTHX) { -#ifdef USE_LOCALE_NUMERIC +# ifdef USE_LOCALE_NUMERIC - /* Toggle the LC_NUMERIC locale to the current underlying default. Most - * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h + /* Unconditionally toggle the LC_NUMERIC locale to the current underlying + * default. + * + * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h * instead of calling this directly. The macro avoids calling this routine * if toggling isn't necessary according to our records (which could be * wrong if some XS code has changed the locale behind our back) */ -# ifdef DEBUGGING - - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "Setting LC_NUMERIC locale to %s\n", - PL_numeric_name); - } + DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n", + PL_numeric_name)); -# endif + void_setlocale_c(LC_NUMERIC, PL_numeric_name); + PL_numeric_underlying = TRUE; + sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv); - do_setlocale_c(LC_NUMERIC, PL_numeric_name); PL_numeric_standard = PL_numeric_underlying_is_standard; - PL_numeric_underlying = TRUE; - set_numeric_radix(! PL_numeric_standard); -#endif /* USE_LOCALE_NUMERIC */ +# endif /* USE_LOCALE_NUMERIC */ } -/* - * Set up for a new ctype locale. - */ +# ifdef USE_LOCALE_CTYPE + STATIC void -S_new_ctype(pTHX_ const char *newctype) +S_new_ctype(pTHX_ const char *newctype, bool force) { - -#ifndef USE_LOCALE_CTYPE - - PERL_UNUSED_ARG(newctype); - PERL_UNUSED_CONTEXT; - -#else + PERL_ARGS_ASSERT_NEW_CTYPE; + PERL_UNUSED_ARG(force); /* Called after each libc setlocale() call affecting LC_CTYPE, to tell * core Perl this and that 'newctype' is the name of the new locale. * * This function sets up the folding arrays for all 256 bytes, assuming * that tofold() is tolc() since fold case is not a concept in POSIX, - * - * Any code changing the locale (outside this file) should use - * Perl_setlocale or POSIX::setlocale, which call this function. Therefore - * this function should be called directly only from this file and from - * POSIX::setlocale() */ + */ - unsigned int i; + DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype)); - /* Don't check for problems if we are suppressing the warnings */ - bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); - bool maybe_utf8_turkic = FALSE; - - PERL_ARGS_ASSERT_NEW_CTYPE; + /* No change means no-op */ + if (strEQ(PL_ctype_name, newctype)) { + return; + } /* We will replace any bad locale warning with 1) nothing if the new one is * ok; or 2) a new warning for the bad new locale */ @@ -1474,52 +2094,196 @@ S_new_ctype(pTHX_ const char *newctype) PL_warn_locale = NULL; } - PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE); + /* Clear cache */ + Safefree(PL_ctype_name); + PL_ctype_name = ""; + + PL_in_utf8_turkic_locale = FALSE; + + /* For the C locale, just use the standard folds, and we know there are no + * glitches possible, so return early. Since this is called at process + * initialization, be aware that this bit can't rely on much being + * available. */ + if (isNAME_C_OR_POSIX(newctype)) { + Copy(PL_fold, PL_fold_locale, 256, U8); + PL_ctype_name = savepv(newctype); + PL_in_utf8_CTYPE_locale = FALSE; + return; + } + + /* The cache being cleared signals this function to compute a new value */ + PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype); + + PL_ctype_name = savepv(newctype); + bool maybe_utf8_turkic = FALSE; + + /* Don't check for problems if we are suppressing the warnings */ + bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); - /* A UTF-8 locale gets standard rules. But note that code still has to - * handle this specially because of the three problematic code points */ if (PL_in_utf8_CTYPE_locale) { + + /* A UTF-8 locale gets standard rules. But note that code still has to + * handle this specially because of the three problematic code points + * */ Copy(PL_fold_latin1, PL_fold_locale, 256, U8); /* UTF-8 locales can have special handling for 'I' and 'i' if they are - * Turkic. Make sure these two are the only anomalies. (We don't use - * towupper and towlower because they aren't in C89.) */ + * Turkic. Make sure these two are the only anomalies. (We don't + * require towupper and towlower because they aren't in C89.) */ -#if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER) +# if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER) - if (towupper('i') == 0x130 && towlower('I') == 0x131) { + if (towupper('i') == 0x130 && towlower('I') == 0x131) -#else +# else - if (toupper('i') == 'i' && tolower('I') == 'I') { + if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I') -#endif +# endif + + { + /* This is how we determine it really is Turkic */ check_for_problems = TRUE; maybe_utf8_turkic = TRUE; } } + else { /* Not a canned locale we know the values for. Compute them */ + +# ifdef DEBUGGING + + bool has_non_ascii_fold = FALSE; + bool found_unexpected = FALSE; + + /* Under -DLv, see if there are any folds outside the ASCII range. + * This factoid is used below */ + if (DEBUG_Lv_TEST) { + for (unsigned i = 128; i < 256; i++) { + int j = LATIN1_TO_NATIVE(i); + if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) { + has_non_ascii_fold = TRUE; + break; + } + } + } + +# endif + + for (unsigned i = 0; i < 256; i++) { + if (isU8_UPPER_LC(i)) + PL_fold_locale[i] = (U8) toU8_LOWER_LC(i); + else if (isU8_LOWER_LC(i)) + PL_fold_locale[i] = (U8) toU8_UPPER_LC(i); + else + PL_fold_locale[i] = (U8) i; + +# ifdef DEBUGGING + + /* Most locales these days are supersets of ASCII. When debugging + * with -DLv, it is helpful to know what the exceptions to that are + * in this locale */ + if (DEBUG_Lv_TEST) { + bool unexpected = FALSE; + + if (isUPPER_L1(i)) { + if (isUPPER_A(i)) { + if (PL_fold_locale[i] != toLOWER_A(i)) { + unexpected = TRUE; + } + } + else if (has_non_ascii_fold) { + if (PL_fold_locale[i] != toLOWER_L1(i)) { + unexpected = TRUE; + } + } + else if (PL_fold_locale[i] != i) { + unexpected = TRUE; + } + } + else if ( isLOWER_L1(i) + && i != LATIN_SMALL_LETTER_SHARP_S + && i != MICRO_SIGN) + { + if (isLOWER_A(i)) { + if (PL_fold_locale[i] != toUPPER_A(i)) { + unexpected = TRUE; + } + } + else if (has_non_ascii_fold) { + if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) { + unexpected = TRUE; + } + } + else if (PL_fold_locale[i] != i) { + unexpected = TRUE; + } + } + else if (PL_fold_locale[i] != i) { + unexpected = TRUE; + } + + if (unexpected) { + found_unexpected = TRUE; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "For %s, fold of %02x is %02x\n", + newctype, i, PL_fold_locale[i])); + } + } + } + + if (found_unexpected) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "All bytes not mentioned above either fold to" + " themselves or are the expected ASCII or" + " Latin1 ones\n")); + } + else { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "No nonstandard folds were found\n")); +# endif + + } + } + +# ifdef MB_CUR_MAX + + /* We only handle single-byte locales (outside of UTF-8 ones); so if this + * locale requires more than one byte, there are going to be BIG problems. + * */ + + if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale + + /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale. + * Just assume that the implementation for them (plus for POSIX) is + * correct and the > 1 value is spurious. (Since these are + * specially handled to never be considered UTF-8 locales, as long + * as this is the only problem, everything should work fine */ + && ! isNAME_C_OR_POSIX(newctype)) + { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX)); + + Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE), + "Locale '%s' is unsupported, and may crash the" + " interpreter.\n", + newctype); + } + +# endif + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n", + check_for_problems)); /* We don't populate the other lists if a UTF-8 locale, but do check that * everything works as expected, unless checking turned off */ - if (check_for_problems || ! PL_in_utf8_CTYPE_locale) { + if (check_for_problems) { /* Assume enough space for every character being bad. 4 spaces each * for the 94 printable characters that are output like "'x' "; and 5 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating * NUL */ char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' }; - bool multi_byte_locale = FALSE; /* Assume is a single-byte locale - to start */ unsigned int bad_count = 0; /* Count of bad characters */ - for (i = 0; i < 256; i++) { - if (! PL_in_utf8_CTYPE_locale) { - if (isupper(i)) - PL_fold_locale[i] = (U8) tolower(i); - else if (islower(i)) - PL_fold_locale[i] = (U8) toupper(i); - else - PL_fold_locale[i] = (U8) i; - } + for (unsigned i = 0; i < 256; i++) { /* If checking for locale problems, see if the native ASCII-range * printables plus \n and \t are in their expected categories in @@ -1530,9 +2294,7 @@ S_new_ctype(pTHX_ const char *newctype) * nowadays. It isn't a problem for most controls to be changed * into something else; we check only \n and \t, though perhaps \r * could be an issue as well. */ - if ( check_for_problems - && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) - { + if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') { bool is_bad = FALSE; char name[4] = { '\0' }; @@ -1553,77 +2315,79 @@ S_new_ctype(pTHX_ const char *newctype) } /* Check each possibe class */ - if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) { + if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) != + cBOOL(isALPHANUMERIC_A(i)))) + { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isalnum('%s') unexpectedly is %d\n", - name, cBOOL(isalnum(i)))); + "isalnum('%s') unexpectedly is %x\n", + name, cBOOL(isU8_ALPHANUMERIC_LC(i)))); } - if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) { + if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isalpha('%s') unexpectedly is %d\n", - name, cBOOL(isalpha(i)))); + "isalpha('%s') unexpectedly is %x\n", + name, cBOOL(isU8_ALPHA_LC(i)))); } - if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) { + if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isdigit('%s') unexpectedly is %d\n", - name, cBOOL(isdigit(i)))); + "isdigit('%s') unexpectedly is %x\n", + name, cBOOL(isU8_DIGIT_LC(i)))); } - if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) { + if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isgraph('%s') unexpectedly is %d\n", - name, cBOOL(isgraph(i)))); + "isgraph('%s') unexpectedly is %x\n", + name, cBOOL(isU8_GRAPH_LC(i)))); } - if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) { + if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "islower('%s') unexpectedly is %d\n", - name, cBOOL(islower(i)))); + "islower('%s') unexpectedly is %x\n", + name, cBOOL(isU8_LOWER_LC(i)))); } - if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) { + if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isprint('%s') unexpectedly is %d\n", - name, cBOOL(isprint(i)))); + "isprint('%s') unexpectedly is %x\n", + name, cBOOL(isU8_PRINT_LC(i)))); } - if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) { + if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "ispunct('%s') unexpectedly is %d\n", - name, cBOOL(ispunct(i)))); + "ispunct('%s') unexpectedly is %x\n", + name, cBOOL(isU8_PUNCT_LC(i)))); } - if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) { + if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isspace('%s') unexpectedly is %d\n", - name, cBOOL(isspace(i)))); + "isspace('%s') unexpectedly is %x\n", + name, cBOOL(isU8_SPACE_LC(i)))); } - if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) { + if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isupper('%s') unexpectedly is %d\n", - name, cBOOL(isupper(i)))); + "isupper('%s') unexpectedly is %x\n", + name, cBOOL(isU8_UPPER_LC(i)))); } - if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) { + if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, - "isxdigit('%s') unexpectedly is %d\n", - name, cBOOL(isxdigit(i)))); + "isxdigit('%s') unexpectedly is %x\n", + name, cBOOL(isU8_XDIGIT_LC(i)))); } - if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) { + if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, "tolower('%s')=0x%x instead of the expected 0x%x\n", - name, tolower(i), (int) toLOWER_A(i))); + name, toU8_LOWER_LC(i), (int) toLOWER_A(i))); } - if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) { + if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) { is_bad = TRUE; DEBUG_L(PerlIO_printf(Perl_debug_log, "toupper('%s')=0x%x instead of the expected 0x%x\n", - name, toupper(i), (int) toUPPER_A(i))); + name, toU8_UPPER_LC(i), (int) toUPPER_A(i))); } if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) { is_bad = TRUE; @@ -1645,46 +2409,25 @@ S_new_ctype(pTHX_ const char *newctype) if (bad_count == 2 && maybe_utf8_turkic) { bad_count = 0; *bad_chars_list = '\0'; - PL_fold_locale['I'] = 'I'; - PL_fold_locale['i'] = 'i'; - PL_in_utf8_turkic_locale = TRUE; - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n", - __FILE__, __LINE__, newctype)); - } - else { - PL_in_utf8_turkic_locale = FALSE; - } -# ifdef MB_CUR_MAX - - /* We only handle single-byte locales (outside of UTF-8 ones; so if - * this locale requires more than one byte, there are going to be - * problems. */ - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n", - __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX)); - - if ( check_for_problems && MB_CUR_MAX > 1 - && ! PL_in_utf8_CTYPE_locale - - /* Some platforms return MB_CUR_MAX > 1 for even the "C" - * locale. Just assume that the implementation for them (plus - * for POSIX) is correct and the > 1 value is spurious. (Since - * these are specially handled to never be considered UTF-8 - * locales, as long as this is the only problem, everything - * should work fine */ - && strNE(newctype, "C") && strNE(newctype, "POSIX")) - { - multi_byte_locale = TRUE; + /* The casts are because otherwise some compilers warn: + gcc.gnu.org/bugzilla/show_bug.cgi?id=99950 + gcc.gnu.org/bugzilla/show_bug.cgi?id=94182 + */ + PL_fold_locale[ (U8) 'I' ] = 'I'; + PL_fold_locale[ (U8) 'i' ] = 'i'; + PL_in_utf8_turkic_locale = TRUE; + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype)); } -# endif - /* If we found problems and we want them output, do so */ - if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale)) + if ( (UNLIKELY(bad_count)) && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST))) { - if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) { + /* WARNING. If you change the wording of these; be sure to update + * t/loc_tools.pl correspondingly */ + + if (PL_in_utf8_CTYPE_locale) { PL_warn_locale = Perl_newSVpvf(aTHX_ "Locale '%s' contains (at least) the following characters" " which have\nunexpected meanings: %s\nThe Perl program" @@ -1692,29 +2435,24 @@ S_new_ctype(pTHX_ const char *newctype) newctype, bad_chars_list); } else { - PL_warn_locale = Perl_newSVpvf(aTHX_ - "Locale '%s' may not work well.%s%s%s\n", - newctype, - (multi_byte_locale) - ? " Some characters in it are not recognized by" - " Perl." - : "", - (bad_count) - ? "\nThe following characters (and maybe others)" - " may not have the same meaning as the Perl" - " program expects:\n" - : "", - (bad_count) - ? bad_chars_list - : "" + PL_warn_locale = + Perl_newSVpvf(aTHX_ + "\nThe following characters (and maybe" + " others) may not have the same meaning as" + " the Perl program expects: %s\n", + bad_chars_list ); } -# ifdef HAS_NL_LANGINFO +# ifdef HAS_SOME_LANGINFO + const char * scratch_buffer = NULL; Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", - /* parameter FALSE is a don't care here */ - my_nl_langinfo(CODESET, FALSE)); + my_langinfo_c(CODESET, LC_CTYPE, + newctype, + &scratch_buffer, NULL, + NULL)); + Safefree(scratch_buffer); # endif @@ -1728,7 +2466,8 @@ S_new_ctype(pTHX_ const char *newctype) if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) { /* The '0' below suppresses a bogus gcc compiler warning */ - Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); + Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), + 0); if (IN_LC(LC_CTYPE)) { SvREFCNT_dec_NN(PL_warn_locale); @@ -1737,22 +2476,21 @@ S_new_ctype(pTHX_ const char *newctype) } } } - -#endif /* USE_LOCALE_CTYPE */ - } +# endif /* USE_LOCALE_CTYPE */ + void Perl__warn_problematic_locale() { -#ifdef USE_LOCALE_CTYPE +# ifdef USE_LOCALE_CTYPE dTHX; /* Internal-to-core function that outputs the message in PL_warn_locale, * and then NULLS it. Should be called only through the macro - * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ + * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */ if (PL_warn_locale) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -1762,26 +2500,39 @@ Perl__warn_problematic_locale() PL_warn_locale = NULL; } -#endif +# endif } STATIC void -S_new_collate(pTHX_ const char *newcoll) +S_new_LC_ALL(pTHX_ const char *unused, bool force) { + PERL_ARGS_ASSERT_NEW_LC_ALL; + PERL_UNUSED_ARG(unused); -#ifndef USE_LOCALE_COLLATE + /* LC_ALL updates all the things we care about. */ + + for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + if (update_functions[i]) { + const char * this_locale = querylocale_i(i); + update_functions[i](aTHX_ this_locale, force); + } + } +} - PERL_UNUSED_ARG(newcoll); - PERL_UNUSED_CONTEXT; +# ifdef USE_LOCALE_COLLATE -#else +STATIC void +S_new_collate(pTHX_ const char *newcoll, bool force) +{ + PERL_ARGS_ASSERT_NEW_COLLATE; + PERL_UNUSED_ARG(force); /* Called after each libc setlocale() call affecting LC_COLLATE, to tell * core Perl this and that 'newcoll' is the name of the new locale. * * The design of locale collation is that every locale change is given an - * index 'PL_collation_ix'. The first time a string particpates in an + * index 'PL_collation_ix'. The first time a string participates in an * operation that requires collation while locale collation is active, it * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That * magic includes the collation index, and the transformation of the string @@ -1794,14 +2545,23 @@ S_new_collate(pTHX_ const char *newcoll) * that a transformation would improperly be considered valid, leading to * an unlikely bug */ - if (! newcoll) { - if (PL_collation_name) { - ++PL_collation_ix; - Safefree(PL_collation_name); - PL_collation_name = NULL; - } - PL_collation_standard = TRUE; - is_standard_collation: + /* Return if the locale isn't changing */ + if (strEQ(PL_collation_name, newcoll)) { + return; + } + + Safefree(PL_collation_name); + PL_collation_name = savepv(newcoll); + ++PL_collation_ix; + + /* Set the new one up if trivial. Since this is called at process + * initialization, be aware that this bit can't rely on much being + * available. */ + PL_collation_standard = isNAME_C_OR_POSIX(newcoll); + if (PL_collation_standard) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "Setting PL_collation name='%s'\n", + PL_collation_name)); PL_collxfrm_base = 0; PL_collxfrm_mult = 2; PL_in_utf8_COLLATE_locale = FALSE; @@ -1810,240 +2570,105 @@ S_new_collate(pTHX_ const char *newcoll) return; } - /* If this is not the same locale as currently, set the new one up */ - if (! PL_collation_name || strNE(PL_collation_name, newcoll)) { - ++PL_collation_ix; - Safefree(PL_collation_name); - PL_collation_name = stdize_locale(savepv(newcoll)); - PL_collation_standard = isNAME_C_OR_POSIX(newcoll); - if (PL_collation_standard) { - goto is_standard_collation; - } + /* Flag that the remainder of the set up is being deferred until first + * need. */ + PL_collxfrm_mult = 0; + PL_collxfrm_base = 0; - PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE); - PL_strxfrm_NUL_replacement = '\0'; - PL_strxfrm_max_cp = 0; - - /* A locale collation definition includes primary, secondary, tertiary, - * etc. weights for each character. To sort, the primary weights are - * used, and only if they compare equal, then the secondary weights are - * used, and only if they compare equal, then the tertiary, etc. - * - * strxfrm() works by taking the input string, say ABC, and creating an - * output transformed string consisting of first the primary weights, - * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the - * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters - * may not have weights at every level. In our example, let's say B - * doesn't have a tertiary weight, and A doesn't have a secondary - * weight. The constructed string is then going to be - * A¹B¹C¹ B²C² A³C³ .... - * This has the desired effect that strcmp() will look at the secondary - * or tertiary weights only if the strings compare equal at all higher - * priority weights. The spaces shown here, like in - * "A¹B¹C¹ A²B²C² " - * are not just for readability. In the general case, these must - * actually be bytes, which we will call here 'separator weights'; and - * they must be smaller than any other weight value, but since these - * are C strings, only the terminating one can be a NUL (some - * implementations may include a non-NUL separator weight just before - * the NUL). Implementations tend to reserve 01 for the separator - * weights. They are needed so that a shorter string's secondary - * weights won't be misconstrued as primary weights of a longer string, - * etc. By making them smaller than any other weight, the shorter - * string will sort first. (Actually, if all secondary weights are - * smaller than all primary ones, there is no need for a separator - * weight between those two levels, etc.) - * - * The length of the transformed string is roughly a linear function of - * the input string. It's not exactly linear because some characters - * don't have weights at all levels. When we call strxfrm() we have to - * allocate some memory to hold the transformed string. The - * calculations below try to find coefficients 'm' and 'b' for this - * locale so that m*x + b equals how much space we need, given the size - * of the input string in 'x'. If we calculate too small, we increase - * the size as needed, and call strxfrm() again, but it is better to - * get it right the first time to avoid wasted expensive string - * transformations. */ +} - { - /* We use the string below to find how long the tranformation of it - * is. Almost all locales are supersets of ASCII, or at least the - * ASCII letters. We use all of them, half upper half lower, - * because if we used fewer, we might hit just the ones that are - * outliers in a particular locale. Most of the strings being - * collated will contain a preponderance of letters, and even if - * they are above-ASCII, they are likely to have the same number of - * weight levels as the ASCII ones. It turns out that digits tend - * to have fewer levels, and some punctuation has more, but those - * are relatively sparse in text, and khw believes this gives a - * reasonable result, but it could be changed if experience so - * dictates. */ - const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz"; - char * x_longer; /* Transformed 'longer' */ - Size_t x_len_longer; /* Length of 'x_longer' */ - - char * x_shorter; /* We also transform a substring of 'longer' */ - Size_t x_len_shorter; - - /* _mem_collxfrm() is used get the transformation (though here we - * are interested only in its length). It is used because it has - * the intelligence to handle all cases, but to work, it needs some - * values of 'm' and 'b' to get it started. For the purposes of - * this calculation we use a very conservative estimate of 'm' and - * 'b'. This assumes a weight can be multiple bytes, enough to - * hold any UV on the platform, and there are 5 levels, 4 weight - * bytes, and a trailing NUL. */ - PL_collxfrm_base = 5; - PL_collxfrm_mult = 5 * sizeof(UV); - - /* Find out how long the transformation really is */ - x_longer = _mem_collxfrm(longer, - sizeof(longer) - 1, - &x_len_longer, - - /* We avoid converting to UTF-8 in the - * called function by telling it the - * string is in UTF-8 if the locale is a - * UTF-8 one. Since the string passed - * here is invariant under UTF-8, we can - * claim it's UTF-8 even though it isn't. - * */ - PL_in_utf8_COLLATE_locale); - Safefree(x_longer); - - /* Find out how long the transformation of a substring of 'longer' - * is. Together the lengths of these transformations are - * sufficient to calculate 'm' and 'b'. The substring is all of - * 'longer' except the first character. This minimizes the chances - * of being swayed by outliers */ - x_shorter = _mem_collxfrm(longer + 1, - sizeof(longer) - 2, - &x_len_shorter, - PL_in_utf8_COLLATE_locale); - Safefree(x_shorter); - - /* If the results are nonsensical for this simple test, the whole - * locale definition is suspect. Mark it so that locale collation - * is not active at all for it. XXX Should we warn? */ - if ( x_len_shorter == 0 - || x_len_longer == 0 - || x_len_shorter >= x_len_longer) - { - PL_collxfrm_mult = 0; - PL_collxfrm_base = 0; - } - else { - SSize_t base; /* Temporary */ - - /* We have both: m * strlen(longer) + b = x_len_longer - * m * strlen(shorter) + b = x_len_shorter; - * subtracting yields: - * m * (strlen(longer) - strlen(shorter)) - * = x_len_longer - x_len_shorter - * But we have set things up so that 'shorter' is 1 byte smaller - * than 'longer'. Hence: - * m = x_len_longer - x_len_shorter - * - * But if something went wrong, make sure the multiplier is at - * least 1. - */ - if (x_len_longer > x_len_shorter) { - PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; - } - else { - PL_collxfrm_mult = 1; - } +# endif /* USE_LOCALE_COLLATE */ +#endif /* USE_LOCALE */ - /* mx + b = len - * so: b = len - mx - * but in case something has gone wrong, make sure it is - * non-negative */ - base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); - if (base < 0) { - base = 0; - } +#ifdef WIN32 - /* Add 1 for the trailing NUL */ - PL_collxfrm_base = base + 1; - } +wchar_t * +S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string) +{ + /* Caller must arrange to free the returned string */ -# ifdef DEBUGGING + int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0); + if (! req_size) { + SET_EINVAL; + return NULL; + } - if (DEBUG_L_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, " - "x_len_longer=%zu," - " collate multipler=%zu, collate base=%zu\n", - __FILE__, __LINE__, - PL_in_utf8_COLLATE_locale, - x_len_shorter, x_len_longer, - PL_collxfrm_mult, PL_collxfrm_base); - } -# endif + wchar_t *wstring; + Newx(wstring, req_size, wchar_t); - } + if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size)) + { + Safefree(wstring); + SET_EINVAL; + return NULL; } -#endif /* USE_LOCALE_COLLATE */ - + return wstring; } -#endif +#define Win_utf8_string_to_wstring(s) Win_byte_string_to_wstring(CP_UTF8, (s)) -#ifdef WIN32 +char * +S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring) +{ + /* Caller must arrange to free the returned string */ -#define USE_WSETLOCALE + int req_size = + WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL); -#ifdef USE_WSETLOCALE + char *byte_string; + Newx(byte_string, req_size, char); -STATIC char * -S_wrap_wsetlocale(pTHX_ int category, const char *locale) { - wchar_t *wlocale; - wchar_t *wresult; - char *result; + if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string, + req_size, NULL, NULL)) + { + Safefree(byte_string); + SET_EINVAL; + return NULL; + } - if (locale) { - int req_size = - MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0); + return byte_string; +} - if (!req_size) { - errno = EINVAL; - return NULL; - } +#define Win_wstring_to_utf8_string(ws) Win_wstring_to_byte_string(CP_UTF8, (ws)) + +STATIC const char * +S_wrap_wsetlocale(pTHX_ const int category, const char *locale) +{ + PERL_ARGS_ASSERT_WRAP_WSETLOCALE; + + /* Calls _wsetlocale(), converting the parameters/return to/from + * Perl-expected forms as if plain setlocale() were being called instead. + */ + + const wchar_t * wlocale = NULL; - Newx(wlocale, req_size, wchar_t); - if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) { - Safefree(wlocale); - errno = EINVAL; + if (locale) { + wlocale = Win_utf8_string_to_wstring(locale); + if (! wlocale) { return NULL; } } - else { - wlocale = NULL; - } - wresult = _wsetlocale(category, wlocale); + + WSETLOCALE_LOCK; + const wchar_t * wresult = _wsetlocale(category, wlocale); Safefree(wlocale); - if (wresult) { - int req_size = - WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL); - Newx(result, req_size, char); - SAVEFREEPV(result); /* is there something better we can do here? */ - if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1, - result, req_size, NULL, NULL)) { - errno = EINVAL; - return NULL; - } - } - else { - result = NULL; + + if (! wresult) { + WSETLOCALE_UNLOCK; + return NULL; } + const char * result = Win_wstring_to_utf8_string(wresult); + WSETLOCALE_UNLOCK; + + SAVEFREEPV(result); /* is there something better we can do here? Answer: + Without restructuring, returning a unique value each + call is required. See GH #20434 */ return result; } -#endif - -STATIC char * +STATIC const char * S_win32_setlocale(pTHX_ int category, const char* locale) { /* This, for Windows, emulates POSIX setlocale() behavior. There is no @@ -2060,90 +2685,32 @@ S_win32_setlocale(pTHX_ int category, const char* locale) * use the particular category's variable if set; otherwise to use the LANG * variable. */ - bool override_LC_ALL = FALSE; - char * result; - unsigned int i; - - if (locale && strEQ(locale, "")) { - -# ifdef LC_ALL - - locale = PerlEnv_getenv("LC_ALL"); - if (! locale) { - if (category == LC_ALL) { - override_LC_ALL = TRUE; - } - else { - -# endif - - for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { - if (category == categories[i]) { - locale = PerlEnv_getenv(category_names[i]); - goto found_locale; - } - } - - locale = PerlEnv_getenv("LANG"); - if (! locale) { - locale = ""; - } - - found_locale: ; + if (locale == NULL) { + return wrap_wsetlocale(category, NULL); + } -# ifdef LC_ALL + if (strEQ(locale, "")) { + /* Note this function may change the locale, but that's ok because we + * are about to change it anyway */ + locale = find_locale_from_environment(get_category_index(category, "")); + } - } - } + const char * result = wrap_wsetlocale(category, locale); + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", + setlocale_debug_string_r(category, locale, result))); -# endif +# ifdef USE_PL_CUR_LC_ALL + /* If we need to keep track of LC_ALL, update it to the new value. */ + Safefree(PL_cur_LC_ALL); + if (category == LC_ALL) { + PL_cur_LC_ALL = savepv(result); } - -#ifdef USE_WSETLOCALE - result = S_wrap_wsetlocale(aTHX_ category, locale); -#else - result = setlocale(category, locale); -#endif - DEBUG_L(STMT_START { - dSAVE_ERRNO; - PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(category, locale, result)); - RESTORE_ERRNO; - } STMT_END); - - if (! override_LC_ALL) { - return result; - } - - /* Here the input category was LC_ALL, and we have set it to what is in the - * LANG variable or the system default if there is no LANG. But these have - * lower priority than the other LC_foo variables, so override it for each - * one that is set. (If they are set to "", it means to use the same thing - * we just set LC_ALL to, so can skip) */ - - for (i = 0; i < LC_ALL_INDEX; i++) { - result = PerlEnv_getenv(category_names[i]); - if (result && strNE(result, "")) { -#ifdef USE_WSETLOCALE - S_wrap_wsetlocale(aTHX_ categories[i], result); -#else - setlocale(categories[i], result); -#endif - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(categories[i], result, "not captured"))); - } + else { + PL_cur_LC_ALL = savepv(wrap_wsetlocale(LC_ALL, NULL)); } - result = setlocale(LC_ALL, NULL); - DEBUG_L(STMT_START { - dSAVE_ERRNO; - PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", - __FILE__, __LINE__, - setlocale_debug_string(LC_ALL, NULL, result)); - RESTORE_ERRNO; - } STMT_END); +# endif return result; } @@ -2173,11 +2740,16 @@ Finally, C<Perl_setlocale> works under all circumstances, whereas plain C<setlocale> can be completely ineffective on some platforms under some configurations. -C<Perl_setlocale> should not be used to change the locale except on systems -where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems, -the system C<setlocale()> is ineffective, returning the wrong information, and -failing to actually change the locale. C<Perl_setlocale>, however works -properly in all circumstances. +Changing the locale is not a good idea when more than one thread is running, +except on systems where the predefined variable C<${^SAFE_LOCALES}> is 1. +This is because on such systems the locale is global to the whole process and +not local to just the thread calling the function. So changing it in one +thread instantaneously changes it in all. On some such systems, the system +C<setlocale()> is ineffective, returning the wrong information, and failing to +actually change the locale. z/OS refuses to try to change the locale once a +second thread is created. C<Perl_setlocale>, should give you accurate results +of what actually happened on these problematic platforms, returning NULL if the +system forbade the locale change. The return points to a per-thread static buffer, which is overwritten the next time C<Perl_setlocale> is called from the same thread. @@ -2186,6 +2758,14 @@ time C<Perl_setlocale> is called from the same thread. */ +#ifndef USE_LOCALE_NUMERIC +# define affects_LC_NUMERIC(cat) 0 +#elif defined(LC_ALL) +# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL) +#else +# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC) +#endif + const char * Perl_setlocale(const int category, const char * locale) { @@ -2201,137 +2781,302 @@ Perl_setlocale(const int category, const char * locale) #else const char * retval; - const char * newlocale; - dSAVEDERRNO; dTHX; - DECLARATION_FOR_LC_NUMERIC_MANIPULATION; -#ifdef USE_LOCALE_NUMERIC + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Entering Perl_setlocale(%d, \"%s\")\n", + category, locale)); - /* A NULL locale means only query what the current one is. We have the - * LC_NUMERIC name saved, because we are normally switched into the C - * (or equivalent) locale for it. For an LC_ALL query, switch back to get - * the correct results. All other categories don't require special - * handling */ + /* A NULL locale means only query what the current one is. */ if (locale == NULL) { + +# ifndef USE_LOCALE_NUMERIC + + /* Without LC_NUMERIC, it's trivial; we just return the value */ + return save_to_buffer(querylocale_r(category), + &PL_setlocale_buf, &PL_setlocale_bufsize); +# else + + /* We have the LC_NUMERIC name saved, because we are normally switched + * into the C locale (or equivalent) for it. */ if (category == LC_NUMERIC) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n", + PL_numeric_name)); /* We don't have to copy this return value, as it is a per-thread * variable, and won't change until a future setlocale */ return PL_numeric_name; } -# ifdef LC_ALL +# ifndef LC_ALL - else if (category == LC_ALL) { - STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); + /* Without LC_ALL, just return the value */ + return save_to_buffer(querylocale_r(category), + &PL_setlocale_buf, &PL_setlocale_bufsize); + +# else + + /* Here, LC_ALL is available on this platform. It's the one + * complicating category (because it can contain a toggled LC_NUMERIC + * value), for all the remaining ones (we took care of LC_NUMERIC + * above), just return the value */ + if (category != LC_ALL) { + return save_to_buffer(querylocale_r(category), + &PL_setlocale_buf, &PL_setlocale_bufsize); } + bool toggled = FALSE; + + /* For an LC_ALL query, switch back to the underlying numeric locale + * (if we aren't there already) so as to get the correct results. Our + * records for all the other categories are valid without switching */ + if (! PL_numeric_underlying) { + set_numeric_underlying(); + toggled = TRUE; + } + + retval = querylocale_c(LC_ALL); + + if (toggled) { + set_numeric_standard(); + } + + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", + setlocale_debug_string_r(category, locale, retval))); + + return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize); + +# endif /* Has LC_ALL */ +# endif /* Has LC_NUMERIC */ + + } /* End of querying the current locale */ + + + unsigned int cat_index = get_category_index(category, NULL); + retval = querylocale_i(cat_index); + + /* If the new locale is the same as the current one, nothing is actually + * being changed, so do nothing. */ + if ( strEQ(retval, locale) + && ( ! affects_LC_NUMERIC(category) + +# ifdef USE_LOCALE_NUMERIC + + || strEQ(locale, PL_numeric_name) + # endif + )) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Already in requested locale: no action taken\n")); + return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize); } -#endif + /* Here, an actual change is being requested. Do it */ + retval = setlocale_i(cat_index, locale); - retval = save_to_buffer(do_setlocale_r(category, locale), - &PL_setlocale_buf, &PL_setlocale_bufsize, 0); - SAVE_ERRNO; + if (! retval) { + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", + setlocale_debug_string_i(cat_index, locale, "NULL"))); + return NULL; + } -#if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL) + assert(strNE(retval, "")); + retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize); - if (locale == NULL && category == LC_ALL) { - RESTORE_LC_NUMERIC(); + /* Now that have changed locales, we have to update our records to + * correspond. Only certain categories have extra work to update. */ + if (update_functions[cat_index]) { + update_functions[cat_index](aTHX_ retval, false); } + DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval)); + + return retval; + #endif - DEBUG_L(PerlIO_printf(Perl_debug_log, - "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(category, locale, retval))); +} - RESTORE_ERRNO; +STATIC utf8ness_t +S_get_locale_string_utf8ness_i(pTHX_ const char * string, + const locale_utf8ness_t known_utf8, + const char * locale, + const unsigned cat_index) +{ + PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I; - if (! retval) { - return NULL; +#ifndef USE_LOCALE + + return UTF8NESS_NO; + PERL_UNUSED_ARG(string); + PERL_UNUSED_ARG(known_utf8); + PERL_UNUSED_ARG(locale); + PERL_UNUSED_ARG(cat_index); + +#else + + assert(cat_index <= NOMINAL_LC_ALL_INDEX); + + /* Return to indicate if 'string' in the locale given by the input + * arguments should be considered UTF-8 or not. + * + * If the input 'locale' is not NULL, use that for the locale; otherwise + * use the current locale for the category specified by 'cat_index'. + */ + + if (string == NULL) { + return UTF8NESS_NO; } - /* If locale == NULL, we are just querying the state */ - if (locale == NULL) { - return retval; + if (IN_BYTES) { /* respect 'use bytes' */ + return UTF8NESS_NO; } - /* Now that have switched locales, we have to update our records to - * correspond. */ + Size_t len = strlen(string); - switch (category) { + /* UTF8ness is immaterial if the representation doesn't vary */ + const U8 * first_variant = NULL; + if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) { + return UTF8NESS_IMMATERIAL; + } -#ifdef USE_LOCALE_CTYPE + /* Can't be UTF-8 if invalid */ + if (! is_utf8_string((U8 *) first_variant, + len - ((char *) first_variant - string))) + { + return UTF8NESS_NO; + } - case LC_CTYPE: - new_ctype(retval); - break; + /* Here and below, we know the string is legal UTF-8, containing at least + * one character requiring a sequence of two or more bytes. It is quite + * likely to be UTF-8. But it pays to be paranoid and do further checking. + * + * If we already know the UTF-8ness of the locale, then we immediately know + * what the string is */ + if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) { + if (known_utf8 == LOCALE_IS_UTF8) { + return UTF8NESS_YES; + } + else { + return UTF8NESS_NO; + } + } -#endif -#ifdef USE_LOCALE_COLLATE +# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION - case LC_COLLATE: - new_collate(retval); - break; + /* Here, we have available the libc functions that can be used to + * accurately determine the UTF8ness of the underlying locale. If it is a + * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that + * the string is legal UTF-8 + * + * However, if the perl is compiled to not pay attention to the category + * being passed in, you might think that that locale is essentially always + * the C locale, so it would make sense to say it isn't UTF-8. But to get + * here, the string has to contain characters unknown in the C locale. And + * in fact, Windows boxes are compiled without LC_MESSAGES, as their + * message catalog isn't really a part of the locale system. But those + * messages really could be UTF-8, and given that the odds are rather small + * of something not being UTF-8 but being syntactically valid UTF-8, khw + * has decided to call such strings as UTF-8. */ -#endif -#ifdef USE_LOCALE_NUMERIC + if (locale == NULL) { + locale = querylocale_i(cat_index); + } + if (is_locale_utf8(locale)) { + return UTF8NESS_YES; + } - case LC_NUMERIC: - new_numeric(retval); - break; + return UTF8NESS_NO; -#endif -#ifdef LC_ALL +# else - case LC_ALL: + /* Here, we have a valid UTF-8 string containing non-ASCII characters, and + * don't have access to functions to check if the locale is UTF-8 or not. + * Assume that it is. khw tried adding a check that the string is entirely + * in a single Unicode script, but discovered the strftime() timezone is + * user-settable through the environment, which may be in a different + * script than the locale-expected value. */ + PERL_UNUSED_ARG(locale); + PERL_UNUSED_ARG(cat_index); - /* LC_ALL updates all the things we care about. The values may not - * be the same as 'retval', as the locale "" may have set things - * individually */ + return UTF8NESS_YES; -# ifdef USE_LOCALE_CTYPE +# endif +#endif - newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); - new_ctype(newlocale); - Safefree(newlocale); +} -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE +STATIC bool +S_is_locale_utf8(pTHX_ const char * locale) +{ + /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses + * my_langinfo(), which employs various methods to get this information + * if nl_langinfo() isn't available, using heuristics as a last resort, in + * which case, the result will very likely be correct for locales for + * languages that have commonly used non-ASCII characters, but for notably + * English, it comes down to if the locale's name ends in something like + * "UTF-8". It errs on the side of not being a UTF-8 locale. */ - newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); - new_collate(newlocale); - Safefree(newlocale); +# if ! defined(USE_LOCALE) \ + || ! defined(USE_LOCALE_CTYPE) \ + || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */ -# endif -# ifdef USE_LOCALE_NUMERIC + PERL_UNUSED_ARG(locale); - newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); - new_numeric(newlocale); - Safefree(newlocale); + return FALSE; -# endif /* USE_LOCALE_NUMERIC */ -#endif /* LC_ALL */ +# else - default: - break; + const char * scratch_buffer = NULL; + const char * codeset; + bool retval; + + PERL_ARGS_ASSERT_IS_LOCALE_UTF8; + + if (strEQ(locale, PL_ctype_name)) { + return PL_in_utf8_CTYPE_locale; } + codeset = my_langinfo_c(CODESET, LC_CTYPE, locale, + &scratch_buffer, NULL, NULL); + retval = is_codeset_name_UTF8(codeset); + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "found codeset=%s, is_utf8=%d\n", codeset, retval)); + + Safefree(scratch_buffer); return retval; -#endif +# endif } -PERL_STATIC_INLINE const char * -S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset) +#ifdef USE_LOCALE + +STATIC const char * +S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size) { - /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size', - * growing it if necessary */ + /* Copy the NUL-terminated 'string' to a buffer whose address before this + * call began at *buf, and whose available length before this call was + * *buf_size. + * + * If the length of 'string' is greater than the space available, the + * buffer is grown accordingly, which may mean that it gets relocated. + * *buf and *buf_size will be updated to reflect this. + * + * Regardless, the function returns a pointer to where 'string' is now + * stored. + * + * 'string' may be NULL, which means no action gets taken, and NULL is + * returned. + * + * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed + * empty, and memory is malloc'd. 'buf-size' being NULL is to be used + * when this is a single use buffer, which will shortly be freed by the + * caller. + */ Size_t string_size; @@ -2341,9 +3086,17 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t return NULL; } - string_size = strlen(string) + offset + 1; + /* No-op to copy over oneself */ + if (string == *buf) { + return string; + } + + string_size = strlen(string) + 1; - if (*buf_size == 0) { + if (buf_size == NULL) { + Newx(*buf, string_size, char); + } + else if (*buf_size == 0) { Newx(*buf, string_size, char); *buf_size = string_size; } @@ -2352,36 +3105,787 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t *buf_size = string_size; } - Copy(string, *buf + offset, string_size - offset, char); + { + dTHX_DEBUGGING; + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "Copying '%s' to %p\n", + ((is_utf8_string((U8 *) string, 0)) + ? string + :_byte_dump_string((U8 *) string, strlen(string), 0)), + *buf)); + } + +# ifdef DEBUGGING + + /* Catch glitches. Usually this is because LC_CTYPE needs to be the same + * locale as whatever is being worked on */ + if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) { + dTHX_DEBUGGING; + + locale_panic_(Perl_form(aTHX_ + "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s", + string, get_LC_ALL_display())); + } + +# endif + + Copy(string, *buf, string_size, char); return *buf; } +# ifdef WIN32 + +bool +Perl_get_win32_message_utf8ness(pTHX_ const char * string) +{ + /* NULL => locale irrelevant, 0 => category irrelevant + * so returns based on the UTF-8 legality of the input string, ignoring the + * locale and category completely. + * + * This is because Windows doesn't have LC_MESSAGES */ + return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8, NULL, 0); +} + +# endif +#endif /* USE_LOCALE */ + + +int +Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len) +{ + +#if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC) + + PERL_UNUSED_ARG(pwc); + PERL_UNUSED_ARG(s); + PERL_UNUSED_ARG(len); + return -1; + +#else /* Below we have some form of mbtowc() */ +# if defined(HAS_MBRTOWC) \ + && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC)) +# define USE_MBRTOWC +# else +# undef USE_MBRTOWC +# endif + + int retval = -1; + + if (s == NULL) { /* Initialize the shift state to all zeros in + PL_mbrtowc_ps. */ + +# if defined(USE_MBRTOWC) + + memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps)); + return 0; + +# else + + MBTOWC_LOCK_; + SETERRNO(0, 0); + retval = mbtowc(NULL, NULL, 0); + MBTOWC_UNLOCK_; + return retval; + +# endif + + } + +# if defined(USE_MBRTOWC) + + SETERRNO(0, 0); + retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps); + +# else + + /* Locking prevents races, but locales can be switched out without locking, + * so this isn't a cure all */ + MBTOWC_LOCK_; + SETERRNO(0, 0); + retval = mbtowc((wchar_t *) pwc, s, len); + MBTOWC_UNLOCK_; + +# endif + + return retval; + +#endif + +} + /* +=for apidoc Perl_localeconv -=for apidoc Perl_langinfo +This is a thread-safe version of the libc L<localeconv(3)>. It is the same as +L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()> +fields), but directly callable from XS code. -This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>, -taking the same C<item> parameter values, and returning the same information. -But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks -of Perl's locale handling from your code, and can be used on systems that lack -a native C<nl_langinfo>. +=cut +*/ -Expanding on these: +HV * +Perl_localeconv(pTHX) +{ + +#if ! defined(HAS_LOCALECONV) + + return newHV(); + +#else + + return my_localeconv(0); + +#endif + +} + +#if defined(HAS_LOCALECONV) + +HV * +S_my_localeconv(pTHX_ const int item) +{ + PERL_ARGS_ASSERT_MY_LOCALECONV; + + /* This returns a mortalized hash containing all or one of the elements + * returned by localeconv(). It is used by Perl_localeconv() and + * POSIX::localeconv() and is thread-safe. + * + * There are two use cases: + * 1) Called from POSIX::locale_conv(). This returns the lconv structure + * copied to a hash, based on the current underlying locales for + * LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or + * on many platforms it is the only use case compiled. + * 2) Certain items that nl_langinfo() provides are also derivable from + * the return of localeconv(). Windows notably doesn't have + * nl_langinfo(), so on that, and actually any platform lacking it, + * my_localeconv() is used also to emulate it for those particular + * items. The code to do this is compiled only on such platforms. + * Rather than going to the expense of creating a full hash when only + * one item is needed, the returned hash has just the desired item in + * it. + * + * To access all the localeconv() struct lconv fields, there is a data + * structure that contains every commonly documented field in it. (Maybe + * some minority platforms have extra fields. Those could be added here + * without harm; they would just be ignored on platforms lacking them.) + * + * Our structure is compiled to make looping through the fields easier by + * pointing each name to its value's offset within lconv, e.g., + { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) } + */ +# define LCONV_ENTRY(name) \ + {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)} + + /* These synonyms are just for clarity, and to make it easier in case + * something needs to change in the future */ +# define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name) +# define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name) + + /* There are just a few fields for NUMERIC strings */ + const lconv_offset_t lconv_numeric_strings[] = { +# ifndef NO_LOCALECONV_GROUPING + LCONV_NUMERIC_ENTRY(grouping), +# endif + LCONV_NUMERIC_ENTRY(thousands_sep), + LCONV_NUMERIC_ENTRY(decimal_point), + {NULL, 0} + }; + + /* When used to implement nl_langinfo(), we save time by only populating + * the hash with the field(s) needed. Thus we would need a data structure + * of just: + * LCONV_NUMERIC_ENTRY(decimal_point), + * {NULL, 0} + * + * By placing the decimal_point field last in the full structure, we can + * use just the tail for this bit of it, saving space. This macro yields + * the address of the sub structure. */ +# define DECIMAL_POINT_ADDRESS \ + &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)] + + /* And the MONETARY string fields */ + const lconv_offset_t lconv_monetary_strings[] = { + LCONV_MONETARY_ENTRY(int_curr_symbol), + LCONV_MONETARY_ENTRY(mon_decimal_point), +# ifndef NO_LOCALECONV_MON_THOUSANDS_SEP + LCONV_MONETARY_ENTRY(mon_thousands_sep), +# endif +# ifndef NO_LOCALECONV_MON_GROUPING + LCONV_MONETARY_ENTRY(mon_grouping), +# endif + LCONV_MONETARY_ENTRY(positive_sign), + LCONV_MONETARY_ENTRY(negative_sign), + LCONV_MONETARY_ENTRY(currency_symbol), + {NULL, 0} + }; + + /* Like above, this field being last can be used as a sub structure */ +# define CURRENCY_SYMBOL_ADDRESS \ + &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)] + + /* Finally there are integer fields, all are for monetary purposes */ + const lconv_offset_t lconv_integers[] = { + LCONV_ENTRY(int_frac_digits), + LCONV_ENTRY(frac_digits), + LCONV_ENTRY(p_sep_by_space), + LCONV_ENTRY(n_cs_precedes), + LCONV_ENTRY(n_sep_by_space), + LCONV_ENTRY(p_sign_posn), + LCONV_ENTRY(n_sign_posn), +# ifdef HAS_LC_MONETARY_2008 + LCONV_ENTRY(int_p_cs_precedes), + LCONV_ENTRY(int_p_sep_by_space), + LCONV_ENTRY(int_n_cs_precedes), + LCONV_ENTRY(int_n_sep_by_space), + LCONV_ENTRY(int_p_sign_posn), + LCONV_ENTRY(int_n_sign_posn), +# endif + LCONV_ENTRY(p_cs_precedes), + {NULL, 0} + }; + + /* Like above, this field being last can be used as a sub structure */ +# define P_CS_PRECEDES_ADDRESS \ + &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)] + + /* If we aren't paying attention to a given category, use LC_CTYPE instead; + * If not paying attention to that either, the code below should end up not + * using this. Make sure that things blow up if that avoidance gets lost, + * by setting the category to -1 */ + unsigned int numeric_index; + unsigned int monetary_index; + +# ifdef USE_LOCALE_NUMERIC + numeric_index = LC_NUMERIC_INDEX_; +# elif defined(USE_LOCALE_CTYPE) + numeric_index = LC_CTYPE_INDEX_; +# else + numeric_index = (unsigned) -1; +# endif +# ifdef USE_LOCALE_MONETARY + monetary_index = LC_MONETARY_INDEX_; +# elif defined(USE_LOCALE_CTYPE) + monetary_index = LC_CTYPE_INDEX_; +# else + monetary_index = (unsigned) -1; +# endif + + /* Some platforms, for correct non-mojibake results, require LC_CTYPE's + * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's + * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY + * aren't compatible? Wrong results. To avoid that, we call localeconv() + * twice, once for each locale, setting LC_CTYPE to match the category. + * But if the locales of both categories are the same, there is no need for + * a second call. Assume this is the case unless overridden below */ + bool requires_2nd_localeconv = false; + + /* The actual hash populating is done by S_populate_hash_from_localeconv(). + * It gets passed an array of length two containing the data structure it + * is supposed to use to get the key names to fill the hash with. One + * element is alwasy for the NUMERIC strings (or NULL if none to use), and + * the other element similarly for the MONETARY ones. */ +# define NUMERIC_STRING_OFFSET 0 +# define MONETARY_STRING_OFFSET 1 + const lconv_offset_t * strings[2] = { NULL, NULL }; + + /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to + * populate the NUMERIC items; another bit for the MONETARY ones. This way + * it can choose which (or both) to populate from */ + U32 index_bits = 0; + + /* This converts from a locale index to its bit position in the above mask. + * */ +# define INDEX_TO_BIT(i) (1 << (i)) + + /* The two categories can have disparate locales. Initialize them to C and + * override later whichever one(s) we pay attention to */ + const char * numeric_locale = "C"; + const char * monetary_locale = "C"; + + /* This will be either 'numeric_locale' or 'monetary_locale' depending on + * what we are working on at the moment */ + const char * locale; + + /* The LC_MONETARY category also has some integer-valued fields, whose + * information is kept in a separate list */ + const lconv_offset_t * integers; + +# ifdef HAS_SOME_LANGINFO + + /* If the only use-case for this is the full localeconv(), the 'item' + * parameter is ignored. */ + PERL_UNUSED_ARG(item); + +# else + + /* This only gets compiled for the use-case of using localeconv() to + * emulate an nl_langinfo() missing from the platform. + * + * We need this substructure to only return this field for the THOUSEP + * item. The other items also need substructures, but they were handled + * above by placing the substructure's item at the end of the full one, so + * the data structure could do double duty. However, both this and + * RADIXCHAR would need to be in the final position of the same full + * structure; an impossibility. So make this into a separate structure */ + const lconv_offset_t thousands_sep_string[] = { + LCONV_NUMERIC_ENTRY(thousands_sep), + {NULL, 0} + }; + + /* End of all the initialization of datastructures. Now for actual code. + * + * Without nl_langinfo(), the call to my_localeconv() could be for just one + * of the following 3 items to emulate nl_langinfo(). This is compiled + * only when using perl_langinfo.h, which we control, and it has been + * constructed so that no item is numbered 0. + * + * For each, setup the appropriate parameters for the call below to + * S_populate_hash_from_localeconv() */ + if (item != 0) switch (item) { + default: + locale_panic_(Perl_form(aTHX_ + "Unexpected item passed to my_localeconv: %d", item)); + break; + +# ifdef USE_LOCALE_NUMERIC + + case RADIXCHAR: + locale = numeric_locale = PL_numeric_name; + index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_); + strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS; + integers = NULL; + break; + + case THOUSEP: + index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_); + locale = numeric_locale = PL_numeric_name; + strings[NUMERIC_STRING_OFFSET] = thousands_sep_string; + integers = NULL; + break; + +# endif +# ifdef USE_LOCALE_MONETARY + + case CRNCYSTR: + index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_); + locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_); + + /* This item needs the values for both the currency symbol, and another + * one used to construct the nl_langino()-compatible return */ + strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS; + integers = P_CS_PRECEDES_ADDRESS; + break; + +# endif + + } /* End of switch() */ + + else /* End of for just one item to emulate nl_langinfo() */ + +# endif + + { /* Here, the call is for all of localeconv(). It has a bunch of + * items. As in the individual item case, set up the parameters for + * S_populate_hash_from_localeconv(); */ + +# ifdef USE_LOCALE_NUMERIC + numeric_locale = PL_numeric_name; +# elif defined(USE_LOCALE_CTYPE) + numeric_locale = querylocale_i(numeric_index); +# endif +# if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE) + monetary_locale = querylocale_i(monetary_index); +# endif + + /* The first call to S_populate_hash_from_localeconv() will be for the + * MONETARY values */ + index_bits = INDEX_TO_BIT(monetary_index); + locale = monetary_locale; + + /* And if the locales for the two categories are the same, we can also + * do the NUMERIC values in the same call */ + if (strEQ(numeric_locale, monetary_locale)) { + index_bits |= INDEX_TO_BIT(numeric_index); + } + else { + requires_2nd_localeconv = true; + } + + /* We always pass both sets of strings. 'index_bits' tells + * S_populate_hash_from_localeconv which to actually look at */ + strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings; + strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings; + + /* And pass the integer values to populate; again 'index_bits' will + * say to use them or not */ + integers = lconv_integers; + + } /* End of call is for localeconv() */ + + /* The code above has determined the parameters to + S_populate_hash_from_localeconv() for both cases of an individual item + and for the entire structure. Below is code common to both */ + + HV * hv = newHV(); /* The returned hash, initially empty */ + sv_2mortal((SV*)hv); + + /* Call localeconv() and copy its results into the hash. All the + * parameters have been initialized above */ + populate_hash_from_localeconv(hv, + locale, + index_bits, + strings, + integers + ); + + /* The above call may have done all the hash fields, but not always, as + * already explained. If we need a second call it is always for the + * NUMERIC fields */ + if (requires_2nd_localeconv) { + populate_hash_from_localeconv(hv, + numeric_locale, + INDEX_TO_BIT(numeric_index), + strings, + NULL /* There are No NUMERIC integer + fields */ + ); + } + + /* Here, the hash has been completely populated. + * + * Now go through all the items and: + * a) For string items, see if they should be marked as UTF-8 or not. + * This would have been more convenient and faster to do while + * populating the hash in the first place, but that operation has to be + * done within a critical section, keeping other threads from + * executing, so only the minimal amount of work necessary is done at + * that time. + * b) For integer items, convert the C CHAR_MAX value into -1. Again, + * this could have been done in the critical section, but was deferred + * to here to keep to the bare minimum amount the time spent owning the + * processor. CHAR_MAX is a C concept for an 8-bit character type. + * Perl has no such type; the closest fit is a -1. + * + * XXX On unthreaded perls, this code could be #ifdef'd out, and the + * corrections determined at hash population time, at an extra maintenance + * cost which khw doesn't think is worth it + */ + for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */ + if (! strings[i]) { /* Skip if no strings of this type */ + continue; + } + + locale = (i == NUMERIC_STRING_OFFSET) + ? numeric_locale + : monetary_locale; + + locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN; + +# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION + + /* It saves time in the loop below to have predetermined the UTF8ness + * of the locale. But only do so if the platform reliably has this + * information; otherwise to do it, this could recurse indefinitely. + * + * When we don't do it here, it will be done on a per-element basis in + * the loop. The per-element check is intelligent enough to not + * recurse */ + + locale_is_utf8 = (is_locale_utf8(locale)) + ? LOCALE_IS_UTF8 + : LOCALE_NOT_UTF8; + + if (locale_is_utf8 == LOCALE_NOT_UTF8) { + continue; /* No string can be UTF-8 if the locale isn't */ + } + +# endif + + /* Examine each string */ + while (1) { + const char * name = strings[i]->name; + + if (! name) { /* Reached the end */ + break; + } + + /* 'value' will contain the string that may need to be marked as + * UTF-8 */ + SV ** value = hv_fetch(hv, name, strlen(name), true); + if (! value) { + continue; + } + + /* Determine if the string should be marked as UTF-8. */ + if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value), + locale_is_utf8, + NULL, 0))) + { + SvUTF8_on(*value); + } + + strings[i]++; /* Iterate */ + } + } /* End of fixing up UTF8ness */ + + + /* Examine each integer */ + if (integers) while (1) { + const char * name = integers->name; + + if (! name) { /* Reached the end */ + break; + } + + SV ** value = hv_fetch(hv, name, strlen(name), true); + if (! value) { + continue; + } + + /* Change CHAR_MAX to -1 */ + if (SvIV(*value) == CHAR_MAX) { + sv_setiv(*value, -1); + } + + integers++; /* Iterate */ + } + + return hv; +} + +STATIC void +S_populate_hash_from_localeconv(pTHX_ HV * hv, + + /* Switch to this locale to run + * localeconv() from */ + const char * locale, + + /* bit mask of which categories to + * populate */ + const U32 which_mask, + + /* strings[0] points the numeric + * string fields; [1] to the monetary */ + const lconv_offset_t * strings[2], + + /* And to the monetary integer fields */ + const lconv_offset_t * integers) +{ + PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV; + PERL_UNUSED_ARG(which_mask); /* Some configurations don't use this; + complicated to figure out which */ + + /* Run localeconv() and copy some or all of its results to the input 'hv' + * hash. Most localeconv() implementations return the values in a global + * static buffer, so the operation must be performed in a critical section, + * ending only after the copy is completed. There are so many locks + * because localeconv() deals with two categories, and returns in a single + * global static buffer. Some locks might be no-ops on this platform, but + * not others. We need to lock if any one isn't a no-op. */ + +# ifdef USE_LOCALE_CTYPE + + /* Some platforms require LC_CTYPE to be congruent with the category we are + * looking for */ + const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); + +# endif +# ifdef USE_LOCALE_NUMERIC + + /* We need to toggle to the underlying NUMERIC locale if we are getting + * NUMERIC strings */ + const char * orig_NUMERIC_locale = NULL; + if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) { + LC_NUMERIC_LOCK(0); + orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale); + } + +# endif + + /* Finally ready to do the actual localeconv(). Lock to prevent other + * accesses until we have made a copy of its returned static buffer */ + gwLOCALE_LOCK; + +# ifdef TS_W32_BROKEN_LOCALECONV + + /* This is a workaround for another bug in Windows. localeconv() was + * broken with thread-safe locales prior to VS 15. It looks at the global + * locale instead of the thread one. As a work-around, we toggle to the + * global locale; populate the return; then toggle back. We have to use + * LC_ALL instead of the individual categories because of yet another bug + * in Windows. And this all has to be done in a critical section. + * + * This introduces a potential race with any other thread that has also + * converted to use the global locale, and doesn't protect its locale calls + * with mutexes. khw can't think of any reason for a thread to do so on + * Windows, as the locale API is the same regardless of thread-safety, except + * if the code is ported from working on another platform where there might + * be some reason to do this. But this is typically due to some + * alien-to-Perl library that thinks it owns locale setting. Such a + * library isn't likely to exist on Windows, so such an application is + * unlikely to be run on Windows + */ + bool restore_per_thread = FALSE; + + /* Save the per-thread locale state */ + const char * save_thread = querylocale_c(LC_ALL); + + /* Change to the global locale, and note if we already were there */ + if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE) + != _DISABLE_PER_THREAD_LOCALE) + { + restore_per_thread = TRUE; + } + + /* Save the state of the global locale; then convert to our desired + * state. */ + const char * save_global = querylocale_c(LC_ALL); + void_setlocale_c(LC_ALL, save_thread); + +# endif /* TS_W32_BROKEN_LOCALECONV */ + + /* Finally, do the actual localeconv */ + const char *lcbuf_as_string = (const char *) localeconv(); + + /* Fill in the string fields of the HV* */ + for (unsigned int i = 0; i < 2; i++) { + +# ifdef USE_LOCALE_NUMERIC + + /* One iteration is only for the numeric string fields */ + if ( i == NUMERIC_STRING_OFFSET + && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0) + { + continue; + } + +# endif +# ifdef USE_LOCALE_MONETARY + + /* The other iteration is only for the monetary string fields */ + if ( i == MONETARY_STRING_OFFSET + && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0) + { + continue; + } + +# endif + + /* For each field for the given category ... */ + const lconv_offset_t * category_strings = strings[i]; + while (1) { + const char * name = category_strings->name; + if (! name) { /* Quit at the end */ + break; + } + + /* we have set things up so that we know where in the returned + * structure, when viewed as a string, the corresponding value is. + * */ + const char *value = *((const char **)( lcbuf_as_string + + category_strings->offset)); + + /* Set to get next string on next iteration */ + category_strings++; + + /* Skip if this platform doesn't have this field. */ + if (! value) { + continue; + } + + /* Copy to the hash */ + (void) hv_store(hv, + name, strlen(name), + newSVpv(value, strlen(value)), + 0); + } + + /* Add any int fields to the HV* */ + if (i == MONETARY_STRING_OFFSET && integers) { + while (integers->name) { + const char value = *((const char *)( lcbuf_as_string + + integers->offset)); + (void) hv_store(hv, integers->name, + strlen(integers->name), newSViv(value), 0); + integers++; + } + } + } /* End of loop through the fields */ + + /* Done with copying to the hash. Can unwind the critical section locks */ + +# ifdef TS_W32_BROKEN_LOCALECONV + + /* Restore the global locale's prior state */ + void_setlocale_c(LC_ALL, save_global); + + /* And back to per-thread locales */ + if (restore_per_thread) { + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + } + + /* Restore the per-thread locale state */ + void_setlocale_c(LC_ALL, save_thread); + +# endif /* TS_W32_BROKEN_LOCALECONV */ + + gwLOCALE_UNLOCK; /* Finished with the critical section of a + globally-accessible buffer */ + +# ifdef USE_LOCALE_NUMERIC + + restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale); + if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) { + LC_NUMERIC_UNLOCK; + } + +# endif +# ifdef USE_LOCALE_CTYPE + + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); + +# endif + +} + +#endif /* defined(HAS_LOCALECONV) */ +#ifndef HAS_SOME_LANGINFO + +typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */ + +#endif + +/* + +=for apidoc Perl_langinfo +=for apidoc_item Perl_langinfo8 + +C<Perl_langinfo> is an (almost) drop-in replacement for the system +C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning +the same information. But it is more thread-safe than regular +C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your +code, and can be used on systems that lack a native C<nl_langinfo>. + +However, you should instead use the improved version of this: +L</Perl_langinfo8>, which behaves identically except for an additional +parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it +returns to you how you should treat the returned string with regards to it +being encoded in UTF-8 or not. + +Concerning the differences between these and plain C<nl_langinfo()>: =over -=item * +=item a. -The reason it isn't quite a drop-in replacement is actually an advantage. The -only difference is that it returns S<C<const char *>>, whereas plain -C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation) -forbidden to write into the buffer. By declaring this C<const>, the compiler -enforces this restriction, so if it is violated, you know at compilation time, -rather than getting segfaults at runtime. +C<Perl_langinfo8> has an extra parameter, described above. Besides this, the +other reason they aren't quite a drop-in replacement is actually an advantage. +The C<const>ness of the return allows the compiler to catch attempts to write +into the returned buffer, which is illegal and could cause run-time crashes. -=item * +=item b. -It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items, +They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items, without you having to write extra code. The reason for the extra code would be because these are from the C<LC_NUMERIC> locale category, which is normally kept set by Perl so that the radix is a dot, and the separator is the empty @@ -2393,739 +3897,1096 @@ the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C (or equivalent) locale would break a lot of CPAN, which is expecting the radix (decimal point) character to be a dot.) -=item * +=item c. -The system function it replaces can have its static return buffer trashed, +The system function they replace can have its static return buffer trashed, not only by a subsequent call to that function, but by a C<freelocale>, -C<setlocale>, or other locale change. The returned buffer of this function is -not changed until the next call to it, so the buffer is never in a trashed -state. +C<setlocale>, or other locale change. The returned buffer of these functions +is not changed until the next call to one or the other, so the buffer is never +in a trashed state. -=item * +=item d. -Its return buffer is per-thread, so it also is never overwritten by a call to -this function from another thread; unlike the function it replaces. +The return buffer is per-thread, so it also is never overwritten by a call to +these functions from another thread; unlike the function it replaces. -=item * +=item e. -But most importantly, it works on systems that don't have C<nl_langinfo>, such -as Windows, hence makes your code more portable. Of the fifty-some possible +But most importantly, they work on systems that don't have C<nl_langinfo>, such +as Windows, hence making your code more portable. Of the fifty-some possible items specified by the POSIX 2008 standard, L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>, only one is completely unimplemented, though on non-Windows platforms, another -significant one is also not implemented). It uses various techniques to +significant one is not fully implemented). They use various techniques to recover the other items, including calling C<L<localeconv(3)>>, and C<L<strftime(3)>>, both of which are specified in C89, so should be always be -available. Later C<strftime()> versions have additional capabilities; C<""> is -returned for those not available on your system. +available. Later C<strftime()> versions have additional capabilities; What the +C locale yields or C<""> is returned for any item not available on your system. -It is important to note that when called with an item that is recovered by +It is important to note that, when called with an item that is recovered by using C<localeconv>, the buffer from any previous explicit call to -C<localeconv> will be overwritten. This means you must save that buffer's -contents if you need to access them after a call to this function. (But note -that you might not want to be using C<localeconv()> directly anyway, because of -issues like the ones listed in the second item of this list (above) for -C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to -call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to -unpack). +C<L<localeconv(3)>> will be overwritten. But you shouldn't be using +C<localeconv> anyway because it is is very much not thread-safe, and suffers +from the same problems outlined in item 'b.' above for the fields it returns that +are controlled by the LC_NUMERIC locale category. Instead, avoid all of those +problems by calling L</Perl_localeconv>, which is thread-safe; or by using the +methods given in L<perlcall> to call +L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe. + +=back The details for those items which may deviate from what this emulation returns and what a native C<nl_langinfo()> would return are specified in L<I18N::Langinfo>. -=back - -When using C<Perl_langinfo> on systems that don't have a native -C<nl_langinfo()>, you must +When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't +have a native C<nl_langinfo()>, you must #include "perl_langinfo.h" -before the C<perl.h> C<#include>. You can replace your C<langinfo.h> +before the C<perl.h> C<#include>. You can replace your F<langinfo.h> C<#include> with this one. (Doing it this way keeps out the symbols that plain -C<langinfo.h> would try to import into the namespace for code that doesn't need +F<langinfo.h> would try to import into the namespace for code that doesn't need it.) -The original impetus for C<Perl_langinfo()> was so that code that needs to -find out the current currency symbol, floating point radix character, or digit -grouping separator can use, on all systems, the simpler and more -thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a -pain to make thread-friendly. For other fields returned by C<localeconv>, it -is better to use the methods given in L<perlcall> to call -L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly. - =cut */ const char * -#ifdef HAS_NL_LANGINFO Perl_langinfo(const nl_item item) -#else -Perl_langinfo(const int item) -#endif { - return my_nl_langinfo(item, TRUE); + return Perl_langinfo8(item, NULL); } -STATIC const char * -#ifdef HAS_NL_LANGINFO -S_my_nl_langinfo(const nl_item item, bool toggle) -#else -S_my_nl_langinfo(const int item, bool toggle) -#endif +const char * +Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness) { dTHX; - const char * retval; + unsigned cat_index; -#ifdef USE_LOCALE_NUMERIC + PERL_ARGS_ASSERT_PERL_LANGINFO8; - /* We only need to toggle into the underlying LC_NUMERIC locale for these - * two items, and only if not already there */ - if (toggle && (( item != RADIXCHAR && item != THOUSEP) - || PL_numeric_underlying)) + if (utf8ness) { /* Assume for now */ + *utf8ness = UTF8NESS_IMMATERIAL; + } -#endif /* No toggling needed if not using LC_NUMERIC */ + /* Find the locale category that controls the input 'item'. If we are not + * paying attention to that category, instead return a default value. Also + * return the default value if there is no way for us to figure out the + * correct value. If we have some form of nl_langinfo(), we can always + * figure it out, but lacking that, there may be alternative methods that + * can be used to recover most of the possible items. Some of those + * methods need libc functions, which may or may not be available. If + * unavailable, we can't compute the correct value, so must here return the + * default. */ + switch (item) { - toggle = FALSE; + case CODESET: -#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ -# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \ - || ! defined(HAS_POSIX_2008_LOCALE) +#ifdef USE_LOCALE_CTYPE - /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC - * for those items dependent on it. This must be copied to a buffer before - * switching back, as some systems destroy the buffer when setlocale() is - * called */ + cat_index = LC_CTYPE_INDEX_; + break; - { - DECLARATION_FOR_LC_NUMERIC_MANIPULATION; +#else + return C_codeset; +#endif +#if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO) - if (toggle) { - STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); - } + case YESEXPR: case YESSTR: case NOEXPR: case NOSTR: + cat_index = LC_MESSAGES_INDEX_; + break; +#else + case YESEXPR: return "^[+1yY]"; + case YESSTR: return "yes"; + case NOEXPR: return "^[-0nN]"; + case NOSTR: return "no"; +#endif - /* Prevent interference from another thread executing this code - * section. */ - NL_LANGINFO_LOCK; + case CRNCYSTR: - /* Copy to a per-thread buffer, which is also one that won't be - * destroyed by a subsequent setlocale(), such as the - * RESTORE_LC_NUMERIC may do just below. */ - retval = save_to_buffer(nl_langinfo(item), - &PL_langinfo_buf, &PL_langinfo_bufsize, 0); - NL_LANGINFO_UNLOCK; +#if defined(USE_LOCALE_MONETARY) \ + && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)) - if (toggle) { - RESTORE_LC_NUMERIC(); - } - } + cat_index = LC_MONETARY_INDEX_; + break; +#else + return "-"; +#endif -# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ + case RADIXCHAR: - { - bool do_free = FALSE; - locale_t cur = uselocale((locale_t) 0); +#ifdef CAN_CALCULATE_RADIX - if (cur == LC_GLOBAL_LOCALE) { - cur = duplocale(LC_GLOBAL_LOCALE); - do_free = TRUE; - } + cat_index = LC_NUMERIC_INDEX_; + break; +#else + return C_decimal_point; +#endif -# ifdef USE_LOCALE_NUMERIC + case THOUSEP: - if (toggle) { - if (PL_underlying_numeric_obj) { - cur = PL_underlying_numeric_obj; - } - else { - cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); - do_free = TRUE; - } - } +#if defined(USE_LOCALE_NUMERIC) \ + && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV)) -# endif + cat_index = LC_NUMERIC_INDEX_; + break; +#else + return C_thousands_sep; +#endif - /* We have to save it to a buffer, because the freelocale() just below - * can invalidate the internal one */ - retval = save_to_buffer(nl_langinfo_l(item, cur), - &PL_langinfo_buf, &PL_langinfo_bufsize, 0); +/* The other possible items are all in LC_TIME. */ +#ifdef USE_LOCALE_TIME - if (do_free) { - freelocale(cur); - } + default: + cat_index = LC_TIME_INDEX_; + break; + +#endif +#if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO) + + /* If not using LC_TIME, hard code the rest. Or, if there is no + * nl_langinfo(), we use strftime() as an alternative, and it is missing + * functionality to get every single one, so hard-code those */ + + case ERA: return ""; /* Unimplemented; for use with strftime() %E + modifier */ + + /* These formats are defined by C89, so we assume that strftime supports + * them, and so are returned unconditionally; they may not be what the + * locale actually says, but should give good enough results for someone + * using them as formats (as opposed to trying to parse them to figure + * out what the locale says). The other format items are actually tested + * to verify they work on the platform */ + case D_FMT: return "%x"; + case T_FMT: return "%X"; + case D_T_FMT: return "%c"; + +# if defined(WIN32) || ! defined(USE_LOCALE_TIME) + + /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions + * that would allow it to recover these */ + case ERA_D_FMT: return "%x"; + case ERA_T_FMT: return "%X"; + case ERA_D_T_FMT: return "%c"; + case ALT_DIGITS: return "0"; + +# endif +# ifndef USE_LOCALE_TIME + + case T_FMT_AMPM: return "%r"; + case ABDAY_1: return "Sun"; + case ABDAY_2: return "Mon"; + case ABDAY_3: return "Tue"; + case ABDAY_4: return "Wed"; + case ABDAY_5: return "Thu"; + case ABDAY_6: return "Fri"; + case ABDAY_7: return "Sat"; + case AM_STR: return "AM"; + case PM_STR: return "PM"; + case ABMON_1: return "Jan"; + case ABMON_2: return "Feb"; + case ABMON_3: return "Mar"; + case ABMON_4: return "Apr"; + case ABMON_5: return "May"; + case ABMON_6: return "Jun"; + case ABMON_7: return "Jul"; + case ABMON_8: return "Aug"; + case ABMON_9: return "Sep"; + case ABMON_10: return "Oct"; + case ABMON_11: return "Nov"; + case ABMON_12: return "Dec"; + case DAY_1: return "Sunday"; + case DAY_2: return "Monday"; + case DAY_3: return "Tuesday"; + case DAY_4: return "Wednesday"; + case DAY_5: return "Thursday"; + case DAY_6: return "Friday"; + case DAY_7: return "Saturday"; + case MON_1: return "January"; + case MON_2: return "February"; + case MON_3: return "March"; + case MON_4: return "April"; + case MON_5: return "May"; + case MON_6: return "June"; + case MON_7: return "July"; + case MON_8: return "August"; + case MON_9: return "September"; + case MON_10: return "October"; + case MON_11: return "November"; + case MON_12: return "December"; + +# endif +#endif + + } /* End of switch on item */ + +#ifndef USE_LOCALE + + Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item); + NOT_REACHED; /* NOTREACHED */ + PERL_UNUSED_VAR(cat_index); + +#else +# ifdef USE_LOCALE_NUMERIC + + /* Use either the underlying numeric, or the other underlying categories */ + if (cat_index == LC_NUMERIC_INDEX_) { + return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name, + &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness); } + else # endif - if (strEQ(retval, "")) { - if (item == YESSTR) { - return "yes"; - } - if (item == NOSTR) { - return "no"; - } + { + return my_langinfo_i(item, cat_index, querylocale_i(cat_index), + &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness); } - return retval; +#endif -#else /* Below, emulate nl_langinfo as best we can */ +} - { +#ifdef USE_LOCALE -# ifdef HAS_LOCALECONV +/* There are several implementations of my_langinfo, depending on the + * Configuration. They all share the same beginning of the function */ +STATIC const char * +S_my_langinfo_i(pTHX_ + const nl_item item, /* The item to look up */ + const unsigned int cat_index, /* The locale category that + controls it */ + /* The locale to look up 'item' in. */ + const char * locale, + + /* Where to store the result, and where the size of that buffer + * is stored, updated on exit. retbuf_sizep may be NULL for an + * empty-on-entry, single use buffer whose size we don't need + * to keep track of */ + const char ** retbufp, + Size_t * retbuf_sizep, + + /* If not NULL, the location to store the UTF8-ness of 'item's + * value, as documented */ + utf8ness_t * utf8ness) +{ + const char * retval = NULL; - const struct lconv* lc; - const char * temp; - DECLARATION_FOR_LC_NUMERIC_MANIPULATION; + PERL_ARGS_ASSERT_MY_LANGINFO_I; + assert(cat_index < NOMINAL_LC_ALL_INDEX); -# ifdef TS_W32_BROKEN_LOCALECONV + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "Entering my_langinfo item=%ld, using locale %s\n", + (long) item, locale)); +/*--------------------------------------------------------------------------*/ +/* Above is the common beginning to all the implementations of my_langinfo(). + * Below are the various completions. + * + * Some platforms don't deal well with non-ASCII strings in locale X when + * LC_CTYPE is not in X. (Actually it is probably when X is UTF-8 and LC_CTYPE + * isn't, or vice versa). There is explicit code to bring the categories into + * sync. This doesn't seem to be a problem with nl_langinfo(), so that + * implementation doesn't currently worry about it. But it is a problem on + * Windows boxes, which don't have nl_langinfo(). */ + +/*--------------------------------------------------------------------------*/ +# if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ +# ifdef USE_LOCALE_CTYPE - const char * save_global; - const char * save_thread; - int needed_size; - char * ptr; - char * e; - char * item_start; + /* Ths function sorts out if things actually have to be switched or not, + * for both calls. */ + const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); # endif -# endif -# ifdef HAS_STRFTIME - struct tm tm; - bool return_format = FALSE; /* Return the %format, not the value */ - const char * format; + const char * orig_switched_locale = toggle_locale_i(cat_index, locale); -# endif + gwLOCALE_LOCK; + retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep); + gwLOCALE_UNLOCK; - /* We copy the results to a per-thread buffer, even if not - * multi-threaded. This is in part to simplify this code, and partly - * because we need a buffer anyway for strftime(), and partly because a - * call of localeconv() could otherwise wipe out the buffer, and the - * programmer would not be expecting this, as this is a nl_langinfo() - * substitute after all, so s/he might be thinking their localeconv() - * is safe until another localeconv() call. */ + if (utf8ness) { + *utf8ness = get_locale_string_utf8ness_i(retval, + LOCALE_UTF8NESS_UNKNOWN, + locale, cat_index); + } - switch (item) { - Size_t len; + restore_toggled_locale_i(cat_index, orig_switched_locale); - /* This is unimplemented */ - case ERA: /* For use with strftime() %E modifier */ +# ifdef USE_LOCALE_CTYPE + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); +# endif - default: - return ""; + return retval; +/*--------------------------------------------------------------------------*/ +# else /* Below, emulate nl_langinfo as best we can */ - /* We use only an English set, since we don't know any more */ - case YESEXPR: return "^[+1yY]"; - case YESSTR: return "yes"; - case NOEXPR: return "^[-0nN]"; - case NOSTR: return "no"; + /* And the third and final completion is where we have to emulate + * nl_langinfo(). There are various possibilities depending on the + * Configuration */ - case CODESET: +# ifdef USE_LOCALE_CTYPE -# ifndef WIN32 + const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale); - /* On non-windows, this is unimplemented, in part because of - * inconsistencies between vendors. The Darwin native - * nl_langinfo() implementation simply looks at everything past - * any dot in the name, but that doesn't work for other - * vendors. Many Linux locales that don't have UTF-8 in their - * names really are UTF-8, for example; z/OS locales that do - * have UTF-8 in their names, aren't really UTF-8 */ - return ""; +# endif -# else + const char * orig_switched_locale = toggle_locale_i(cat_index, locale); - { /* But on Windows, the name does seem to be consistent, so - use that. */ - const char * p; - const char * first; - Size_t offset = 0; - const char * name = my_setlocale(LC_CTYPE, NULL); + /* Here, we are in the locale we want information about */ - if (isNAME_C_OR_POSIX(name)) { - return "ANSI_X3.4-1968"; - } + /* Almost all the items will have ASCII return values. Set that here, and + * override if necessary */ + utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL; - /* Find the dot in the locale name */ - first = (const char *) strchr(name, '.'); - if (! first) { - first = name; - goto has_nondigit; - } + switch (item) { + default: + assert(item < 0); /* Make sure using perl_langinfo.h */ + retval = ""; + break; - /* Look at everything past the dot */ - first++; - p = first; + case RADIXCHAR: - while (*p) { - if (! isDIGIT(*p)) { - goto has_nondigit; - } +# if defined(HAS_SNPRINTF) \ + && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV)) - p++; - } + { + /* snprintf() can be used to find the radix character by outputting + * a known simple floating point number to a buffer, and parsing + * it, inferring the radix as the bytes separating the integer and + * fractional parts. But localeconv() is more direct, not + * requiring inference, so use it instead of the code just below, + * if (likely) it is available and works ok */ + + char * floatbuf = NULL; + const Size_t initial_size = 10; + + Newx(floatbuf, initial_size, char); + + /* 1.5 is exactly representable on binary computers */ + Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5); + + /* If our guess wasn't big enough, increase and try again, based on + * the real number that snprintf() is supposed to return */ + if (UNLIKELY(needed_size >= initial_size)) { + needed_size++; /* insurance */ + Renew(floatbuf, needed_size, char); + Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5); + assert(new_needed <= needed_size); + needed_size = new_needed; + } - /* Here everything past the dot is a digit. Treat it as a - * code page */ - retval = save_to_buffer("CP", &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); - offset = STRLENs("CP"); + char * s = floatbuf; + char * e = floatbuf + needed_size; - has_nondigit: + /* Find the '1' */ + while (s < e && *s != '1') { + s++; + } - retval = save_to_buffer(first, &PL_langinfo_buf, - &PL_langinfo_bufsize, offset); + if (LIKELY(s < e)) { + s++; + } + + /* Find the '5' */ + char * item_start = s; + while (s < e && *s != '5') { + s++; + } + + /* Everything in between is the radix string */ + if (LIKELY(s < e)) { + *s = '\0'; + retval = save_to_buffer(item_start, retbufp, retbuf_sizep); + Safefree(floatbuf); + + if (utf8ness) { + is_utf8 = get_locale_string_utf8ness_i(retval, + LOCALE_UTF8NESS_UNKNOWN, + locale, cat_index); } break; + } -# endif -# ifdef HAS_LOCALECONV + Safefree(floatbuf); + } - case CRNCYSTR: +# ifdef HAS_LOCALECONV /* snprintf() failed; drop down to use + localeconv() */ - /* We don't bother with localeconv_l() because any system that - * has it is likely to also have nl_langinfo() */ + /* FALLTHROUGH */ - LOCALECONV_LOCK; /* Prevent interference with other threads - using localeconv() */ +# else /* snprintf() failed and no localeconv() */ -# ifdef TS_W32_BROKEN_LOCALECONV + retval = C_decimal_point; + break; - /* This is a workaround for a Windows bug prior to VS 15. - * What we do here is, while locked, switch to the global - * locale so localeconv() works; then switch back just before - * the unlock. This can screw things up if some thread is - * already using the global locale while assuming no other is. - * A different workaround would be to call GetCurrencyFormat on - * a known value, and parse it; patches welcome - * - * We have to use LC_ALL instead of LC_MONETARY because of - * another bug in Windows */ +# endif +# endif +# ifdef HAS_LOCALECONV - save_thread = savepv(my_setlocale(LC_ALL, NULL)); - _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); - save_global= savepv(my_setlocale(LC_ALL, NULL)); - my_setlocale(LC_ALL, save_thread); + /* These items are available from localeconv(). (To avoid using + * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and + * GetCurrencyFormat; patches welcome) */ -# endif +# define P_CS_PRECEDES "p_cs_precedes" +# define CURRENCY_SYMBOL "currency_symbol" - lc = localeconv(); - if ( ! lc - || ! lc->currency_symbol - || strEQ("", lc->currency_symbol)) - { - LOCALECONV_UNLOCK; - return ""; - } + /* case RADIXCHAR: // May drop down to here in some configurations */ + case THOUSEP: + case CRNCYSTR: + { - /* Leave the first spot empty to be filled in below */ - retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, - &PL_langinfo_bufsize, 1); - if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) - { /* khw couldn't figure out how the localedef specifications - would show that the $ should replace the radix; this is - just a guess as to how it might work.*/ - PL_langinfo_buf[0] = '.'; - } - else if (lc->p_cs_precedes) { - PL_langinfo_buf[0] = '-'; + /* The hash gets populated with just the field(s) related to 'item'. */ + HV * result_hv = my_localeconv(item); + + SV* string; + if (item != CRNCYSTR) { + + /* These items have been populated with just one key => value */ + (void) hv_iterinit(result_hv); + HE * entry = hv_iternext(result_hv); + string = hv_iterval(result_hv, entry); + } + else { + + /* But CRNCYSTR localeconv() returns a slightly different value + * than the nl_langinfo() API calls for, so have to modify this one + * to conform. We need another value from localeconv() to know + * what to change it to. my_localeconv() has populated the hash + * with exactly both fields. Delete this one, leaving just the + * CRNCYSTR one in the hash */ + SV* precedes = hv_delete(result_hv, + P_CS_PRECEDES, STRLENs(P_CS_PRECEDES), + 0); + if (! precedes) { + locale_panic_("my_localeconv() unexpectedly didn't return" + " a value for " P_CS_PRECEDES); + } + + /* The modification is to prefix the localeconv() return with a + * single byte, calculated as follows: */ + char prefix = (LIKELY(SvIV(precedes) != -1)) + ? ((precedes != 0) ? '-' : '+') + + /* khw couldn't find any documentation that + * CHAR_MAX (which we modify to -1) is the signal, + * but cygwin uses it thusly, and it makes sense + * given that CHAR_MAX indicates the value isn't + * used, so it neither precedes nor succeeds */ + : '.'; + + /* Now get CRNCYSTR */ + (void) hv_iterinit(result_hv); + HE * entry = hv_iternext(result_hv); + string = hv_iterval(result_hv, entry); + + /* And perform the modification */ + Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string)); + } + + /* Here, 'string' contains the value we want to return */ + retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep); + + if (utf8ness) { + is_utf8 = (SvUTF8(string)) + ? UTF8NESS_YES + : (is_utf8_invariant_string( (U8 *) retval, + strlen(retval))) + ? UTF8NESS_IMMATERIAL + : UTF8NESS_NO; + } + + break; + + } + +# endif /* Some form of localeconv */ +# ifdef HAS_STRFTIME + + /* These formats are only available in later strftime's */ + case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM: + + /* The rest can be gotten from most versions of strftime(). */ + case ABDAY_1: case ABDAY_2: case ABDAY_3: + case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7: + case ALT_DIGITS: + case AM_STR: case PM_STR: + case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4: + case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8: + case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12: + case DAY_1: case DAY_2: case DAY_3: case DAY_4: + case DAY_5: case DAY_6: case DAY_7: + case MON_1: case MON_2: case MON_3: case MON_4: + case MON_5: case MON_6: case MON_7: case MON_8: + case MON_9: case MON_10: case MON_11: case MON_12: + { + const char * format; + bool return_format = FALSE; + int mon = 0; + int mday = 1; + int hour = 6; + + GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough); + + switch (item) { + default: + locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item)); + NOT_REACHED; /* NOTREACHED */ + + case PM_STR: hour = 18; + case AM_STR: + format = "%p"; + break; + case ABDAY_7: mday++; + case ABDAY_6: mday++; + case ABDAY_5: mday++; + case ABDAY_4: mday++; + case ABDAY_3: mday++; + case ABDAY_2: mday++; + case ABDAY_1: + format = "%a"; + break; + case DAY_7: mday++; + case DAY_6: mday++; + case DAY_5: mday++; + case DAY_4: mday++; + case DAY_3: mday++; + case DAY_2: mday++; + case DAY_1: + format = "%A"; + break; + case ABMON_12: mon++; + case ABMON_11: mon++; + case ABMON_10: mon++; + case ABMON_9: mon++; + case ABMON_8: mon++; + case ABMON_7: mon++; + case ABMON_6: mon++; + case ABMON_5: mon++; + case ABMON_4: mon++; + case ABMON_3: mon++; + case ABMON_2: mon++; + case ABMON_1: + format = "%b"; + break; + case MON_12: mon++; + case MON_11: mon++; + case MON_10: mon++; + case MON_9: mon++; + case MON_8: mon++; + case MON_7: mon++; + case MON_6: mon++; + case MON_5: mon++; + case MON_4: mon++; + case MON_3: mon++; + case MON_2: mon++; + case MON_1: + format = "%B"; + break; + case T_FMT_AMPM: + format = "%r"; + return_format = TRUE; + break; + case ERA_D_FMT: + format = "%Ex"; + return_format = TRUE; + break; + case ERA_T_FMT: + format = "%EX"; + return_format = TRUE; + break; + case ERA_D_T_FMT: + format = "%Ec"; + return_format = TRUE; + break; + case ALT_DIGITS: + format = "%Ow"; /* Find the alternate digit for 0 */ + break; + } + + GCC_DIAG_RESTORE_STMT; + + /* The year was deliberately chosen so that January 1 is on the + * first day of the week. Since we're only getting one thing at a + * time, it all works */ + const char * temp = my_strftime8_temp(format, 30, 30, hour, mday, mon, + 2011, 0, 0, 0, &is_utf8); + retval = save_to_buffer(temp, retbufp, retbuf_sizep); + Safefree(temp); + + /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate + * format for wday 0. If the value is the same as the normal 0, + * there isn't an alternate, so clear the buffer. + * + * (wday was chosen because its range is all a single digit. + * Things like tm_sec have two digits as the minimum: '00'.) */ + if (item == ALT_DIGITS && strEQ(*retbufp, "0")) { + retval = ""; + break; + } + + /* ALT_DIGITS is problematic. Experiments on it showed that + * strftime() did not always work properly when going from alt-9 to + * alt-10. Only a few locales have this item defined, and in all + * of them on Linux that khw was able to find, nl_langinfo() merely + * returned the alt-0 character, possibly doubled. Most Unicode + * digits are in blocks of 10 consecutive code points, so that is + * sufficient information for such scripts, as we can infer alt-1, + * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is + * returned, and the CJK digits are not in code point order, so you + * can't really infer anything. The localedef for this locale did + * specify the succeeding digits, so that strftime() works properly + * on them, without needing to infer anything. But the + * nl_langinfo() return did not give sufficient information for the + * caller to understand what's going on. So until there is + * evidence that it should work differently, this returns the alt-0 + * string for ALT_DIGITS. */ + + if (return_format) { + + /* If to return the format, not the value, overwrite the buffer + * with it. But some strftime()s will keep the original format + * if illegal, so change those to "" */ + if (strEQ(*retbufp, format)) { + retval = ""; } else { - PL_langinfo_buf[0] = '+'; + retval = format; } -# ifdef TS_W32_BROKEN_LOCALECONV + /* A format is always in ASCII */ + is_utf8 = UTF8NESS_IMMATERIAL; + } - my_setlocale(LC_ALL, save_global); - _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); - my_setlocale(LC_ALL, save_thread); - Safefree(save_global); - Safefree(save_thread); + break; + } # endif - LOCALECONV_UNLOCK; - break; + case CODESET: -# ifdef TS_W32_BROKEN_LOCALECONV + /* The trivial case */ + if (isNAME_C_OR_POSIX(locale)) { + retval = C_codeset; + break; + } - case RADIXCHAR: +# ifdef WIN32 - /* For this, we output a known simple floating point number to - * a buffer, and parse it, looking for the radix */ + /* This function retrieves the code page. It is subject to change, but + * is documented and has been stable for many releases */ + UINT ___lc_codepage_func(void); - if (toggle) { - STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); - } + retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()), + retbufp, retbuf_sizep); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n", + locale, retval)); + break; - if (PL_langinfo_bufsize < 10) { - PL_langinfo_bufsize = 10; - Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); - } +# else - needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, - "%.1f", 1.5); - if (needed_size >= (int) PL_langinfo_bufsize) { - PL_langinfo_bufsize = needed_size + 1; - Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); - needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, - "%.1f", 1.5); - assert(needed_size < (int) PL_langinfo_bufsize); - } + /* The codeset is important, but khw did not figure out a way for it to + * be retrieved on non-Windows boxes without nl_langinfo(). But even + * if we can't get it directly, we can usually determine if it is a + * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for + * the code set. */ + +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + + /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT + * CHARACTER as that Unicode code point, this has to be a UTF-8 locale. + * */ + wchar_t wc = 0; + (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */ + int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc, + STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); + if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "mbtowc returned REPLACEMENT\n")); + retval = "UTF-8"; + break; + } - ptr = PL_langinfo_buf; - e = PL_langinfo_buf + PL_langinfo_bufsize; + /* Here, it isn't a UTF-8 locale. */ - /* Find the '1' */ - while (ptr < e && *ptr != '1') { - ptr++; - } - ptr++; +# else /* mbtowc() is not available. */ - /* Find the '5' */ - item_start = ptr; - while (ptr < e && *ptr != '5') { - ptr++; - } + /* Sling together several possibilities, depending on platform + * capabilities and what we found. + * + * For non-English locales or non-dollar currency locales, we likely + * will find out whether a locale is UTF-8 or not */ - /* Everything in between is the radix string */ - if (ptr >= e) { - PL_langinfo_buf[0] = '?'; - PL_langinfo_buf[1] = '\0'; - } - else { - *ptr = '\0'; - Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char); + utf8ness_t is_utf8 = UTF8NESS_UNKNOWN; + const char * scratch_buf = NULL; + +# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV) + + /* Can't use this method unless localeconv() is available, as that's + * the way we find out the currency symbol. */ + + /* First try looking at the currency symbol (via a recursive call) to + * see if it disambiguates things. Often that will be in the native + * script, and if the symbol isn't legal UTF-8, we know that the locale + * isn't either. */ + (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL, + &is_utf8); + Safefree(scratch_buf); + +# endif +# ifdef USE_LOCALE_TIME + + /* If we have ruled out being UTF-8, no point in checking further. */ + if (is_utf8 != UTF8NESS_NO) { + + /* But otherwise do check more. This is done even if the currency + * symbol looks to be UTF-8, just in case that's a false positive. + * + * Look at the LC_TIME entries, like the names of the months or + * weekdays. We quit at the first one that is illegal UTF-8 */ + + utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN; + const int times[] = { + DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7, + MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8, + MON_9, MON_10, MON_11, MON_12, + ALT_DIGITS, AM_STR, PM_STR, + ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6, + ABDAY_7, + ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6, + ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12 + }; + + /* The code in the recursive call can handle switching the locales, + * but by doing it here, we avoid switching each iteration of the + * loop */ + const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale); + + for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) { + scratch_buf = NULL; + (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf, + NULL, &this_is_utf8); + Safefree(scratch_buf); + if (this_is_utf8 == UTF8NESS_NO) { + is_utf8 = UTF8NESS_NO; + break; } - if (toggle) { - RESTORE_LC_NUMERIC(); + if (this_is_utf8 == UTF8NESS_YES) { + is_utf8 = UTF8NESS_YES; } + } - retval = PL_langinfo_buf; - break; + /* Here we have gone through all the LC_TIME elements. is_utf8 has + * been set as follows: + * UTF8NESS_NO If at least one is't legal UTF-8 + * UTF8NESS_IMMMATERIAL If all are ASCII + * UTF8NESS_YES If all are legal UTF-8 (including + * ASCIIi), and at least one isn't + * ASCII. */ -# else + restore_toggled_locale_c(LC_TIME, orig_TIME_locale); + } - case RADIXCHAR: /* No special handling needed */ +# endif /* LC_TIME */ -# endif + /* If nothing examined above rules out it being UTF-8, and at least one + * thing fits as UTF-8 (and not plain ASCII), assume the codeset is + * UTF-8. */ + if (is_utf8 == UTF8NESS_YES) { + retval = "UTF-8"; + break; + } - case THOUSEP: + /* Here, nothing examined indicates that the codeset is UTF-8. But + * what is it? The other locale categories are not likely to be of + * further help: + * + * LC_NUMERIC Only a few locales in the world have a non-ASCII radix + * or group separator. + * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and + * was reliable. This is unlikely in C99. There are + * other functions that could be used instead, but are + * they going to exist, and be able to distinguish between + * UTF-8 and 8859-1? Deal with this only if it becomes + * necessary. + * LC_MESSAGES The strings returned from strerror() would seem likely + * candidates, but experience has shown that many systems + * don't actually have translations installed for them. + * They are instead always in English, so everything in + * them is ASCII, which is of no help to us. A Configure + * probe could possibly be written to see if this platform + * has non-ASCII error messages. But again, wait until it + * turns out to be an actual problem. */ + +# endif /* ! mbtowc() */ + + /* Rejoin the mbtowc available/not-available cases. + * + * We got here only because we haven't been able to find the codeset. + * The only other option khw could think of is to see if the codeset is + * part of the locale name. This is very less than ideal; often there + * is no code set in the name; and at other times they even lie. + * + * But there is an XPG standard syntax, which many locales follow: + * + * language[_territory[.codeset]][@modifier] + * + * So we take the part between the dot and any '@' */ + retval = (const char *) strchr(locale, '.'); + if (! retval) { + retval = ""; /* Alas, no dot */ + break; + } - if (toggle) { - STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); - } + /* Don't include the dot */ + retval++; + + /* And stop before any '@' */ + const char * modifier = strchr(retval, '@'); + if (modifier) { + char * code_set_name; + const Size_t name_len = modifier - retval; + Newx(code_set_name, name_len + 1, char); /* +1 for NUL */ + my_strlcpy(code_set_name, retval, name_len + 1); + SAVEFREEPV(code_set_name); + retval = code_set_name; + } - LOCALECONV_LOCK; /* Prevent interference with other threads - using localeconv() */ - -# ifdef TS_W32_BROKEN_LOCALECONV - - /* This should only be for the thousands separator. A - * different work around would be to use GetNumberFormat on a - * known value and parse the result to find the separator */ - save_thread = savepv(my_setlocale(LC_ALL, NULL)); - _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); - save_global = savepv(my_setlocale(LC_ALL, NULL)); - my_setlocale(LC_ALL, save_thread); -# if 0 - /* This is the start of code that for broken Windows replaces - * the above and below code, and instead calls - * GetNumberFormat() and then would parse that to find the - * thousands separator. It needs to handle UTF-16 vs -8 - * issues. */ - - needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize); - DEBUG_L(PerlIO_printf(Perl_debug_log, - "%s: %d: return from GetNumber, count=%d, val=%s\n", - __FILE__, __LINE__, needed_size, PL_langinfo_buf)); +# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) + + /* When these functions, are available, they were tried earlier and + * indicated that the locale did not act like a proper UTF-8 one. So + * if it claims to be UTF-8, it is a lie */ + if (is_codeset_name_UTF8(retval)) { + retval = ""; + break; + } # endif -# endif - lc = localeconv(); - if (! lc) { - temp = ""; - } - else { - temp = (item == RADIXCHAR) - ? lc->decimal_point - : lc->thousands_sep; - if (! temp) { - temp = ""; - } - } + /* Otherwise the code set name is considered to be everything between + * the dot and the '@' */ + retval = save_to_buffer(retval, retbufp, retbuf_sizep); + + break; - retval = save_to_buffer(temp, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); +# endif -# ifdef TS_W32_BROKEN_LOCALECONV + } /* Giant switch() of nl_langinfo() items */ - my_setlocale(LC_ALL, save_global); - _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); - my_setlocale(LC_ALL, save_thread); - Safefree(save_global); - Safefree(save_thread); + restore_toggled_locale_i(cat_index, orig_switched_locale); +# ifdef USE_LOCALE_CTYPE + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); # endif - LOCALECONV_UNLOCK; + if (utf8ness) { + *utf8ness = is_utf8; + } - if (toggle) { - RESTORE_LC_NUMERIC(); - } + return retval; - break; +# endif /* All the implementations of my_langinfo() */ -# endif -# ifdef HAS_STRFTIME - - /* These are defined by C89, so we assume that strftime supports - * them, and so are returned unconditionally; they may not be what - * the locale actually says, but should give good enough results - * for someone using them as formats (as opposed to trying to parse - * them to figure out what the locale says). The other format - * items are actually tested to verify they work on the platform */ - case D_FMT: return "%x"; - case T_FMT: return "%X"; - case D_T_FMT: return "%c"; - - /* These formats are only available in later strfmtime's */ - case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM: - - /* The rest can be gotten from most versions of strftime(). */ - case ABDAY_1: case ABDAY_2: case ABDAY_3: - case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7: - case ALT_DIGITS: - case AM_STR: case PM_STR: - case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4: - case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8: - case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12: - case DAY_1: case DAY_2: case DAY_3: case DAY_4: - case DAY_5: case DAY_6: case DAY_7: - case MON_1: case MON_2: case MON_3: case MON_4: - case MON_5: case MON_6: case MON_7: case MON_8: - case MON_9: case MON_10: case MON_11: case MON_12: - - init_tm(&tm); /* Precaution against core dumps */ - tm.tm_sec = 30; - tm.tm_min = 30; - tm.tm_hour = 6; - tm.tm_year = 2017 - 1900; - tm.tm_wday = 0; - tm.tm_mon = 0; - - GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough); - - switch (item) { - default: - Perl_croak(aTHX_ - "panic: %s: %d: switch case: %d problem", - __FILE__, __LINE__, item); - NOT_REACHED; /* NOTREACHED */ - - case PM_STR: tm.tm_hour = 18; - case AM_STR: - format = "%p"; - break; - - case ABDAY_7: tm.tm_wday++; - case ABDAY_6: tm.tm_wday++; - case ABDAY_5: tm.tm_wday++; - case ABDAY_4: tm.tm_wday++; - case ABDAY_3: tm.tm_wday++; - case ABDAY_2: tm.tm_wday++; - case ABDAY_1: - format = "%a"; - break; - - case DAY_7: tm.tm_wday++; - case DAY_6: tm.tm_wday++; - case DAY_5: tm.tm_wday++; - case DAY_4: tm.tm_wday++; - case DAY_3: tm.tm_wday++; - case DAY_2: tm.tm_wday++; - case DAY_1: - format = "%A"; - break; - - case ABMON_12: tm.tm_mon++; - case ABMON_11: tm.tm_mon++; - case ABMON_10: tm.tm_mon++; - case ABMON_9: tm.tm_mon++; - case ABMON_8: tm.tm_mon++; - case ABMON_7: tm.tm_mon++; - case ABMON_6: tm.tm_mon++; - case ABMON_5: tm.tm_mon++; - case ABMON_4: tm.tm_mon++; - case ABMON_3: tm.tm_mon++; - case ABMON_2: tm.tm_mon++; - case ABMON_1: - format = "%b"; - break; - - case MON_12: tm.tm_mon++; - case MON_11: tm.tm_mon++; - case MON_10: tm.tm_mon++; - case MON_9: tm.tm_mon++; - case MON_8: tm.tm_mon++; - case MON_7: tm.tm_mon++; - case MON_6: tm.tm_mon++; - case MON_5: tm.tm_mon++; - case MON_4: tm.tm_mon++; - case MON_3: tm.tm_mon++; - case MON_2: tm.tm_mon++; - case MON_1: - format = "%B"; - break; - - case T_FMT_AMPM: - format = "%r"; - return_format = TRUE; - break; - - case ERA_D_FMT: - format = "%Ex"; - return_format = TRUE; - break; - - case ERA_T_FMT: - format = "%EX"; - return_format = TRUE; - break; - - case ERA_D_T_FMT: - format = "%Ec"; - return_format = TRUE; - break; - - case ALT_DIGITS: - tm.tm_wday = 0; - format = "%Ow"; /* Find the alternate digit for 0 */ - break; - } +/*--------------------------------------------------------------------------*/ - GCC_DIAG_RESTORE_STMT; +} /* my_langinfo() */ - /* We can't use my_strftime() because it doesn't look at - * tm_wday */ - while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, - format, &tm)) - { - /* A zero return means one of: - * a) there wasn't enough space in PL_langinfo_buf - * b) the format, like a plain %p, returns empty - * c) it was an illegal format, though some - * implementations of strftime will just return the - * illegal format as a plain character sequence. - * - * To quickly test for case 'b)', try again but precede - * the format with a plain character. If that result is - * still empty, the problem is either 'a)' or 'c)' */ - - Size_t format_size = strlen(format) + 1; - Size_t mod_size = format_size + 1; - char * mod_format; - char * temp_result; - - Newx(mod_format, mod_size, char); - Newx(temp_result, PL_langinfo_bufsize, char); - *mod_format = ' '; - my_strlcpy(mod_format + 1, format, mod_size); - len = strftime(temp_result, - PL_langinfo_bufsize, - mod_format, &tm); - Safefree(mod_format); - Safefree(temp_result); - - /* If 'len' is non-zero, it means that we had a case like - * %p which means the current locale doesn't use a.m. or - * p.m., and that is valid */ - if (len == 0) { - - /* Here, still didn't work. If we get well beyond a - * reasonable size, bail out to prevent an infinite - * loop. */ - - if (PL_langinfo_bufsize > 100 * format_size) { - *PL_langinfo_buf = '\0'; - } - else { - /* Double the buffer size to retry; Add 1 in case - * original was 0, so we aren't stuck at 0. */ - PL_langinfo_bufsize *= 2; - PL_langinfo_bufsize++; - Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); - continue; - } - } +#endif /* USE_LOCALE */ - break; - } +char * +Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst) +{ +#ifdef HAS_STRFTIME - /* Here, we got a result. - * - * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the - * alternate format for wday 0. If the value is the same as - * the normal 0, there isn't an alternate, so clear the buffer. - * */ - if ( item == ALT_DIGITS - && strEQ(PL_langinfo_buf, "0")) - { - *PL_langinfo_buf = '\0'; - } +/* +=for apidoc_section $time +=for apidoc my_strftime - /* ALT_DIGITS is problematic. Experiments on it showed that - * strftime() did not always work properly when going from - * alt-9 to alt-10. Only a few locales have this item defined, - * and in all of them on Linux that khw was able to find, - * nl_langinfo() merely returned the alt-0 character, possibly - * doubled. Most Unicode digits are in blocks of 10 - * consecutive code points, so that is sufficient information - * for those scripts, as we can infer alt-1, alt-2, .... But - * for a Japanese locale, a CJK ideographic 0 is returned, and - * the CJK digits are not in code point order, so you can't - * really infer anything. The localedef for this locale did - * specify the succeeding digits, so that strftime() works - * properly on them, without needing to infer anything. But - * the nl_langinfo() return did not give sufficient information - * for the caller to understand what's going on. So until - * there is evidence that it should work differently, this - * returns the alt-0 string for ALT_DIGITS. - * - * wday was chosen because its range is all a single digit. - * Things like tm_sec have two digits as the minimum: '00' */ +strftime(), but with a different API so that the return value is a pointer +to the formatted result (which MUST be arranged to be FREED BY THE +CALLER). This allows this function to increase the buffer size as needed, +so that the caller doesn't have to worry about that. - retval = PL_langinfo_buf; +On failure it returns NULL. - /* If to return the format, not the value, overwrite the buffer - * with it. But some strftime()s will keep the original format - * if illegal, so change those to "" */ - if (return_format) { - if (strEQ(PL_langinfo_buf, format)) { - *PL_langinfo_buf = '\0'; - } - else { - retval = save_to_buffer(format, &PL_langinfo_buf, - &PL_langinfo_bufsize, 0); - } - } +Note that yday and wday effectively are ignored by this function, as +mini_mktime() overwrites them. - break; +Also note that it is always executed in the underlying C<LC_TIME> locale of +the program, giving results based on that locale. +=cut + */ + PERL_ARGS_ASSERT_MY_STRFTIME; + + /* An empty format yields an empty result */ + const int fmtlen = strlen(fmt); + if (fmtlen == 0) { + char *ret; + Newxz (ret, 1, char); + return ret; + } + + /* Set mytm to now */ + struct tm mytm; + init_tm(&mytm); /* XXX workaround - see Perl_init_tm() */ + + /* Override with the passed-in values */ + mytm.tm_sec = sec; + mytm.tm_min = min; + mytm.tm_hour = hour; + mytm.tm_mday = mday; + mytm.tm_mon = mon; + mytm.tm_year = year; + mytm.tm_wday = wday; + mytm.tm_yday = yday; + mytm.tm_isdst = isdst; + mini_mktime(&mytm); + + /* use libc to get the values for tm_gmtoff and tm_zone on platforms that + * have them [perl #18238] */ +#if defined(HAS_MKTIME) && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE)) + struct tm mytm2; + mytm2 = mytm; + MKTIME_LOCK; + mktime(&mytm2); + MKTIME_UNLOCK; +# ifdef HAS_TM_TM_GMTOFF + mytm.tm_gmtoff = mytm2.tm_gmtoff; # endif +# ifdef HAS_TM_TM_ZONE + mytm.tm_zone = mytm2.tm_zone; +# endif +#endif +#if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME) + + const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE, + querylocale_c(LC_TIME)); +#endif + /* Guess an initial size for the returned string based on an expansion + * factor of the input format, but with a minimum that should handle most + * common cases. If this guess is too small, we will try again with a + * larger one */ + int bufsize = MAX(fmtlen * 2, 64); + + char *buf = NULL; /* Makes Renew() act as Newx() on the first iteration */ + do { + Renew(buf, bufsize, char); + + GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */ + + STRFTIME_LOCK; + int len = strftime(buf, bufsize, fmt, &mytm); + STRFTIME_UNLOCK; + + GCC_DIAG_RESTORE_STMT; + + /* A non-zero return indicates success. But to make sure we're not + * dealing with some rogue strftime that returns how much space it + * needs instead of 0 when there isn't enough, check that the return + * indicates we have at least one byte of spare space (which will be + * used for the terminating NUL). */ + if (inRANGE(len, 1, bufsize - 1)) { + goto strftime_success; } + + /* There are several possible reasons for a 0 return code for a + * non-empty format, and they are not trivial to tease apart. This + * issue is a known bug in the strftime() API. What we do to cope is + * to assume that the reason is not enough space in the buffer, so + * increase it and try again. */ + bufsize *= 2; + + /* But don't just keep increasing the size indefinitely. Stop when it + * becomes obvious that the reason for failure is something besides not + * enough space. The most likely largest expanding format is %c. On + * khw's Linux box, the maximum result of this is 67 characters, in the + * km_KH locale. If a new script comes along that uses 4 UTF-8 bytes + * per character, and with a similar expansion factor, that would be a + * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1. Some strftime + * implementations allow you to say %1000c to pad to 1000 bytes. This + * shows that it is impossible to implement this without a heuristic + * (that can fail). But it indicates we need to be generous in the + * upper limit before failing. The previous heuristic used was too + * stingy. Since the size doubles per iteration, it doesn't take many + * to reach the limit */ + } while (bufsize < ((1 << 11) + 1) * fmtlen); + + /* Here, strftime() returned 0, and it likely wasn't for lack of space. + * There are two possible reasons: + * + * First is that the result is legitimately 0 length. This can happen + * when the format is precisely "%p". That is the only documented format + * that can have an empty result. */ + if (strEQ(fmt, "%p")) { + Renew(buf, 1, char); + *buf = '\0'; + goto strftime_success; } - return retval; + /* The other reason is that the format string is malformed. Probably it is + * an illegal conversion specifier.) */ + Safefree(buf); + return NULL; + strftime_success: + +#if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME) + + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE); + +#endif + return buf; + +#else + Perl_croak(aTHX_ "panic: no strftime"); + return NULL; #endif } +char * +Perl_my_strftime8_temp(pTHX_ const char *fmt, int sec, int min, int hour, int mday, + int mon, int year, int wday, int yday, int isdst, + utf8ness_t * utf8ness) +{ /* Documented above */ + char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday, + yday, isdst); + + PERL_ARGS_ASSERT_MY_STRFTIME8_TEMP; + + if (utf8ness) { + +#ifdef USE_LOCALE_TIME + *utf8ness = get_locale_string_utf8ness_i(retval, + LOCALE_UTF8NESS_UNKNOWN, + NULL, LC_TIME_INDEX_); +#else + *utf8ness = UTF8NESS_IMMATERIAL; +#endif + + } + + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "fmt=%s, retval=%s", fmt, + ((is_utf8_string((U8 *) retval, 0)) + ? retval + :_byte_dump_string((U8 *) retval, strlen(retval), 0))); + if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d", + (int) *utf8ness); + PerlIO_printf(Perl_debug_log, "\n"); + ); + + return retval; +} + /* * Initialize locale awareness. */ @@ -3150,53 +5011,79 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * error handling. * * Besides some asserts, data structure initialization, and specific - * platform complications, this routine is effectively just two things. + * platform complications, this routine is effectively represented by this + * pseudo-code: * - * a) setlocale(LC_ALL, ""); + * setlocale(LC_ALL, ""); x + * foreach (subcategory) { x + * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x + * } x + * if (platform_so_requires) { + * foreach (subcategory) { + * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)] + * } + * } + * foreach (subcategory) { + * if (needs_special_handling[f(subcategory)] &this_subcat_handler + * } * - * which sets LC_ALL to the values in the current environment. + * This sets all the categories to the values in the current environment, + * saves them temporarily in curlocales[] until they can be handled and/or + * on some platforms saved in a per-thread array PL_curlocales[]. * - * And for each individual category 'foo' whose value we care about: + * f(foo) is a mapping from the opaque system category numbers to small + * non-negative integers used most everywhere in this file as indices into + * arrays (such as curlocales[]) so the program doesn't have to otherwise + * deal with the opaqueness. * - * b) save_foo = setlocale(LC_foo, NULL); handle_foo(save_foo); + * If the platform doesn't have LC_ALL, the lines marked 'x' above are + * effectively replaced by: + * foreach (subcategory) { y + * curlocales[f(subcategory)] = setlocale(subcategory, ""); y + * } y * - * (We don't tend to care about categories like LC_PAPER, for example.) + * The only differences being the lack of an LC_ALL call, and using "" + * instead of NULL in the setlocale calls. * - * But there are complications. On systems without LC_ALL, it emulates - * step a) by looping through all the categories, and doing + * But there are, of course, complications. * - * setlocale(LC_foo, ""); + * it has to deal with if this is an embedded perl, whose locale doesn't + * come from the environment, but has been set up by the caller. This is + * pretty simply handled: the "" in the setlocale calls is not a string + * constant, but a variable which is set to NULL in the embedded case. * - * on each. + * But the major complication is handling failure and doing fallback. All + * the code marked 'x' or 'y' above is actually enclosed in an outer loop, + * using the array trial_locales[]. On entry, trial_locales[] is + * initialized to just one entry, containing the NULL or "" locale argument + * shown above. If, as is almost always the case, everything works, it + * exits after just the one iteration, going on to the next step. * - * And it has to deal with if this is an embedded perl, whose locale - * doesn't come from the environment, but has been set up by the caller. - * This is pretty simply handled: the "" in the setlocale calls is not a - * string constant, but a variable which is set to NULL in the embedded - * case. - * - * But the major complication is handling failure and doing fallback. - * There is an array, trial_locales, the elements of which are looped over - * until the locale is successfully set. The array is initialized with - * just one element, for - * setlocale(LC_ALL, $NULL_or_empty) - * If that works, as it almost always does, there's no more elements and - * the loop iterates just the once. Otherwise elements are added for each - * of the environment variables that POSIX dictates should control the - * program, in priority order, with a final one being "C". The loop is - * repeated until the first one succeeds. If all fail, we limp along with - * whatever state we got to. If there is no LC_ALL, an inner loop is run - * through all categories (making things look complex). + * But if there is a failure, the code tries its best to honor the + * environment as much as possible. It self-modifies trial_locales[] to + * have more elements, one for each of the POSIX-specified settings from + * the environment, such as LANG, ending in the ultimate fallback, the C + * locale. Thus if there is something bogus with a higher priority + * environment variable, it will try with the next highest, until something + * works. If everything fails, it limps along with whatever state it got + * to. * * A further complication is that Windows has an additional fallback, the * user-default ANSI code page obtained from the operating system. This is * added as yet another loop iteration, just before the final "C" * - * On Ultrix, the locale MUST come from the environment, so there is - * preliminary code to set it. I (khw) am not sure that it is necessary, - * and that this couldn't be folded into the loop, but barring any real - * platforms to test on, it's staying as-is - */ + * A slight complication is that in embedded Perls, the locale may already + * be set-up, and we don't want to get it from the normal environment + * variables. This is handled by having a special environment variable + * indicate we're in this situation. We simply set setlocale's 2nd + * parameter to be a NULL instead of "". That indicates to setlocale that + * it is not to change anything, but to return the current value, + * effectively initializing perl's db to what the locale already is. + * + * We play the same trick with NULL if a LC_ALL succeeds. We call + * setlocale() on the individual categories with NULL to get their existing + * values for our db, instead of trying to change them. + * */ int ok = 1; @@ -3215,7 +5102,13 @@ Perl_init_i18nl10n(pTHX_ int printwarn) const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT")) ? NULL : ""; - const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */ + typedef struct trial_locales_struct_s { + const char* trial_locale; + const char* fallback_desc; + const char* fallback_name; + } trial_locales_struct; + /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */ + trial_locales_struct trial_locales[5]; unsigned int trial_locales_count; const char * const lc_all = PerlEnv_getenv("LC_ALL"); const char * const lang = PerlEnv_getenv("LANG"); @@ -3233,165 +5126,140 @@ Perl_init_i18nl10n(pTHX_ int printwarn) *bad_lang_use_once && strNE("0", bad_lang_use_once))))); - /* setlocale() return vals; not copied so must be looked at immediately */ - const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; - /* current locale for given category; should have been copied so aren't * volatile */ const char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; -# ifdef WIN32 - - /* In some systems you can find out the system default locale - * and use that as the fallback locale. */ -# define SYSTEM_DEFAULT_LOCALE -# endif -# ifdef SYSTEM_DEFAULT_LOCALE - - const char *system_default_locale = NULL; - -# endif - # ifndef DEBUGGING # define DEBUG_LOCALE_INIT(a,b,c) # else DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); -# define DEBUG_LOCALE_INIT(category, locale, result) \ - STMT_START { \ - if (debug_initialization) { \ - PerlIO_printf(Perl_debug_log, \ - "%s:%d: %s\n", \ - __FILE__, __LINE__, \ - setlocale_debug_string(category, \ - locale, \ - result)); \ - } \ - } STMT_END +# define DEBUG_LOCALE_INIT(cat_index, locale, result) \ + DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \ + setlocale_debug_string_i(cat_index, locale, result))); /* Make sure the parallel arrays are properly set up */ # ifdef USE_LOCALE_NUMERIC - assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); - assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); + assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC); + assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK); + assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK); # endif # endif # ifdef USE_LOCALE_CTYPE - assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); - assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); + assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE); + assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK); + assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK); # endif # endif # ifdef USE_LOCALE_COLLATE - assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); - assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); + assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE); + assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK); + assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK); # endif # endif # ifdef USE_LOCALE_TIME - assert(categories[LC_TIME_INDEX] == LC_TIME); - assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); + assert(categories[LC_TIME_INDEX_] == LC_TIME); + assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK); + assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK); # endif # endif # ifdef USE_LOCALE_MESSAGES - assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); - assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); + assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES); + assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK); + assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK); # endif # endif # ifdef USE_LOCALE_MONETARY - assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); - assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); + assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY); + assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK); + assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK); # endif # endif # ifdef USE_LOCALE_ADDRESS - assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); - assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); + assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS); + assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK); + assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK); # endif # endif # ifdef USE_LOCALE_IDENTIFICATION - assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); - assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); + assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION); + assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK); + assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK); # endif # endif # ifdef USE_LOCALE_MEASUREMENT - assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); - assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); + assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT); + assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK); + assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK); # endif # endif # ifdef USE_LOCALE_PAPER - assert(categories[LC_PAPER_INDEX] == LC_PAPER); - assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); + assert(categories[LC_PAPER_INDEX_] == LC_PAPER); + assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK); + assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK); # endif # endif # ifdef USE_LOCALE_TELEPHONE - assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); - assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); + assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE); + assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE")); +# ifdef USE_POSIX_2008_LOCALE + assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK); +# endif +# endif +# ifdef USE_LOCALE_NAME + assert(categories[LC_NAME_INDEX_] == LC_NAME); + assert(strEQ(category_names[LC_NAME_INDEX_], "LC_NAME")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK); + assert(category_masks[LC_NAME_INDEX_] == LC_NAME_MASK); # endif # endif # ifdef USE_LOCALE_SYNTAX - assert(categories[LC_SYNTAX_INDEX] == LC_SYNTAX); - assert(strEQ(category_names[LC_SYNTAX_INDEX], "LC_SYNTAX")); + assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX); + assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_SYNTAX_INDEX] == LC_SYNTAX_MASK); + assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK); # endif # endif # ifdef USE_LOCALE_TOD - assert(categories[LC_TOD_INDEX] == LC_TOD); - assert(strEQ(category_names[LC_TOD_INDEX], "LC_TOD")); + assert(categories[LC_TOD_INDEX_] == LC_TOD); + assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD")); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_TOD_INDEX] == LC_TOD_MASK); + assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK); # endif # endif # ifdef LC_ALL - assert(categories[LC_ALL_INDEX] == LC_ALL); - assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); - assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); + assert(categories[LC_ALL_INDEX_] == LC_ALL); + assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL")); + STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_); # ifdef USE_POSIX_2008_LOCALE - assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK); + assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK); # endif # endif # endif /* DEBUGGING */ /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for * why these particular incantations are used. */ -#ifdef HAS_MBRLEN +# ifdef HAS_MBRLEN memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps)); -#endif -#ifdef HAS_MBRTOWC +# endif +# ifdef HAS_MBRTOWC memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps)); -#endif -#ifdef HAS_WCTOMBR +# endif +# ifdef HAS_WCTOMBR wcrtomb(NULL, L'\0', &PL_wcrtomb_ps); -#endif - - /* Initialize the cache of the program's UTF-8ness for the always known - * locales C and POSIX */ - my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, - sizeof(PL_locale_utf8ness)); - - /* See https://github.com/Perl/perl5/issues/17824 */ - Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *); - +# endif # ifdef USE_THREAD_SAFE_LOCALE # ifdef WIN32 @@ -3401,123 +5269,87 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # endif # ifdef USE_POSIX_2008_LOCALE - PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); if (! PL_C_locale_obj) { - Perl_croak_nocontext( - "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno); + PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); } - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj); + if (! PL_C_locale_obj) { + locale_panic_(Perl_form(aTHX_ + "Cannot create POSIX 2008 C locale object")); } -# endif + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n", + PL_C_locale_obj)); -# ifdef USE_LOCALE_NUMERIC + /* Switch to using the POSIX 2008 interface now. This would happen below + * anyway, but deferring it can lead to leaks of memory that would also get + * malloc'd in the interim */ + uselocale(PL_C_locale_obj); - PL_numeric_radix_sv = newSVpvs("."); +# ifdef USE_LOCALE_NUMERIC + PL_underlying_numeric_obj = duplocale(PL_C_locale_obj); + +# endif # endif +# ifdef USE_LOCALE_NUMERIC + + PL_numeric_radix_sv = newSV(1); + PL_underlying_radix_sv = newSV(1); + Newxz(PL_numeric_name, 1, char); /* Single NUL character */ + new_numeric("C", false); -# if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE) +# endif +# ifdef USE_LOCALE_COLLATE - /* Initialize our records. If we have POSIX 2008, we have LC_ALL */ - do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL)); + Newxz(PL_collation_name, 1, char); + new_collate("C", false); # endif -# ifdef LOCALE_ENVIRON_REQUIRED +# ifdef USE_LOCALE_CTYPE - /* - * Ultrix setlocale(..., "") fails if there are no environment - * variables from which to get a locale name. - */ + Newxz(PL_ctype_name, 1, char); + new_ctype("C", false); -# ifndef LC_ALL -# error Ultrix without LC_ALL not implemented -# else +# endif +# ifdef USE_PL_CURLOCALES - { - bool done = FALSE; - if (lang) { - sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init); - DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]); - if (sl_result[LC_ALL_INDEX]) - done = TRUE; - else - setlocale_failure = TRUE; - } - if (! setlocale_failure) { - const char * locale_param; - for (i = 0; i < LC_ALL_INDEX; i++) { - locale_param = (! done && (lang || PerlEnv_getenv(category_names[i]))) - ? setlocale_init - : NULL; - sl_result[i] = do_setlocale_r(categories[i], locale_param); - if (! sl_result[i]) { - setlocale_failure = TRUE; - } - DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]); - } - } + /* Initialize our records. */ + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL), + RECALCULATE_LC_ALL_ON_FINAL_INTERATION, + __LINE__); } -# endif /* LC_ALL */ -# endif /* LOCALE_ENVIRON_REQUIRED */ +# endif /* We try each locale in the list until we get one that works, or exhaust * the list. Normally the loop is executed just once. But if setting the * locale fails, inside the loop we add fallback trials to the array and so * will execute the loop multiple times */ - trial_locales[0] = setlocale_init; + trial_locales_struct ts = { + .trial_locale = setlocale_init, + .fallback_desc = NULL, + .fallback_name = NULL, + }; + trial_locales[0] = ts; trial_locales_count = 1; - for (i= 0; i < trial_locales_count; i++) { - const char * trial_locale = trial_locales[i]; - - if (i > 0) { - - /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED - * when i==0, but I (khw) don't think that behavior makes much - * sense */ - setlocale_failure = FALSE; - -# ifdef SYSTEM_DEFAULT_LOCALE -# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */ - - /* On Windows machines, an entry of "" after the 0th means to use - * the system default locale, which we now proceed to get. */ - if (strEQ(trial_locale, "")) { - unsigned int j; - - /* Note that this may change the locale, but we are going to do - * that anyway just below */ - system_default_locale = do_setlocale_c(LC_ALL, ""); - DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale); - - /* Skip if invalid or if it's already on the list of locales to - * try */ - if (! system_default_locale) { - goto next_iteration; - } - for (j = 0; j < trial_locales_count; j++) { - if (strEQ(system_default_locale, trial_locales[j])) { - goto next_iteration; - } - } - - trial_locale = system_default_locale; - } -# else -# error SYSTEM_DEFAULT_LOCALE only implemented for Win32 -# endif -# endif /* SYSTEM_DEFAULT_LOCALE */ + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + curlocales[i] = NULL; + } - } /* For i > 0 */ + for (i= 0; i < trial_locales_count; i++) { + const char * trial_locale = trial_locales[i].trial_locale; + setlocale_failure = FALSE; # ifdef LC_ALL - sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale); - DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]); - if (! sl_result[LC_ALL_INDEX]) { + /* setlocale() return vals; not copied so must be looked at + * immediately. */ + const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; + sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale); + DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]); + if (! sl_result[LC_ALL_INDEX_]) { setlocale_failure = TRUE; } else { @@ -3536,12 +5368,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn) if (! setlocale_failure) { unsigned int j; for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { - curlocales[j] - = savepv(do_setlocale_r(categories[j], trial_locale)); + curlocales[j] = stdized_setlocale(categories[j], trial_locale); if (! curlocales[j]) { setlocale_failure = TRUE; } - DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]); + curlocales[j] = savepv(curlocales[j]); + DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]); } if (LIKELY(! setlocale_failure)) { /* All succeeded */ @@ -3565,14 +5397,11 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # else /* !LC_ALL */ PerlIO_printf(Perl_error_log, - "perl: warning: Setting locale failed for the categories:\n\t"); + "perl: warning: Setting locale failed for the categories:\n"); for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { if (! curlocales[j]) { - PerlIO_printf(Perl_error_log, category_names[j]); - } - else { - Safefree(curlocales[j]); + PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]); } } @@ -3656,21 +5485,35 @@ Perl_init_i18nl10n(pTHX_ int printwarn) * to change the behavior. */ if (lc_all) { for (j = 0; j < trial_locales_count; j++) { - if (strEQ(lc_all, trial_locales[j])) { + if (strEQ(lc_all, trial_locales[j].trial_locale)) { goto done_lc_all; } } - trial_locales[trial_locales_count++] = lc_all; + trial_locales_struct ts = { + .trial_locale = lc_all, + .fallback_desc = (strEQ(lc_all, "C") + ? "the standard locale" + : "a fallback locale"), + .fallback_name = lc_all, + }; + trial_locales[trial_locales_count++] = ts; } done_lc_all: if (lang) { for (j = 0; j < trial_locales_count; j++) { - if (strEQ(lang, trial_locales[j])) { + if (strEQ(lang, trial_locales[j].trial_locale)) { goto done_lang; } } - trial_locales[trial_locales_count++] = lang; + trial_locales_struct ts = { + .trial_locale = lang, + .fallback_desc = (strEQ(lang, "C") + ? "the standard locale" + : "a fallback locale"), + .fallback_name = lang, + }; + trial_locales[trial_locales_count++] = ts; } done_lang: @@ -3679,29 +5522,65 @@ Perl_init_i18nl10n(pTHX_ int printwarn) /* For Windows, we also try the system default locale before "C". * (If there exists a Windows without LC_ALL we skip this because * it gets too complicated. For those, the "C" is the next - * fallback possibility). The "" is the same as the 0th element of - * the array, but the code at the loop above knows to treat it - * differently when not the 0th */ - trial_locales[trial_locales_count++] = ""; + * fallback possibility). */ + { + /* Note that this may change the locale, but we are going to do + * that anyway. + * + * Our normal Windows setlocale() implementation ignores the + * system default locale to make things work like POSIX. This + * is the only place where we want to consider it, so have to + * use wrap_wsetlocale(). */ + const char *system_default_locale = + stdize_locale(LC_ALL, + wrap_wsetlocale(LC_ALL, ""), + &PL_stdize_locale_buf, + &PL_stdize_locale_bufsize, + __LINE__); + DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale); + + /* Skip if invalid or if it's already on the list of locales to + * try */ + if (! system_default_locale) { + goto done_system_default; + } + for (j = 0; j < trial_locales_count; j++) { + if (strEQ(system_default_locale, trial_locales[j].trial_locale)) { + goto done_system_default; + } + } + + trial_locales_struct ts = { + .trial_locale = system_default_locale, + .fallback_desc = (strEQ(system_default_locale, "C") + ? "the standard locale" + : "the system default locale"), + .fallback_name = system_default_locale, + }; + trial_locales[trial_locales_count++] = ts; + } + done_system_default: # endif for (j = 0; j < trial_locales_count; j++) { - if (strEQ("C", trial_locales[j])) { + if (strEQ("C", trial_locales[j].trial_locale)) { goto done_C; } } - trial_locales[trial_locales_count++] = "C"; - + { + /* new scope to avoid C++ complaining about + initialization being bypassed by goto. + */ + trial_locales_struct ts = { + .trial_locale = "C", + .fallback_desc = "the standard locale", + .fallback_name = "C", + }; + trial_locales[trial_locales_count++] = ts; + } done_C: ; } /* end of first time through the loop */ - -# ifdef WIN32 - - next_iteration: ; - -# endif - } /* end of looping through the trial locales */ if (ok < 1) { /* If we tried to fallback */ @@ -3723,34 +5602,15 @@ Perl_init_i18nl10n(pTHX_ int printwarn) for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { Safefree(curlocales[j]); - curlocales[j] = savepv(do_setlocale_r(categories[j], NULL)); - DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]); + curlocales[j] = savepv(stdized_setlocale(categories[j], NULL)); + DEBUG_LOCALE_INIT(j, NULL, curlocales[j]); } } if (locwarn) { - const char * description; - const char * name = ""; - if (strEQ(trial_locales[i], "C")) { - description = "the standard locale"; - name = "C"; - } - -# ifdef SYSTEM_DEFAULT_LOCALE - - else if (strEQ(trial_locales[i], "")) { - description = "the system default locale"; - if (system_default_locale) { - name = system_default_locale; - } - } - -# endif /* SYSTEM_DEFAULT_LOCALE */ + const char * description = trial_locales[i].fallback_desc; + const char * name = trial_locales[i].fallback_name; - else { - description = "a fallback locale"; - name = trial_locales[i]; - } if (name && strNE(name, "")) { PerlIO_printf(Perl_error_log, "perl: warning: %s %s (\"%s\").\n", msg, description, name); @@ -3762,43 +5622,23 @@ Perl_init_i18nl10n(pTHX_ int printwarn) } } /* End of tried to fallback */ - /* Done with finding the locales; update our records */ - -# ifdef USE_LOCALE_CTYPE +# ifdef USE_POSIX_2008_LOCALE - new_ctype(curlocales[LC_CTYPE_INDEX]); + /* The stdized setlocales haven't affected the P2008 locales. Initialize + * them now, calculating LC_ALL only on the final go round, when all have + * been set. */ + for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + (void) emulate_setlocale_i(i, curlocales[i], + RECALCULATE_LC_ALL_ON_FINAL_INTERATION, + __LINE__); + } # endif -# ifdef USE_LOCALE_COLLATE - - new_collate(curlocales[LC_COLLATE_INDEX]); -# endif -# ifdef USE_LOCALE_NUMERIC - - new_numeric(curlocales[LC_NUMERIC_INDEX]); - -# endif + /* Done with finding the locales; update the auxiliary records */ + new_LC_ALL(NULL, false); for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { - -# if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) - - /* This caches whether each category's locale is UTF-8 or not. This - * may involve changing the locale. It is ok to do this at - * initialization time before any threads have started, but not later - * unless thread-safe operations are used. - * Caching means that if the program heeds our dictate not to change - * locales in threaded applications, this data will remain valid, and - * it may get queried without having to change locales. If the - * environment is such that all categories have the same locale, this - * isn't needed, as the code will not change the locale; but this - * handles the uncommon case where the environment has disparate - * locales for the categories */ - (void) _is_cur_LC_category_utf8(categories[i]); - -# endif - Safefree(curlocales[i]); } @@ -3824,20 +5664,174 @@ Perl_init_i18nl10n(pTHX_ int printwarn) # endif #endif /* USE_LOCALE */ -#ifdef DEBUGGING /* So won't continue to output stuff */ DEBUG_INITIALIZATION_set(FALSE); -#endif - return ok; } #ifdef USE_LOCALE_COLLATE +STATIC void +S_compute_collxfrm_coefficients(pTHX) +{ + + /* A locale collation definition includes primary, secondary, tertiary, + * etc. weights for each character. To sort, the primary weights are used, + * and only if they compare equal, then the secondary weights are used, and + * only if they compare equal, then the tertiary, etc. + * + * strxfrm() works by taking the input string, say ABC, and creating an + * output transformed string consisting of first the primary weights, + * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary, + * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters may not have + * weights at every level. In our example, let's say B doesn't have a + * tertiary weight, and A doesn't have a secondary weight. The constructed + * string is then going to be + * A¹B¹C¹ B²C² A³C³ .... + * This has the desired effect that strcmp() will look at the secondary or + * tertiary weights only if the strings compare equal at all higher + * priority weights. The spaces shown here, like in + * "A¹B¹C¹ A²B²C² " + * are not just for readability. In the general case, these must actually + * be bytes, which we will call here 'separator weights'; and they must be + * smaller than any other weight value, but since these are C strings, only + * the terminating one can be a NUL (some implementations may include a + * non-NUL separator weight just before the NUL). Implementations tend to + * reserve 01 for the separator weights. They are needed so that a shorter + * string's secondary weights won't be misconstrued as primary weights of a + * longer string, etc. By making them smaller than any other weight, the + * shorter string will sort first. (Actually, if all secondary weights are + * smaller than all primary ones, there is no need for a separator weight + * between those two levels, etc.) + * + * The length of the transformed string is roughly a linear function of the + * input string. It's not exactly linear because some characters don't + * have weights at all levels. When we call strxfrm() we have to allocate + * some memory to hold the transformed string. The calculations below try + * to find coefficients 'm' and 'b' for this locale so that m*x + b equals + * how much space we need, given the size of the input string in 'x'. If + * we calculate too small, we increase the size as needed, and call + * strxfrm() again, but it is better to get it right the first time to + * avoid wasted expensive string transformations. + * + * We use the string below to find how long the transformation of it is. + * Almost all locales are supersets of ASCII, or at least the ASCII + * letters. We use all of them, half upper half lower, because if we used + * fewer, we might hit just the ones that are outliers in a particular + * locale. Most of the strings being collated will contain a preponderance + * of letters, and even if they are above-ASCII, they are likely to have + * the same number of weight levels as the ASCII ones. It turns out that + * digits tend to have fewer levels, and some punctuation has more, but + * those are relatively sparse in text, and khw believes this gives a + * reasonable result, but it could be changed if experience so dictates. */ + const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz"; + char * x_longer; /* Transformed 'longer' */ + Size_t x_len_longer; /* Length of 'x_longer' */ + + char * x_shorter; /* We also transform a substring of 'longer' */ + Size_t x_len_shorter; + + PL_in_utf8_COLLATE_locale = (PL_collation_standard) + ? 0 + : is_locale_utf8(PL_collation_name); + PL_strxfrm_NUL_replacement = '\0'; + PL_strxfrm_max_cp = 0; + + /* mem_collxfrm_() is used get the transformation (though here we are + * interested only in its length). It is used because it has the + * intelligence to handle all cases, but to work, it needs some values of + * 'm' and 'b' to get it started. For the purposes of this calculation we + * use a very conservative estimate of 'm' and 'b'. This assumes a weight + * can be multiple bytes, enough to hold any UV on the platform, and there + * are 5 levels, 4 weight bytes, and a trailing NUL. */ + PL_collxfrm_base = 5; + PL_collxfrm_mult = 5 * sizeof(UV); + + /* Find out how long the transformation really is */ + x_longer = mem_collxfrm_(longer, + sizeof(longer) - 1, + &x_len_longer, + + /* We avoid converting to UTF-8 in the called + * function by telling it the string is in UTF-8 + * if the locale is a UTF-8 one. Since the string + * passed here is invariant under UTF-8, we can + * claim it's UTF-8 even though it isn't. */ + PL_in_utf8_COLLATE_locale); + Safefree(x_longer); + + /* Find out how long the transformation of a substring of 'longer' is. + * Together the lengths of these transformations are sufficient to + * calculate 'm' and 'b'. The substring is all of 'longer' except the + * first character. This minimizes the chances of being swayed by outliers + * */ + x_shorter = mem_collxfrm_(longer + 1, + sizeof(longer) - 2, + &x_len_shorter, + PL_in_utf8_COLLATE_locale); + Safefree(x_shorter); + + /* If the results are nonsensical for this simple test, the whole locale + * definition is suspect. Mark it so that locale collation is not active + * at all for it. XXX Should we warn? */ + if ( x_len_shorter == 0 + || x_len_longer == 0 + || x_len_shorter >= x_len_longer) + { + PL_collxfrm_mult = 0; + PL_collxfrm_base = 1; + DEBUG_L(PerlIO_printf(Perl_debug_log, + "Disabling locale collation for LC_COLLATE='%s';" + " length for shorter sample=%zu; longer=%zu\n", + PL_collation_name, x_len_shorter, x_len_longer)); + } + else { + SSize_t base; /* Temporary */ + + /* We have both: m * strlen(longer) + b = x_len_longer + * m * strlen(shorter) + b = x_len_shorter; + * subtracting yields: + * m * (strlen(longer) - strlen(shorter)) + * = x_len_longer - x_len_shorter + * But we have set things up so that 'shorter' is 1 byte smaller than + * 'longer'. Hence: + * m = x_len_longer - x_len_shorter + * + * But if something went wrong, make sure the multiplier is at least 1. + */ + if (x_len_longer > x_len_shorter) { + PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; + } + else { + PL_collxfrm_mult = 1; + } + + /* mx + b = len + * so: b = len - mx + * but in case something has gone wrong, make sure it is non-negative + * */ + base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); + if (base < 0) { + base = 0; + } + + /* Add 1 for the trailing NUL */ + PL_collxfrm_base = base + 1; + } + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "?UTF-8 locale=%d; x_len_shorter=%zu, " + "x_len_longer=%zu," + " collate multipler=%zu, collate base=%zu\n", + PL_in_utf8_COLLATE_locale, + x_len_shorter, x_len_longer, + PL_collxfrm_mult, PL_collxfrm_base)); +} + char * -Perl__mem_collxfrm(pTHX_ const char *input_string, +Perl_mem_collxfrm_(pTHX_ const char *input_string, STRLEN len, /* Length of 'input_string' */ STRLEN *xlen, /* Set to length of returned string (not including the collation index @@ -3845,15 +5839,18 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, bool utf8 /* Is the input in UTF-8? */ ) { - - /* _mem_collxfrm() is a bit like strxfrm() but with two important - * differences. First, it handles embedded NULs. Second, it allocates a bit - * more memory than needed for the transformed data itself. The real - * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to - * the length of that, and doesn't include the collation index size. + /* mem_collxfrm_() is like strxfrm() but with two important differences. + * First, it handles embedded NULs. Second, it allocates a bit more memory + * than needed for the transformed data itself. The real transformed data + * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that, + * and doesn't include the collation index size. + * + * It is the caller's responsibility to eventually free the memory returned + * by this function. + * * Please see sv_collxfrm() to see how this is used. */ -#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix) +# define COLLXFRM_HDR_LEN sizeof(PL_collation_ix) char * s = (char *) input_string; STRLEN s_strlen = strlen(input_string); @@ -3862,16 +5859,29 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, STRLEN length_in_chars; bool first_time = TRUE; /* Cleared after first loop iteration */ - PERL_ARGS_ASSERT__MEM_COLLXFRM; +# ifdef USE_LOCALE_CTYPE + const char * orig_CTYPE_locale = NULL; +# endif + +# if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L + locale_t constructed_locale = (locale_t) 0; +# endif + + PERL_ARGS_ASSERT_MEM_COLLXFRM_; /* Must be NUL-terminated */ assert(*(input_string + len) == '\0'); - /* If this locale has defective collation, skip */ - if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: locale's collation is defective\n")); - goto bad; + if (PL_collxfrm_mult == 0) { /* unknown or bad */ + if (PL_collxfrm_base != 0) { /* bad collation => skip */ + DEBUG_L(PerlIO_printf(Perl_debug_log, + "mem_collxfrm_: locale's collation is defective\n")); + goto bad; + } + + /* (mult, base) == (0,0) means we need to calculate mult and base + * before proceeding */ + S_compute_collxfrm_coefficients(aTHX); } /* Replace any embedded NULs with the control that sorts before any others. @@ -3912,6 +5922,16 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, try_non_controls < 2; try_non_controls++) { + +# ifdef USE_LOCALE_CTYPE + + /* In this case we use isCNTRL_LC() below, which relies on + * LC_CTYPE, so that must be switched to correspond with the + * LC_COLLATE locale */ + if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) { + orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); + } +# endif /* Look through all legal code points (NUL isn't) */ for (j = 1; j < 256; j++) { char * x; /* j's xfrm plus collation index */ @@ -3932,7 +5952,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, cur_source[0] = (char) j; /* Then transform it */ - x = _mem_collxfrm(cur_source, trial_len, &x_len, + x = mem_collxfrm_(cur_source, trial_len, &x_len, 0 /* The string is not in UTF-8 */); /* Ignore any character that didn't successfully transform. @@ -3956,6 +5976,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } } /* end of loop through all 255 characters */ +# ifdef USE_LOCALE_CTYPE + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); +# endif + /* Stop looking if found */ if (cur_min_x) { break; @@ -3965,18 +5989,18 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, * work in the locale, repeat the loop, looking for any * character that works */ DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: No control worked. Trying non-controls\n")); + "mem_collxfrm_: No control worked. Trying non-controls\n")); } /* End of loop to try first the controls, then any char */ if (! cur_min_x) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Couldn't find any character to replace" + "mem_collxfrm_: Couldn't find any character to replace" " embedded NULs in locale %s with", PL_collation_name)); goto bad; } DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Replacing embedded NULs in locale %s with " + "mem_collxfrm_: Replacing embedded NULs in locale %s with " "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement)); Safefree(cur_min_x); @@ -4089,7 +6113,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, cur_source[0] = (char) j; /* Then transform it */ - x = _mem_collxfrm(cur_source, 1, &x_len, FALSE); + x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); /* If something went wrong (which it shouldn't), just * ignore this code point */ @@ -4114,14 +6138,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, if (! cur_max_x) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Couldn't find any character to" + "mem_collxfrm_: Couldn't find any character to" " replace above-Latin1 chars in locale %s with", PL_collation_name)); goto bad; } DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: highest 1-byte collating character" + "mem_collxfrm_: highest 1-byte collating character" " in locale %s is 0x%02X\n", PL_collation_name, PL_strxfrm_max_cp)); @@ -4181,27 +6205,66 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, Newx(xbuf, xAlloc, char); if (UNLIKELY(! xbuf)) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc)); + "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc)); goto bad; } /* Store the collation id */ *(U32*)xbuf = PL_collation_ix; +# if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L +# ifdef USE_LOCALE_CTYPE + + constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name, + duplocale(use_curlocale_scratch())); +# else + + constructed_locale = duplocale(use_curlocale_scratch()); + +# endif +# define my_strxfrm(dest, src, n) strxfrm_l(dest, src, n, \ + constructed_locale) +# define CLEANUP_STRXFRM \ + STMT_START { \ + if (constructed_locale != (locale_t) 0) \ + freelocale(constructed_locale); \ + } STMT_END +# else +# define my_strxfrm(dest, src, n) strxfrm(dest, src, n) +# ifdef USE_LOCALE_CTYPE + + orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); + +# define CLEANUP_STRXFRM \ + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale) +# else +# define CLEANUP_STRXFRM NOOP +# endif +# endif + /* Then the transformation of the input. We loop until successful, or we * give up */ for (;;) { - *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN); + errno = 0; + *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN); /* If the transformed string occupies less space than we told strxfrm() - * was available, it means it successfully transformed the whole - * string. */ + * was available, it means it transformed the whole string. */ if (*xlen < xAlloc - COLLXFRM_HDR_LEN) { - /* Some systems include a trailing NUL in the returned length. - * Ignore it, using a loop in case multiple trailing NULs are - * returned. */ + /* But there still could have been a problem */ + if (errno != 0) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n", + PL_collation_name, errno, + _byte_dump_string((U8 *) s, len, 0))); + goto bad; + } + + /* Here, the transformation was successful. Some systems include a + * trailing NUL in the returned length. Ignore it, using a loop in + * case multiple trailing NULs are returned. */ while ( (*xlen) > 0 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0') { @@ -4223,9 +6286,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, : PL_collxfrm_mult; DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s: %d: initial size of %zu bytes for a length " + "initial size of %zu bytes for a length " "%zu string was insufficient, %zu needed\n", - __FILE__, __LINE__, computed_guess, length_in_chars, needed)); /* If slope increased, use it, but discard this result for @@ -4249,9 +6311,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s: %d: slope is now %zu; was %zu, base " + "slope is now %zu; was %zu, base " "is now %zu; was %zu\n", - __FILE__, __LINE__, PL_collxfrm_mult, old_m, PL_collxfrm_base, old_b)); } @@ -4260,9 +6321,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, - computed_guess + PL_collxfrm_base; DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s: %d: base is now %zu; was %zu\n", - __FILE__, __LINE__, - new_b, PL_collxfrm_base)); + "base is now %zu; was %zu\n", new_b, PL_collxfrm_base)); PL_collxfrm_base = new_b; } } @@ -4272,7 +6331,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, if (UNLIKELY(*xlen >= PERL_INT_MAX)) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n", + "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n", *xlen, PERL_INT_MAX)); goto bad; } @@ -4298,45 +6357,29 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, xAlloc += (xAlloc / 4) + 1; PL_strxfrm_is_behaved = FALSE; -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, - "_mem_collxfrm required more space than previously calculated" - " for locale %s, trying again with new guess=%zu+%zu\n", + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "mem_collxfrm_ required more space than previously" + " calculated for locale %s, trying again with new" + " guess=%zu+%zu\n", PL_collation_name, COLLXFRM_HDR_LEN, - xAlloc - COLLXFRM_HDR_LEN); - } - -# endif - + xAlloc - COLLXFRM_HDR_LEN)); } Renew(xbuf, xAlloc, char); if (UNLIKELY(! xbuf)) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc)); + "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc)); goto bad; } first_time = FALSE; } + CLEANUP_STRXFRM; -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST || debug_initialization) { + DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8)); - print_collxfrm_input_and_return(s, s + len, xlen, utf8); - PerlIO_printf(Perl_debug_log, "Its xfrm is:"); - PerlIO_printf(Perl_debug_log, "%s\n", - _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, - *xlen, 1)); - } - -# endif - - /* Free up unneeded space; retain ehough for trailing NUL */ + /* Free up unneeded space; retain enough for trailing NUL */ Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); if (s != input_string) { @@ -4347,13 +6390,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, bad: -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST || debug_initialization) { - print_collxfrm_input_and_return(s, s + len, NULL, utf8); - } - -# endif + CLEANUP_STRXFRM; + DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); Safefree(xbuf); if (s != input_string) { @@ -4368,45 +6406,96 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, STATIC void S_print_collxfrm_input_and_return(pTHX_ - const char * const s, - const char * const e, - const STRLEN * const xlen, + const char * s, + const char * e, + const char * xbuf, + const STRLEN xlen, const bool is_utf8) { PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN; - PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ", - (UV)PL_collation_ix); - if (xlen) { - PerlIO_printf(Perl_debug_log, "%zu", *xlen); - } - else { - PerlIO_printf(Perl_debug_log, "NULL"); + PerlIO_printf(Perl_debug_log, + "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n" + " input=%s\n return=%s\n return len=%zu\n", + (UV) PL_collation_ix, PL_collation_name, + get_displayable_string(s, e, is_utf8), + ((xbuf == NULL) + ? "(null)" + : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)), + xlen); +} + +# endif /* DEBUGGING */ + +SV * +Perl_strxfrm(pTHX_ SV * src) +{ + PERL_ARGS_ASSERT_STRXFRM; + + /* For use by POSIX::strxfrm(). If they differ, toggle LC_CTYPE to + * LC_COLLATE to avoid potential mojibake. + * + * If we can't calculate a collation, 'src' is instead returned, so that + * future comparisons will be by code point order */ + +# ifdef USE_LOCALE_CTYPE + + const char * orig_ctype = toggle_locale_c(LC_CTYPE, + querylocale_c(LC_COLLATE)); +# endif + + SV * dst = src; + STRLEN dstlen; + STRLEN srclen; + const char *p = SvPV_const(src,srclen); + const U32 utf8_flag = SvUTF8(src); + char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag)); + + assert(utf8_flag == 0 || utf8_flag == SVf_UTF8); + + if (d != NULL) { + assert(dstlen > 0); + dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN, + dstlen, SVs_TEMP|utf8_flag); + Safefree(d); } - PerlIO_printf(Perl_debug_log, " for locale '%s', string='", - PL_collation_name); - print_bytes_for_locale(s, e, is_utf8); - PerlIO_printf(Perl_debug_log, "'\n"); +# ifdef USE_LOCALE_CTYPE + + restore_toggled_locale_c(LC_CTYPE, orig_ctype); + +# endif + + return dst; } -# endif /* DEBUGGING */ #endif /* USE_LOCALE_COLLATE */ -#ifdef USE_LOCALE -# ifdef DEBUGGING +#if defined(DEBUGGING) || defined(USE_POSIX_2008_LOCALE) -STATIC void -S_print_bytes_for_locale(pTHX_ - const char * const s, - const char * const e, - const bool is_utf8) +STATIC const char * +S_get_displayable_string(pTHX_ + const char * const s, + const char * const e, + const bool is_utf8) { + PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING; + const char * t = s; bool prev_was_printable = TRUE; bool first_time = TRUE; + char * ret; + + if (e <= s) { + return ""; + } - PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE; + /* Worst case scenario: All are non-printable so have a blank between each. + * If UTF-8, all are the largest possible code point; otherwise all are a + * single byte. '(2 + 1)' is from each byte takes 2 characters to + * display, and a blank (or NUL for the final one) after it */ + Newxz(ret, (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1), char); + SAVEFREEPV(ret); while (t < e) { UV cp = (is_utf8) @@ -4414,932 +6503,453 @@ S_print_bytes_for_locale(pTHX_ : * (U8 *) t; if (isPRINT(cp)) { if (! prev_was_printable) { - PerlIO_printf(Perl_debug_log, " "); + my_strlcat(ret, " ", sizeof(ret)); } - PerlIO_printf(Perl_debug_log, "%c", (U8) cp); + + /* Escape these to avoid any ambiguity */ + if (cp == ' ' || cp == '\\') { + my_strlcat(ret, "\\", sizeof(ret)); + } + my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), sizeof(ret)); prev_was_printable = TRUE; } else { if (! first_time) { - PerlIO_printf(Perl_debug_log, " "); + my_strlcat(ret, " ", sizeof(ret)); } - PerlIO_printf(Perl_debug_log, "%02" UVXf, cp); + my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), sizeof(ret)); prev_was_printable = FALSE; } t += (is_utf8) ? UTF8SKIP(t) : 1; first_time = FALSE; } + + return ret; } -# endif /* #ifdef DEBUGGING */ +#endif +#ifdef USE_LOCALE STATIC const char * -S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale) +S_toggle_locale_i(pTHX_ const unsigned cat_index, + const char * new_locale, + const line_t caller_line) { - /* Changes the locale for LC_'switch_category" to that of - * LC_'template_category', if they aren't already the same. If not NULL, - * 'template_locale' is the locale that 'template_category' is in. + /* Changes the locale for the category specified by 'index' to 'new_locale, + * if they aren't already the same. * - * Returns a copy of the name of the original locale for 'switch_category' + * Returns a copy of the name of the original locale for 'cat_index' * so can be switched back to with the companion function - * restore_switched_locale(), (NULL if no restoral is necessary.) */ + * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */ - char * restore_to_locale = NULL; + const char * locale_to_restore_to = NULL; - if (switch_category == template_category) { /* No changes needed */ - return NULL; - } + PERL_ARGS_ASSERT_TOGGLE_LOCALE_I; + assert(cat_index <= NOMINAL_LC_ALL_INDEX); /* Find the original locale of the category we may need to change, so that * it can be restored to later */ - restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category, - NULL))); - if (! restore_to_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(switch_category), errno); - } - - /* If the locale of the template category wasn't passed in, find it now */ - if (template_locale == NULL) { - template_locale = do_setlocale_r(template_category, NULL); - if (! template_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(template_category), errno); - } - } - - /* It the locales are the same, there's nothing to do */ - if (strEQ(restore_to_locale, template_locale)) { - Safefree(restore_to_locale); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", - category_name(switch_category), template_locale)); + locale_to_restore_to = querylocale_i(cat_index); - return NULL; - } - - /* Finally, change the locale to the template one */ - if (! do_setlocale_r(switch_category, template_locale)) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not change %s locale to %s, errno=%d\n", - __FILE__, __LINE__, category_name(switch_category), - template_locale, errno); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s," + " actual=%s\n", + caller_line, cat_index, category_names[cat_index], + new_locale, locale_to_restore_to)); + + if (! locale_to_restore_to) { + locale_panic_(Perl_form(aTHX_ + "Could not find current %s locale, errno=%d", + category_names[cat_index], errno)); } - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n", - category_name(switch_category), template_locale)); - - return restore_to_locale; -} - -STATIC void -S_restore_switched_locale(pTHX_ const int category, const char * const original_locale) -{ - /* Restores the locale for LC_'category' to 'original_locale' (which is a - * copy that will be freed by this function), or do nothing if the latter - * parameter is NULL */ - - if (original_locale == NULL) { - return; - } + /* If the locales are the same, there's nothing to do */ + if (strEQ(locale_to_restore_to, new_locale)) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): %s locale unchanged as %s\n", + caller_line, category_names[cat_index], + new_locale)); - if (! do_setlocale_r(category, original_locale)) { - Perl_croak(aTHX_ - "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n", - __FILE__, __LINE__, - category_name(category), original_locale, errno); + return NULL; } - Safefree(original_locale); -} - -/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */ -#define CUR_LC_BUFFER_SIZE 64 - -bool -Perl__is_cur_LC_category_utf8(pTHX_ int category) -{ - /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE - * otherwise. 'category' may not be LC_ALL. If the platform doesn't have - * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence - * could give the wrong result. The result will very likely be correct for - * languages that have commonly used non-ASCII characters, but for notably - * English, it comes down to if the locale's name ends in something like - * "UTF-8". It errs on the side of not being a UTF-8 locale. - * - * If the platform is early C89, not containing mbtowc(), or we are - * compiled to not pay attention to LC_CTYPE, this employs heuristics. - * These work very well for non-Latin locales or those whose currency - * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at - * least MB_CUR_MAX, English locales with an ASCII currency symbol depend - * on the name containing UTF-8 or not. */ - - /* Name of current locale corresponding to the input category */ - const char *save_input_locale = NULL; - - bool is_utf8 = FALSE; /* The return value */ - - /* The variables below are for the cache of previous lookups using this - * function. The cache is a C string, described at the definition for - * 'C_and_POSIX_utf8ness'. - * - * The first part of the cache is fixed, for the C and POSIX locales. The - * varying part starts just after them. */ - char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); - - Size_t utf8ness_cache_size; /* Size of the varying portion */ - Size_t input_name_len; /* Length in bytes of save_input_locale */ - Size_t input_name_len_with_overhead; /* plus extra chars used to store - the name in the cache */ - char * delimited; /* The name plus the delimiters used to store - it in the cache */ - char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */ - char * name_pos; /* position of 'delimited' in the cache, or 0 - if not there */ + /* Finally, change the locale to the new one */ + void_setlocale_i(cat_index, new_locale); + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): %s locale switched to %s\n", + caller_line, category_names[cat_index], new_locale)); -# ifdef LC_ALL - - assert(category != LC_ALL); + return locale_to_restore_to; +# ifndef DEBUGGING + PERL_UNUSED_ARG(caller_line); # endif - /* Get the desired category's locale */ - save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL))); - if (! save_input_locale) { - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current %s locale, errno=%d\n", - __FILE__, __LINE__, category_name(category), errno); - } - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Current locale for %s is %s\n", - category_name(category), save_input_locale)); +} - input_name_len = strlen(save_input_locale); +STATIC void +S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index, + const char * restore_locale, + const line_t caller_line) +{ + /* Restores the locale for LC_category corresponding to cat_indes to + * 'restore_locale' (which is a copy that will be freed by this function), + * or do nothing if the latter parameter is NULL */ - /* In our cache, each name is accompanied by two delimiters and a single - * utf8ness digit */ - input_name_len_with_overhead = input_name_len + 3; + PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I; + assert(cat_index <= NOMINAL_LC_ALL_INDEX); - if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) { - /* we can use the buffer, avoid a malloc */ - delimited = buffer; - } else { /* need a malloc */ - /* Allocate and populate space for a copy of the name surrounded by the - * delimiters */ - Newx(delimited, input_name_len_with_overhead, char); + if (restore_locale == NULL) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): No need to restore %s\n", + caller_line, category_names[cat_index])); + return; } - delimited[0] = UTF8NESS_SEP[0]; - Copy(save_input_locale, delimited + 1, input_name_len, char); - delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; - delimited[input_name_len+2] = '\0'; - - /* And see if that is in the cache */ - name_pos = instr(PL_locale_utf8ness, delimited); - if (name_pos) { - is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; - -# ifdef DEBUGGING + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "(%" LINE_Tf "): %s restoring locale to %s\n", + caller_line, category_names[cat_index], + restore_locale)); - if (DEBUG_Lv_TEST || debug_initialization) { - PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", - save_input_locale, is_utf8); - } + void_setlocale_i(cat_index, restore_locale); +# ifndef DEBUGGING + PERL_UNUSED_ARG(caller_line); # endif - /* And, if not already in that position, move it to the beginning of - * the non-constant portion of the list, since it is the most recently - * used. (We don't have to worry about overflow, since just moving - * existing names around) */ - if (name_pos > utf8ness_cache) { - Move(utf8ness_cache, - utf8ness_cache + input_name_len_with_overhead, - name_pos - utf8ness_cache, char); - Copy(delimited, - utf8ness_cache, - input_name_len_with_overhead - 1, char); - utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; - } - - /* free only when not using the buffer */ - if ( delimited != buffer ) Safefree(delimited); - Safefree(save_input_locale); - return is_utf8; - } - - /* Here we don't have stored the utf8ness for the input locale. We have to - * calculate it */ - -# if defined(USE_LOCALE_CTYPE) \ - && ( defined(HAS_NL_LANGINFO) \ - || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) - - { - const char *original_ctype_locale - = switch_category_locale_to_template(LC_CTYPE, - category, - save_input_locale); - - /* Here the current LC_CTYPE is set to the locale of the category whose - * information is desired. This means that nl_langinfo() and mbtowc() - * should give the correct results */ - -# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding - calling the functions if we have this */ - - /* Standard UTF-8 needs at least 4 bytes to represent the maximum - * Unicode code point. */ - - DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n", - __FILE__, __LINE__, (int) MB_CUR_MAX)); - if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { - is_utf8 = FALSE; - restore_switched_locale(LC_CTYPE, original_ctype_locale); - goto finish_and_return; - } - -# endif -# if defined(HAS_NL_LANGINFO) - - { /* The task is easiest if the platform has this POSIX 2001 function. - Except on some platforms it can wrongly return "", so have to have - a fallback. And it can return that it's UTF-8, even if there are - variances from that. For example, Turkish locales may use the - alternate dotted I rules, and sometimes it appears to be a - defective locale definition. XXX We should probably check for - these in the Latin1 range and warn (but on glibc, requires - iswalnum() etc. due to their not handling 80-FF correctly */ - const char *codeset = my_nl_langinfo(CODESET, FALSE); - /* FALSE => already in dest locale */ - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\tnllanginfo returned CODESET '%s'\n", codeset)); - - if (codeset && strNE(codeset, "")) { - - /* If the implementation of foldEQ() somehow were - * to change to not go byte-by-byte, this could - * read past end of string, as only one length is - * checked. But currently, a premature NUL will - * compare false, and it will stop there */ - is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) - || foldEQ(codeset, STR_WITH_LEN("UTF8"))); - - DEBUG_L(PerlIO_printf(Perl_debug_log, - "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", - codeset, is_utf8)); - restore_switched_locale(LC_CTYPE, original_ctype_locale); - goto finish_and_return; - } - } - -# endif -# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) - /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a - * late adder to C89, so very likely to have it. However, testing has - * shown that, like nl_langinfo() above, there are locales that are not - * strictly UTF-8 that this will return that they are */ - - { - wchar_t wc; - int len; - dSAVEDERRNO; - -# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) - - mbstate_t ps; - -# endif - - /* mbrtowc() and mbtowc() convert a byte string to a wide - * character. Feed a byte string to one of them and check that the - * result is the expected Unicode code point */ - -# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) - /* Prefer this function if available, as it's reentrant */ - - memzero(&ps, sizeof(ps));; - PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift - state */ - SETERRNO(0, 0); - len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps); - SAVE_ERRNO; - -# else - - MBTOWC_LOCK; - PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ - SETERRNO(0, 0); - len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); - SAVE_ERRNO; - MBTOWC_UNLOCK; +} -# endif +# ifdef USE_LOCALE_CTYPE - RESTORE_ERRNO; - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n", - len, (unsigned int) wc, GET_ERRNO)); +STATIC bool +S_is_codeset_name_UTF8(const char * name) +{ + /* Return a boolean as to if the passed-in name indicates it is a UTF-8 + * code set. Several variants are possible */ + const Size_t len = strlen(name); - is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) - && wc == (wchar_t) UNICODE_REPLACEMENT); - } + PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8; -# endif +# ifdef WIN32 - restore_switched_locale(LC_CTYPE, original_ctype_locale); - goto finish_and_return; + /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ + if (memENDs(name, len, "65001")) { + return TRUE; } -# else - - /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next - * try looking at the currency symbol to see if it disambiguates - * things. Often that will be in the native script, and if the symbol - * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII - * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 - * string being valid UTF-8 are quite small */ - -# ifdef USE_LOCALE_MONETARY - - /* If have LC_MONETARY, we can look at the currency symbol. Often that - * will be in the native script. We do this one first because there is - * just one string to examine, so potentially avoids work */ - - { - const char *original_monetary_locale - = switch_category_locale_to_template(LC_MONETARY, - category, - save_input_locale); - bool only_ascii = FALSE; - const U8 * currency_string - = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE); - /* 2nd param not relevant for this item */ - const U8 * first_variant; - - assert( *currency_string == '-' - || *currency_string == '+' - || *currency_string == '.'); - - currency_string++; - - if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant)) - { - DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - only_ascii = TRUE; - } - else { - is_utf8 = is_strict_utf8_string(first_variant, 0); - } - - restore_switched_locale(LC_MONETARY, original_monetary_locale); - - if (! only_ascii) { - - /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; - * otherwise assume the locale is UTF-8 if and only if the symbol - * is non-ascii UTF-8. */ - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", - save_input_locale, is_utf8)); - goto finish_and_return; - } - } - -# endif /* USE_LOCALE_MONETARY */ -# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) - - /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try - * the names of the months and weekdays, timezone, and am/pm indicator */ - { - const char *original_time_locale - = switch_category_locale_to_template(LC_TIME, - category, - save_input_locale); - int hour = 10; - bool is_dst = FALSE; - int dom = 1; - int month = 0; - int i; - char * formatted_time; - - /* Here the current LC_TIME is set to the locale of the category - * whose information is desired. Look at all the days of the week and - * month names, and the timezone and am/pm indicator for UTF-8 variant - * characters. The first such a one found will tell us if the locale - * is UTF-8 or not */ - - for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ - formatted_time = my_strftime("%A %B %Z %p", - 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); - if ( ! formatted_time - || is_utf8_invariant_string((U8 *) formatted_time, 0)) - { - - /* Here, we didn't find a non-ASCII. Try the next time through - * with the complemented dst and am/pm, and try with the next - * weekday. After we have gotten all weekdays, try the next - * month */ - is_dst = ! is_dst; - hour = (hour + 12) % 24; - dom++; - if (i > 6) { - month++; - } - continue; - } - - /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; - * false otherwise. But first, restore LC_TIME to its original - * locale if we changed it */ - restore_switched_locale(LC_TIME, original_time_locale); - - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", - save_input_locale, - is_utf8_string((U8 *) formatted_time, 0))); - is_utf8 = is_utf8_string((U8 *) formatted_time, 0); - goto finish_and_return; - } - - /* Falling off the end of the loop indicates all the names were just - * ASCII. Go on to the next test. If we changed it, restore LC_TIME - * to its original locale */ - restore_switched_locale(LC_TIME, original_time_locale); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } - # endif + /* 'UTF8' or 'UTF-8' */ + return ( inRANGE(len, 4, 5) + && name[len-1] == '8' + && ( memBEGINs(name, len, "UTF") + || memBEGINs(name, len, "utf")) + && (len == 4 || name[3] == '-')); +} -# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) - - /* This code is ifdefd out because it was found to not be necessary in testing - * on our dromedary test machine, which has over 700 locales. There, this - * added no value to looking at the currency symbol and the time strings. I - * left it in so as to avoid rewriting it if real-world experience indicates - * that dromedary is an outlier. Essentially, instead of returning abpve if we - * haven't found illegal utf8, we continue on and examine all the strerror() - * messages on the platform for utf8ness. If all are ASCII, we still don't - * know the answer; but otherwise we have a pretty good indication of the - * utf8ness. The reason this doesn't help much is that the messages may not - * have been translated into the locale. The currency symbol and time strings - * are much more likely to have been translated. */ - { - int e; - bool non_ascii = FALSE; - const char *original_messages_locale - = switch_category_locale_to_template(LC_MESSAGES, - category, - save_input_locale); - const char * errmsg = NULL; - - /* Here the current LC_MESSAGES is set to the locale of the category - * whose information is desired. Look through all the messages. We - * can't use Strerror() here because it may expand to code that - * segfaults in miniperl */ - - for (e = 0; e <= sys_nerr; e++) { - errno = 0; - errmsg = sys_errlist[e]; - if (errno || !errmsg) { - break; - } - errmsg = savepv(errmsg); - if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { - non_ascii = TRUE; - is_utf8 = is_utf8_string((U8 *) errmsg, 0); - break; - } - } - Safefree(errmsg); +# endif +#endif /* USE_LOCALE */ - restore_switched_locale(LC_MESSAGES, original_messages_locale); +bool +Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) +{ + /* Internal function which returns if we are in the scope of a pragma that + * enables the locale category 'category'. 'compiling' should indicate if + * this is during the compilation phase (TRUE) or not (FALSE). */ - if (non_ascii) { + const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; - /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, - * any non-ascii means it is one; otherwise we assume it isn't */ - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", - save_input_locale, - is_utf8)); - goto finish_and_return; - } + SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); + if (! these_categories || these_categories == &PL_sv_placeholder) { + return FALSE; + } - DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); - } + /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get + * a valid unsigned */ + assert(category >= -1); + return cBOOL(SvUV(these_categories) & (1U << (category + 1))); +} -# endif -# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a - UTF-8 locale */ +/* my_strerror() returns a mortalized copy of the text of the error message + * associated with 'errnum'. + * + * If not called from within the scope of 'use locale', it uses the text from + * the C locale. If Perl is compiled to not pay attention to LC_CTYPE nor + * LC_MESSAGES, it uses whatever strerror() returns. Otherwise the text is + * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not. + * + * It returns in *utf8ness the result's UTF-8ness + * + * The function just calls strerror(), but temporarily switches locales, if + * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same + * CODESET in order for the return from strerror() to not contain '?' symbols, + * or worse, mojibaked. It's cheaper to just use the stricter criteria of + * being in the same locale. So the code below uses a common locale for both + * categories. Again, that is C if not within 'use locale' scope; or the + * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we + * don't have LC_MESSAGES; and whatever strerror returns if we don't have + * either category. + * + * There are two sets of implementations. The first below is if we have + * strerror_l(). This is the simpler. We just use the already-built C locale + * object if not in locale scope, or build up a custom one otherwise. + * + * When strerror_l() is not available, we may have to swap locales temporarily + * to bring the two categories into sync with each other, and possibly to the C + * locale. + * + * Because the prepropessing directives to conditionally compile this function + * would greatly obscure the logic of the various implementations, the whole + * function is repeated for each configuration, with some common macros. */ + +/* Used to shorten the definitions of the following implementations of + * my_strerror() */ +#define DEBUG_STRERROR_ENTER(errnum, in_locale) \ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ + "my_strerror called with errnum %d;" \ + " Within locale scope=%d\n", \ + errnum, in_locale)) +#define DEBUG_STRERROR_RETURN(errstr, utf8ness) \ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, \ + "Strerror returned; saving a copy: '%s';" \ + " utf8ness=%d\n", \ + get_displayable_string(errstr, \ + errstr + strlen(errstr), \ + *utf8ness), \ + (int) *utf8ness)) + +/* On platforms that have precisely one of these categories (Windows + * qualifies), these yield the correct one */ +#if defined(USE_LOCALE_CTYPE) +# define WHICH_LC_INDEX LC_CTYPE_INDEX_ +#elif defined(USE_LOCALE_MESSAGES) +# define WHICH_LC_INDEX LC_MESSAGES_INDEX_ +#endif - /* As a last resort, look at the locale name to see if it matches - * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the - * return of setlocale(), is actually defined to be opaque, so we can't - * really rely on the absence of various substrings in the name to indicate - * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to - * be a UTF-8 locale. Similarly for the other common names */ +/*==========================================================================*/ +/* First set of implementations, when have strerror_l() */ - { - const Size_t final_pos = strlen(save_input_locale) - 1; +#if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L) - if (final_pos >= 3) { - const char *name = save_input_locale; +# if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES) - /* Find next 'U' or 'u' and look from there */ - while ((name += strcspn(name, "Uu") + 1) - <= save_input_locale + final_pos - 2) - { - if ( isALPHA_FOLD_NE(*name, 't') - || isALPHA_FOLD_NE(*(name + 1), 'f')) - { - continue; - } - name += 2; - if (*(name) == '-') { - if ((name > save_input_locale + final_pos - 1)) { - break; - } - name++; - } - if (*(name) == '8') { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with UTF-8 in name\n", - save_input_locale)); - is_utf8 = TRUE; - goto finish_and_return; - } - } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s doesn't end with UTF-8 in name\n", - save_input_locale)); - } +/* Here, neither category is defined: use the C locale */ +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) +{ + PERL_ARGS_ASSERT_MY_STRERROR; -# ifdef WIN32 + DEBUG_STRERROR_ENTER(errnum, 0); - /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ - if (memENDs(save_input_locale, final_pos, "65001")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s ends with 65001 in name, is UTF-8 locale\n", - save_input_locale)); - is_utf8 = TRUE; - goto finish_and_return; - } + const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + *utf8ness = UTF8NESS_IMMATERIAL; -# endif - } -# endif + DEBUG_STRERROR_RETURN(errstr, utf8ness); - /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But - * since we are about to return FALSE anyway, there is no point in doing - * this extra work */ + SAVEFREEPV(errstr); + return errstr; +} -# if 0 - if (instr(save_input_locale, "8859")) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Locale %s has 8859 in name, not UTF-8 locale\n", - save_input_locale)); - is_utf8 = FALSE; - goto finish_and_return; - } -# endif +# elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES) - DEBUG_L(PerlIO_printf(Perl_debug_log, - "Assuming locale %s is not a UTF-8 locale\n", - save_input_locale)); - is_utf8 = FALSE; +/*--------------------------------------------------------------------------*/ -# endif /* the code that is compiled when no modern LC_CTYPE */ +/* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we + * are not within 'use locale' scope of the only one defined, we use the C + * locale; otherwise use the current locale object */ - finish_and_return: +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) +{ + PERL_ARGS_ASSERT_MY_STRERROR; - /* Cache this result so we don't have to go through all this next time. */ - utf8ness_cache_size = sizeof(PL_locale_utf8ness) - - (utf8ness_cache - PL_locale_utf8ness); + DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); - /* But we can't save it if it is too large for the total space available */ - if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { - Size_t utf8ness_cache_len = strlen(utf8ness_cache); + /* Use C if not within locale scope; Otherwise, use current locale */ + const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX])) + ? PL_C_locale_obj + : use_curlocale_scratch(); - /* Here it can fit, but we may need to clear out the oldest cached - * result(s) to do so. Check */ - if (utf8ness_cache_len + input_name_len_with_overhead - >= utf8ness_cache_size) - { - /* Here we have to clear something out to make room for this. - * Start looking at the rightmost place where it could fit and find - * the beginning of the entry that extends past that. */ - char * cutoff = (char *) my_memrchr(utf8ness_cache, - UTF8NESS_SEP[0], - utf8ness_cache_size - - input_name_len_with_overhead); - - assert(cutoff); - assert(cutoff >= utf8ness_cache); - - /* This and all subsequent entries must be removed */ - *cutoff = '\0'; - utf8ness_cache_len = strlen(utf8ness_cache); - } + const char *errstr = savepv(strerror_l(errnum, which_obj)); + *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN, + NULL, WHICH_LC_INDEX); + DEBUG_STRERROR_RETURN(errstr, utf8ness); - /* Make space for the new entry */ - Move(utf8ness_cache, - utf8ness_cache + input_name_len_with_overhead, - utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); - - /* And insert it */ - Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); - utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; - - if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu," - " inserted_name=%s, its_len=%zu\n", - __FILE__, __LINE__, - PL_locale_utf8ness, strlen(PL_locale_utf8ness), - delimited, input_name_len_with_overhead); - } - } + SAVEFREEPV(errstr); + return errstr; +} -# ifdef DEBUGGING +/*--------------------------------------------------------------------------*/ +# else /* Are using both categories. Place them in the same CODESET, + * either C or the LC_MESSAGES locale */ - if (DEBUG_Lv_TEST) { - const char * s = PL_locale_utf8ness; +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) +{ + PERL_ARGS_ASSERT_MY_STRERROR; - /* Audit the structure */ - while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) { - const char *e; + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); - if (*s != UTF8NESS_SEP[0]) { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: missing" - " separator %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (s - PL_locale_utf8ness), PL_locale_utf8ness, - s); - } - s++; - e = strchr(s, UTF8NESS_PREFIX[0]); - if (! e) { - e = PL_locale_utf8ness + strlen(PL_locale_utf8ness); - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: missing" - " separator %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, - e); - } - e++; - if (*e != '0' && *e != '1') { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: utf8ness" - " must be [01] %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e + 1 - PL_locale_utf8ness), - PL_locale_utf8ness, e + 1); - } - if (ninstr(PL_locale_utf8ness, s, s-1, e)) { - Perl_croak(aTHX_ - "panic: %s: %d: Corrupt utf8ness_cache: entry" - " has duplicate %.*s<-- HERE %s\n", - __FILE__, __LINE__, - (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, - e); - } - s = e + 1; - } + const char *errstr; + if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */ + errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); + *utf8ness = UTF8NESS_IMMATERIAL; } - - if (DEBUG_Lv_TEST || debug_initialization) { - - PerlIO_printf(Perl_debug_log, - "PL_locale_utf8ness is now %s; returning %d\n", - PL_locale_utf8ness, is_utf8); + else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE + matches */ + locale_t cur = duplocale(use_curlocale_scratch()); + + cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur); + errstr = savepv(strerror_l(errnum, cur)); + *utf8ness = get_locale_string_utf8ness_i(errstr, + LOCALE_UTF8NESS_UNKNOWN, + NULL, LC_MESSAGES_INDEX_); + freelocale(cur); } -# endif + DEBUG_STRERROR_RETURN(errstr, utf8ness); - /* free only when not using the buffer */ - if ( delimited != buffer ) Safefree(delimited); - Safefree(save_input_locale); - return is_utf8; + SAVEFREEPV(errstr); + return errstr; } +# endif /* Above is using strerror_l */ +/*==========================================================================*/ +#else /* Below is not using strerror_l */ +# if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES) -#endif +/* If not using using either of the categories, return plain, unadorned + * strerror */ -bool -Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) { - /* Internal function which returns if we are in the scope of a pragma that - * enables the locale category 'category'. 'compiling' should indicate if - * this is during the compilation phase (TRUE) or not (FALSE). */ + PERL_ARGS_ASSERT_MY_STRERROR; - const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; + DEBUG_STRERROR_ENTER(errnum, 0); - SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); - if (! these_categories || these_categories == &PL_sv_placeholder) { - return FALSE; - } + const char *errstr = savepv(Strerror(errnum)); + *utf8ness = UTF8NESS_IMMATERIAL; - /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get - * a valid unsigned */ - assert(category >= -1); - return cBOOL(SvUV(these_categories) & (1U << (category + 1))); -} - -char * -Perl_my_strerror(pTHX_ const int errnum) -{ - /* Returns a mortalized copy of the text of the error message associated - * with 'errnum'. It uses the current locale's text unless the platform - * doesn't have the LC_MESSAGES category or we are not being called from - * within the scope of 'use locale'. In the former case, it uses whatever - * strerror returns; in the latter case it uses the text from the C locale. - * - * The function just calls strerror(), but temporarily switches, if needed, - * to the C locale */ + DEBUG_STRERROR_RETURN(errstr, utf8ness); - char *errstr; - -#ifndef USE_LOCALE_MESSAGES - - /* If platform doesn't have messages category, we don't do any switching to - * the C locale; we just use whatever strerror() returns */ + SAVEFREEPV(errstr); + return errstr; +} - errstr = savepv(Strerror(errnum)); +/*--------------------------------------------------------------------------*/ +# elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES) -#else /* Has locale messages */ +/* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we + * are not within 'use locale' scope of the only one defined, we use the C + * locale; otherwise use the current locale */ - const bool within_locale_scope = IN_LC(LC_MESSAGES); +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) +{ + PERL_ARGS_ASSERT_MY_STRERROR; -# ifndef USE_ITHREADS + DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX])); - /* This function is trivial without threads. */ - if (within_locale_scope) { - errstr = savepv(strerror(errnum)); + const char *errstr; + if (IN_LC(categories[WHICH_LC_INDEX])) { + errstr = savepv(Strerror(errnum)); + *utf8ness = get_locale_string_utf8ness_i(errstr, + LOCALE_UTF8NESS_UNKNOWN, + NULL, WHICH_LC_INDEX); } else { - const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL)); - do_setlocale_c(LC_MESSAGES, "C"); - errstr = savepv(strerror(errnum)); - do_setlocale_c(LC_MESSAGES, save_locale); - Safefree(save_locale); - } + SETLOCALE_LOCK; -# elif defined(USE_POSIX_2008_LOCALE) \ - && defined(HAS_STRERROR_L) + const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C"); - /* This function is also trivial if we don't have to worry about thread - * safety and have strerror_l(), as it handles the switch of locales so we - * don't have to deal with that. We don't have to worry about thread - * safety if strerror_r() is also available. Both it and strerror_l() are - * thread-safe. Plain strerror() isn't thread safe. But on threaded - * builds when strerror_r() is available, the apparent call to strerror() - * below is actually a macro that behind-the-scenes calls strerror_r(). */ + errstr = savepv(Strerror(errnum)); -# ifdef HAS_STRERROR_R + restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale); - if (within_locale_scope) { - errstr = savepv(strerror(errnum)); - } - else { - errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); - } + SETLOCALE_UNLOCK; -# else + *utf8ness = UTF8NESS_IMMATERIAL; - /* Here we have strerror_l(), but not strerror_r() and we are on a - * threaded-build. We use strerror_l() for everything, constructing a - * locale to pass to it if necessary */ + } - bool do_free = FALSE; - locale_t locale_to_use; + DEBUG_STRERROR_RETURN(errstr, utf8ness); - if (within_locale_scope) { - locale_to_use = uselocale((locale_t) 0); - if (locale_to_use == LC_GLOBAL_LOCALE) { - locale_to_use = duplocale(LC_GLOBAL_LOCALE); - do_free = TRUE; - } - } - else { /* Use C locale if not within 'use locale' scope */ - locale_to_use = PL_C_locale_obj; - } + SAVEFREEPV(errstr); + return errstr; +} - errstr = savepv(strerror_l(errnum, locale_to_use)); +/*--------------------------------------------------------------------------*/ +# else - if (do_free) { - freelocale(locale_to_use); - } +/* Below, have both LC_CTYPE and LC_MESSAGES. Place them in the same CODESET, + * either C or the LC_MESSAGES locale */ -# endif -# else /* Doesn't have strerror_l() */ +const char * +Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness) +{ + PERL_ARGS_ASSERT_MY_STRERROR; + + DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES)); - const char * save_locale = NULL; - bool locale_is_C = FALSE; + const char * desired_locale = (IN_LC(LC_MESSAGES)) + ? querylocale_c(LC_MESSAGES) + : "C"; + /* XXX Can fail on z/OS */ - /* We have a critical section to prevent another thread from executing this - * same code at the same time. (On thread-safe perls, the LOCK is a - * no-op.) Since this is the only place in core that changes LC_MESSAGES - * (unless the user has called setlocale(), this works to prevent races. */ SETLOCALE_LOCK; - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "my_strerror called with errnum %d\n", errnum)); - if (! within_locale_scope) { - save_locale = do_setlocale_c(LC_MESSAGES, NULL); - if (! save_locale) { - SETLOCALE_UNLOCK; - Perl_croak(aTHX_ - "panic: %s: %d: Could not find current LC_MESSAGES locale," - " errno=%d\n", __FILE__, __LINE__, errno); - } - else { - locale_is_C = isNAME_C_OR_POSIX(save_locale); - - /* Switch to the C locale if not already in it */ - if (! locale_is_C) { - - /* The setlocale() just below likely will zap 'save_locale', so - * create a copy. */ - save_locale = savepv(save_locale); - if (! do_setlocale_c(LC_MESSAGES, "C")) { - - /* If, for some reason, the locale change failed, we - * soldier on as best as possible under the circumstances, - * using the current locale, and clear save_locale, so we - * don't try to change back. On z/0S, all setlocale() - * calls fail after you've created a thread. This is their - * way of making sure the entire process is always a single - * locale. This means that 'use locale' is always in place - * for messages under these circumstances. */ - Safefree(save_locale); - save_locale = NULL; - } - } - } - } /* end of ! within_locale_scope */ - else { - DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n", - __FILE__, __LINE__)); - } + const char* orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, desired_locale); + const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES, + desired_locale); + const char *errstr = savepv(Strerror(errnum)); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "Any locale change has been done; about to call Strerror\n")); - errstr = savepv(Strerror(errnum)); - - if (! within_locale_scope) { - if (save_locale && ! locale_is_C) { - if (! do_setlocale_c(LC_MESSAGES, save_locale)) { - SETLOCALE_UNLOCK; - Perl_croak(aTHX_ - "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n", - __FILE__, __LINE__, save_locale, errno); - } - Safefree(save_locale); - } - } + restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale); + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); SETLOCALE_UNLOCK; -# endif /* End of doesn't have strerror_l */ -# ifdef DEBUGGING - - if (DEBUG_Lv_TEST) { - PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '"); - print_bytes_for_locale(errstr, errstr + strlen(errstr), 0); - PerlIO_printf(Perl_debug_log, "'\n"); - } - -# endif -#endif /* End of does have locale messages */ + *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN, + NULL, LC_MESSAGES_INDEX_); + DEBUG_STRERROR_RETURN(errstr, utf8ness); SAVEFREEPV(errstr); return errstr; } +/*--------------------------------------------------------------------------*/ +# endif /* end of not using strerror_l() */ +#endif /* end of all the my_strerror() implementations */ + /* =for apidoc switch_to_global_locale -On systems without locale support, or on typical single-threaded builds, or on -platforms that do not support per-thread locale operations, this function does -nothing. On such systems that do have locale support, only a locale global to -the whole program is available. +This function copies the locale state of the calling thread into the program's +global locale, and converts the thread to use that global locale. + +It is intended so that Perl can safely be used with C libraries that access the +global locale and which can't be converted to not access it. Effectively, this +means libraries that call C<L<setlocale(3)>> on non-Windows systems. (For +portability, it is a good idea to use it on Windows as well.) -On multi-threaded builds on systems that do have per-thread locale operations, -this function converts the thread it is running in to use the global locale. -This is for code that has not yet or cannot be updated to handle multi-threaded -locale operation. As long as only a single thread is so-converted, everything -works fine, as all the other threads continue to ignore the global one, so only -this thread looks at it. +A downside of using it is that it disables the services that Perl provides to +hide locale gotchas from your code. The service you most likely will miss +regards the radix character (decimal point) in floating point numbers. Code +executed after this function is called can no longer just assume that this +character is correct for the current circumstances. -However, on Windows systems this isn't quite true prior to Visual Studio 15, -at which point Microsoft fixed a bug. A race can occur if you use the -following operations on earlier Windows platforms: +To return to Perl control, and restart the gotcha prevention services, call +C<L</sync_locale>>. Behavior is undefined for any pure Perl code that executes +while the switch is in effect. + +The global locale and the per-thread locales are independent. As long as just +one thread converts to the global locale, everything works smoothly. But if +more than one does, they can easily interfere with each other, and races are +likely. On Windows systems prior to Visual Studio 15 (at which point Microsoft +fixed a bug), races can occur (even if only one thread has been converted to +the global locale), but only if you use the following operations: =over @@ -5352,59 +6962,98 @@ following operations on earlier Windows platforms: =back The first item is not fixable (except by upgrading to a later Visual Studio -release), but it would be possible to work around the latter two items by using -the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches +release), but it would be possible to work around the latter two items by +having Perl change its algorithm for calculating these to use Windows API +functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches welcome. -Without this function call, threads that use the L<C<setlocale(3)>> system -function will not work properly, as all the locale-sensitive functions will -look at the per-thread locale, and C<setlocale> will have no effect on this -thread. - -Perl code should convert to either call -L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system -C<setlocale>) or use the methods given in L<perlcall> to call +XS code should never call plain C<setlocale>, but should instead be converted +to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in +for the system C<setlocale>) or use the methods given in L<perlcall> to call L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly handle all cases of single- vs multi-thread, POSIX 2008-supported or not. -Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can -continue to work if this function is called before transferring control to the -library. - -Upon return from the code that needs to use the global locale, -L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe -multi-thread operation. - =cut */ void -Perl_switch_to_global_locale() +Perl_switch_to_global_locale(pTHX) { -#ifdef USE_THREAD_SAFE_LOCALE -# ifdef WIN32 +#ifdef USE_LOCALE - _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n", + get_LC_ALL_display())); + bool perl_controls = false; -# else -# ifdef HAS_QUERYLOCALE +# ifdef USE_THREAD_SAFE_LOCALE + + /* In these cases, we use the system state to determine if we are in the + * global locale or not. */ + +# ifdef USE_POSIX_2008_LOCALE + + perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0)); - setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0))); +# elif defined(WIN32) + + perl_controls = (_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE); # else +# error Unexpected Configuration +# endif +# endif - { - unsigned int i; + /* No-op if already in global */ + if (! perl_controls) { + return; + } - for (i = 0; i < LC_ALL_INDEX; i++) { - setlocale(categories[i], do_setlocale_r(categories[i], NULL)); - } +# ifdef USE_THREAD_SAFE_LOCALE +# if defined(WIN32) + + const char * thread_locale = posix_setlocale(LC_ALL, NULL); + _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); + posix_setlocale(LC_ALL, thread_locale); + +# else /* Must be USE_POSIX_2008_LOCALE) */ + + const char * cur_thread_locales[NOMINAL_LC_ALL_INDEX + 1]; + + /* Save each category's current per-thread state */ + for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + cur_thread_locales[i] = querylocale_i(i); + } + + /* Now switch to global */ + DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Switching to global locale\n")); + + locale_t old_locale = uselocale(LC_GLOBAL_LOCALE); + if (! old_locale) { + locale_panic_("Could not change to global locale"); + } + + /* Free the per-thread memory */ + if (old_locale != LC_GLOBAL_LOCALE && old_locale != PL_C_locale_obj) { + freelocale(old_locale); + } + + /* Set the global to what was our per-thread state */ + POSIX_SETLOCALE_LOCK; + for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + posix_setlocale(categories[i], cur_thread_locales[i]); } + POSIX_SETLOCALE_UNLOCK; # endif +# endif +# ifdef USE_LOCALE_NUMERIC - uselocale(LC_GLOBAL_LOCALE); + /* Switch to the underlying C numeric locale; the application is on its + * own. */ + POSIX_SETLOCALE_LOCK; + posix_setlocale(LC_NUMERIC, PL_numeric_name); + POSIX_SETLOCALE_UNLOCK; # endif #endif @@ -5415,27 +7064,45 @@ Perl_switch_to_global_locale() =for apidoc sync_locale +This function copies the state of the program global locale into the calling +thread, and converts that thread to using per-thread locales, if it wasn't +already, and the platform supports them. The LC_NUMERIC locale is toggled into +the standard state (using the C locale's conventions), if not within the +lexical scope of S<C<use locale>>. + +Perl will now consider itself to have control of the locale. + +Since unthreaded perls have only a global locale, this function is a no-op +without threads. + +This function is intended for use with C libraries that do locale manipulation. +It allows Perl to accommodate the use of them. Call this function before +transferring back to Perl space so that it knows what state the C code has left +things in. + +XS code should not manipulate the locale on its own. Instead, L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or change the locale (though changing the locale is antisocial and dangerous on multi-threaded systems that don't have multi-thread safe locale operations. -(See L<perllocale/Multi-threaded operation>). Using the system -L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries -called from XS, such as C<Gtk> do so, and this can't be changed. When the -locale is changed by XS code that didn't use -L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the -locale has changed. Use this function to do so, before returning to Perl. +(See L<perllocale/Multi-threaded operation>). + +Using the libc L<C<setlocale(3)>> function should be avoided. Nevertheless, +certain non-Perl libraries called from XS, do call it, and their behavior may +not be able to be changed. This function, along with +C<L</switch_to_global_locale>>, can be used to get seamless behavior in these +circumstances, as long as only one thread is involved. + +If the library has an option to turn off its locale manipulation, doing that is +preferable to using this mechanism. C<Gtk> is such a library. The return value is a boolean: TRUE if the global locale at the time of call -was in effect; and FALSE if a per-thread locale was in effect. This can be -used by the caller that needs to restore things as-they-were to decide whether -or not to call -L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>. +was in effect for the caller; and FALSE if a per-thread locale was in effect. =cut */ bool -Perl_sync_locale() +Perl_sync_locale(pTHX) { #ifndef USE_LOCALE @@ -5444,77 +7111,52 @@ Perl_sync_locale() #else - const char * newlocale; - dTHX; - -# ifdef USE_POSIX_2008_LOCALE + bool was_in_global = TRUE; - bool was_in_global_locale = FALSE; - locale_t cur_obj = uselocale((locale_t) 0); +# ifdef USE_THREAD_SAFE_LOCALE +# if defined(WIN32) - /* On Windows, unless the foreign code has turned off the thread-safe - * locale setting, any plain setlocale() will have affected what we see, so - * no need to worry. Otherwise, If the foreign code has done a plain - * setlocale(), it will only affect the global locale on POSIX systems, but - * will affect the */ - if (cur_obj == LC_GLOBAL_LOCALE) { + was_in_global = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE) + == _DISABLE_PER_THREAD_LOCALE; -# ifdef HAS_QUERY_LOCALE +# elif defined(USE_POSIX_2008_LOCALE) - do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL)); + was_in_global = (LC_GLOBAL_LOCALE == uselocale((locale_t) 0)); # else - - unsigned int i; - - /* We can't trust that we can read the LC_ALL format on the - * platform, so do them individually */ - for (i = 0; i < LC_ALL_INDEX; i++) { - do_setlocale_r(categories[i], setlocale(categories[i], NULL)); - } - +# error Unexpected Configuration # endif - - was_in_global_locale = TRUE; +# endif /* USE_THREAD_SAFE_LOCALE */ + + /* Here, we are in the global locale. Get and save the values for each + * category. */ + const char * current_globals[NOMINAL_LC_ALL_INDEX]; + for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + POSIX_SETLOCALE_LOCK; + current_globals[i] = savepv(stdized_setlocale(categories[i], NULL)); + POSIX_SETLOCALE_UNLOCK; } -# else - - bool was_in_global_locale = TRUE; + /* Now we have to convert the current thread to use them */ -# endif -# ifdef USE_LOCALE_CTYPE - - newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(LC_CTYPE, NULL, newlocale))); - new_ctype(newlocale); - Safefree(newlocale); +# if defined(WIN32) -# endif /* USE_LOCALE_CTYPE */ -# ifdef USE_LOCALE_COLLATE - - newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(LC_COLLATE, NULL, newlocale))); - new_collate(newlocale); - Safefree(newlocale); + /* On Windows, convert to per-thread behavior. This isn't necessary in + * POSIX 2008, as the conversion gets done automatically in the loop below. + * */ + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); # endif -# ifdef USE_LOCALE_NUMERIC - newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); - DEBUG_Lv(PerlIO_printf(Perl_debug_log, - "%s:%d: %s\n", __FILE__, __LINE__, - setlocale_debug_string(LC_NUMERIC, NULL, newlocale))); - new_numeric(newlocale); - Safefree(newlocale); + for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { + setlocale_i(i, current_globals[i]); + Safefree(current_globals[i]); + } -# endif /* USE_LOCALE_NUMERIC */ + /* And update our remaining records. 'true' => force recalculation */ + new_LC_ALL(NULL, true); - return was_in_global_locale; + return was_in_global; #endif @@ -5523,75 +7165,127 @@ Perl_sync_locale() #if defined(DEBUGGING) && defined(USE_LOCALE) STATIC char * -S_setlocale_debug_string(const int category, /* category number, - like LC_ALL */ - const char* const locale, /* locale name */ +S_my_setlocale_debug_string_i(pTHX_ + const unsigned cat_index, + const char* locale, /* Optional locale name */ + + /* return value from setlocale() when attempting + * to set 'category' to 'locale' */ + const char* retval, - /* return value from setlocale() when attempting to - * set 'category' to 'locale' */ - const char* const retval) + const line_t line) { /* Returns a pointer to a NUL-terminated string in static storage with * added text about the info passed in. This is not thread safe and will * be overwritten by the next call, so this should be used just to * formulate a string to immediately print or savepv() on. */ - static char ret[256]; + const char * locale_quote; + const char * retval_quote; - my_strlcpy(ret, "setlocale(", sizeof(ret)); - my_strlcat(ret, category_name(category), sizeof(ret)); - my_strlcat(ret, ", ", sizeof(ret)); + assert(cat_index <= NOMINAL_LC_ALL_INDEX); - if (locale) { - my_strlcat(ret, "\"", sizeof(ret)); - my_strlcat(ret, locale, sizeof(ret)); - my_strlcat(ret, "\"", sizeof(ret)); + if (locale == NULL) { + locale_quote = ""; + locale = "NULL"; } else { - my_strlcat(ret, "NULL", sizeof(ret)); + locale_quote = "\""; } - my_strlcat(ret, ") returned ", sizeof(ret)); - - if (retval) { - my_strlcat(ret, "\"", sizeof(ret)); - my_strlcat(ret, retval, sizeof(ret)); - my_strlcat(ret, "\"", sizeof(ret)); + if (retval == NULL) { + retval_quote = ""; + retval = "NULL"; } else { - my_strlcat(ret, "NULL", sizeof(ret)); + retval_quote = "\""; } - assert(strlen(ret) < sizeof(ret)); +# ifdef USE_LOCALE_THREADS +# define THREAD_FORMAT "%p:" +# define THREAD_ARGUMENT aTHX_ +# else +# define THREAD_FORMAT +# define THREAD_ARGUMENT +# endif - return ret; + return Perl_form(aTHX_ + "%s:%" LINE_Tf ": " THREAD_FORMAT + " setlocale(%s[%d], %s%s%s) returned %s%s%s\n", + + __FILE__, line, THREAD_ARGUMENT + category_names[cat_index], categories[cat_index], + locale_quote, locale, locale_quote, + retval_quote, retval, retval_quote); } #endif +#ifdef USE_PERL_SWITCH_LOCALE_CONTEXT void -Perl_thread_locale_init() +Perl_switch_locale_context() { - /* Called from a thread on startup*/ + /* libc keeps per-thread locale status information in some configurations. + * So, we can't just switch out aTHX to switch to a new thread. libc has + * to follow along. This routine does that based on per-interpreter + * variables we keep just for this purpose */ -#ifdef USE_THREAD_SAFE_LOCALE + /* Can't use pTHX, because we may be called from a place where that + * isn't available */ + dTHX; - dTHX_DEBUGGING; + if (UNLIKELY( aTHX == NULL + || PL_veto_switch_non_tTHX_context + || PL_phase == PERL_PHASE_CONSTRUCT)) + { + return; + } - /* C starts the new thread in the global C locale. If we are thread-safe, - * we want to not be in the global locale */ +# ifdef USE_POSIX_2008_LOCALE - DEBUG_L(PerlIO_printf(Perl_debug_log, - "%s:%d: new thread, initial locale is %s; calling setlocale\n", - __FILE__, __LINE__, setlocale(LC_ALL, NULL))); + if (! uselocale(PL_cur_locale_obj)) { + locale_panic_(Perl_form(aTHX_ + "Can't uselocale(%p), LC_ALL supposed to be '%s", + PL_cur_locale_obj, get_LC_ALL_display())); + } -# ifdef WIN32 +# elif defined(WIN32) - _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) { + locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL)); + } -# else +# endif - Perl_setlocale(LC_ALL, "C"); +} + +#endif + +void +Perl_thread_locale_init(pTHX) +{ + +#ifdef USE_THREAD_SAFE_LOCALE +# ifdef USE_POSIX_2008_LOCALE + + /* Called from a thread on startup. + * + * The operations here have to be done from within the calling thread, as + * they affect libc's knowledge of the thread; libc has no knowledge of + * aTHX */ + + DEBUG_L(PerlIO_printf(Perl_debug_log, + "new thread, initial locale is %s;" + " calling setlocale(LC_ALL, \"C\")\n", + get_LC_ALL_display())); + + uselocale(PL_C_locale_obj); + +# elif defined(WIN32) + + /* On Windows, make sure new thread has per-thread locales enabled */ + _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); + void_setlocale_c(LC_ALL, "C"); # endif #endif @@ -5599,25 +7293,36 @@ Perl_thread_locale_init() } void -Perl_thread_locale_term() +Perl_thread_locale_term(pTHX) { - /* Called from a thread as it gets ready to terminate */ + /* Called from a thread as it gets ready to terminate. + * + * The operations here have to be done from within the calling thread, as + * they affect libc's knowledge of the thread; libc has no knowledge of + * aTHX */ -#ifdef USE_THREAD_SAFE_LOCALE +#ifdef USE_POSIX_2008_LOCALE /* C starts the new thread in the global C locale. If we are thread-safe, * we want to not be in the global locale */ -# ifndef WIN32 + /* Free up */ + locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE); + if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) { + freelocale(actual_obj); + } - { /* Free up */ - locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE); - if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) { - freelocale(cur_obj); - } + /* Prevent leaks even if something has gone wrong */ + locale_t expected_obj = PL_cur_locale_obj; + if (UNLIKELY( expected_obj != actual_obj + && expected_obj != LC_GLOBAL_LOCALE + && expected_obj != PL_C_locale_obj)) + { + freelocale(expected_obj); } -# endif + PL_cur_locale_obj = LC_GLOBAL_LOCALE; + #endif } |