summaryrefslogtreecommitdiff
path: root/gnu/usr.bin/perl/locale.c
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/usr.bin/perl/locale.c')
-rw-r--r--gnu/usr.bin/perl/locale.c8357
1 files changed, 5031 insertions, 3326 deletions
diff --git a/gnu/usr.bin/perl/locale.c b/gnu/usr.bin/perl/locale.c
index c83076da3f0..92b6c1a6ec1 100644
--- a/gnu/usr.bin/perl/locale.c
+++ b/gnu/usr.bin/perl/locale.c
@@ -38,14 +38,113 @@
* it would be possible to emulate thread-safe locales, but this likely would
* involve a lot of locale switching, and would require XS code changes.
* Macros could be written so that the code wouldn't have to know which type of
- * system is being used. It's unlikely that we would ever do that, since most
- * modern systems support thread-safe locales, but there was code written to
- * this end, and is retained, #ifdef'd out.
+ * system is being used.
+ *
+ * Table-driven code is used for simplicity and clarity, as many operations
+ * differ only in which category is being worked on. However the system
+ * categories need not be small contiguous integers, so do not lend themselves
+ * to table lookup. Instead we have created our own equivalent values which
+ * are all small contiguous non-negative integers, and translation functions
+ * between the two sets. For category 'LC_foo', the name of our index is
+ * LC_foo_INDEX_. Various parallel tables, indexed by these, are used.
+ *
+ * Many of the macros and functions in this file have one of the suffixes '_c',
+ * '_r', or '_i'. khw found these useful in remembering what type of locale
+ * category to use as their parameter. '_r' takes an int category number as
+ * passed to setlocale(), like LC_ALL, LC_CTYPE, etc. The 'r' indicates that
+ * the value isn't known until runtime. '_c' also indicates such a category
+ * number, but its value is known at compile time. These are both converted
+ * into unsigned indexes into various tables of category information, where the
+ * real work is generally done. The tables are generated at compile-time based
+ * on platform characteristics and Configure options. They hide from the code
+ * many of the vagaries of the different locale implementations out there. You
+ * may have already guessed that '_i' indicates the parameter is such an
+ * unsigned index. Converting from '_r' to '_i' requires run-time lookup.
+ * '_c' is used to get cpp to do this at compile time. To avoid the runtime
+ * expense, the code is structured to use '_r' at the API level, and once
+ * converted, everything possible is done using the table indexes.
+ *
+ * On unthreaded perls, most operations expand out to just the basic
+ * setlocale() calls. The same is true on threaded perls on modern Windows
+ * systems where the same API, after set up, is used for thread-safe locale
+ * handling. On other systems, there is a completely different API, specified
+ * in POSIX 2008, to do thread-safe locales. On these systems, our
+ * emulate_setlocale_i() function is used to hide the different API from the
+ * outside. This makes it completely transparent to most XS code.
+ *
+ * A huge complicating factor is that the LC_NUMERIC category is normally held
+ * in the C locale, except during those relatively rare times when it needs to
+ * be in the underlying locale. There is a bunch of code to accomplish this,
+ * and to allow easy switches from one state to the other.
+ *
+ * In addition, the setlocale equivalents have versions for the return context,
+ * 'void' and 'bool', besides the full return value. This can present
+ * opportunities for avoiding work. We don't have to necessarily create a safe
+ * copy to return if no return is desired.
+ *
+ * There are 3.5 major implementations here; which one chosen depends on what
+ * the platform has available, and Configuration options.
+ *
+ * 1) Raw my_setlocale(). Here the layer adds nothing. This is used for
+ * unthreaded perls, and when the API for safe locale threading is identical
+ * to the unsafe API (Windows, currently).
+ *
+ * 2) A minimal layer that makes my_setlocale() uninterruptible and returns a
+ * per-thread/per-category value.
+ *
+ * 3a and 3b) A layer that implements POSIX 2008 thread-safe locale handling,
+ * mapping the setlocale() API to them. This automatically makes almost all
+ * code thread-safe without need for changes. This layer is chosen on
+ * threaded perls when the platform supports the POSIX 2008 functions, and
+ * when there is no manual override in Configure.
+ *
+ * 3a) is when the platform has a reliable querylocale() function or
+ * equivalent that is selected to be used.
+ * 3b) is when we have to emulate that functionality.
+ *
+ * z/OS (os390) is an outlier. Locales really don't work under threads when
+ * either the radix character isn't a dot, or attempts are made to change
+ * locales after the first thread is created. The reason is that IBM has made
+ * it thread-safe by refusing to change locales (returning failure if
+ * attempted) any time after an application has called pthread_create() to
+ * create another thread. The expectation is that an application will set up
+ * its locale information before the first fork, and be stable thereafter. But
+ * perl toggles LC_NUMERIC if the locale's radix character isn't a dot, as do
+ * the other toggles, which are less common.
*/
+/* If the environment says to, we can output debugging information during
+ * initialization. This is done before option parsing, and before any thread
+ * creation, so can be a file-level static. (Must come before #including
+ * perl.h) */
+#ifdef DEBUGGING
+static int debug_initialization = 0;
+# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
+# define DEBUG_LOCALE_INITIALIZATION_ debug_initialization
+/* C standards seem to say that __LINE__ is merely "an integer constant",
+ * which means it might be either int, long (with L suffix), or long long
+ * (or their corresponding unsigned type). So, we have to explicitly cast
+ * __LINE__ to a particular integer type to pass it reliably to variadic
+ * functions like (PerlIO_)printf, as below: */
+# ifdef USE_LOCALE_THREADS
+# define DEBUG_PRE_STMTS \
+ dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log,"\n%s: %" LINE_Tf ": %p: ",\
+ __FILE__, (line_t)__LINE__, aTHX);
+# else
+# define DEBUG_PRE_STMTS \
+ dSAVE_ERRNO; dTHX; PerlIO_printf(Perl_debug_log, "\n%s: %" LINE_Tf ": ", \
+ __FILE__, (line_t)__LINE__);
+# endif
+# define DEBUG_POST_STMTS RESTORE_ERRNO;
+#else
+# define debug_initialization 0
+# define DEBUG_INITIALIZATION_set(v)
+# define DEBUG_PRE_STMTS
+# define DEBUG_POST_STMTS
+#endif
+
#include "EXTERN.h"
#define PERL_IN_LOCALE_C
-#include "perl_langinfo.h"
#include "perl.h"
#include "reentr.h"
@@ -57,27 +156,53 @@
# include <wctype.h>
#endif
-/* If the environment says to, we can output debugging information during
- * initialization. This is done before option parsing, and before any thread
- * creation, so can be a file-level static */
-#if ! defined(DEBUGGING)
-# define debug_initialization 0
-# define DEBUG_INITIALIZATION_set(v)
+ /* The main errno that gets used is this one, on platforms that support it */
+#ifdef EINVAL
+# define SET_EINVAL SETERRNO(EINVAL, LIB_INVARG)
#else
-static bool debug_initialization = FALSE;
-# define DEBUG_INITIALIZATION_set(v) (debug_initialization = v)
+# define SET_EINVAL
+#endif
+
+/* If we have any of these library functions, we can reliably determine is a
+ * locale is a UTF-8 one or not. And if we aren't using locales at all, we act
+ * as if everything is the C locale, so the answer there is always "No, it
+ * isn't UTF-8"; this too is reliably accurate */
+#if defined(HAS_SOME_LANGINFO) || defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) \
+ || ! defined(USE_LOCALE)
+# define HAS_RELIABLE_UTF8NESS_DETERMINATION
#endif
+#ifdef USE_LOCALE
+
+PERL_STATIC_INLINE const char *
+S_mortalized_pv_copy(pTHX_ const char * const pv)
+{
+ PERL_ARGS_ASSERT_MORTALIZED_PV_COPY;
+
+ /* Copies the input pv, and arranges for it to be freed at an unspecified
+ * later time. */
+
+ if (pv == NULL) {
+ return NULL;
+ }
+
+ const char * copy = savepv(pv);
+ SAVEFREEPV(copy);
+ return copy;
+}
+
+#endif
/* Returns the Unix errno portion; ignoring any others. This is a macro here
* instead of putting it into perl.h, because unclear to khw what should be
* done generally. */
#define GET_ERRNO saved_errno
-/* strlen() of a literal string constant. We might want this more general,
- * but using it in just this file for now. A problem with more generality is
- * the compiler warnings about comparing unlike signs */
-#define STRLENs(s) (sizeof("" s "") - 1)
+/* Default values come from the C locale */
+#define C_codeset "ANSI_X3.4-1968" /* Only in some Configurations, and usually
+ a single instance, so is a #define */
+static const char C_decimal_point[] = ".";
+static const char C_thousands_sep[] = "";
/* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the
* return of setlocale(), then this is extremely likely to be the C or POSIX
@@ -94,85 +219,47 @@ static bool debug_initialization = FALSE;
&& (( *(name) == 'C' && (*(name + 1)) == '\0') \
|| strEQ((name), "POSIX")))
-#ifdef USE_LOCALE
-
-/* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far
- * looked up. This is in the form of a C string: */
-
-#define UTF8NESS_SEP "\v"
-#define UTF8NESS_PREFIX "\f"
-
-/* So, the string looks like:
- *
- * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0
- *
- * where the digit 0 after the \a indicates that the locale starting just
- * after the preceding \v is not UTF-8, and the digit 1 mean it is. */
-
-STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1);
-STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1);
-
-#define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \
- UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0"
-
-/* The cache is initialized to C_and_POSIX_utf8ness at start up. These are
- * kept there always. The remining portion of the cache is LRU, with the
- * oldest looked-up locale at the tail end */
-
-STATIC char *
-S_stdize_locale(pTHX_ char *locs)
-{
- /* Standardize the locale name from a string returned by 'setlocale',
- * possibly modifying that string.
- *
- * The typical return value of setlocale() is either
- * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL
- * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL
- * (the space-separated values represent the various sublocales,
- * in some unspecified order). This is not handled by this function.
- *
- * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n",
- * which is harmful for further use of the string in setlocale(). This
- * function removes the trailing new line and everything up through the '='
- * */
-
- const char * const s = strchr(locs, '=');
- bool okay = TRUE;
+#if defined(HAS_NL_LANGINFO_L) || defined(HAS_NL_LANGINFO)
+# define HAS_SOME_LANGINFO
+#endif
- PERL_ARGS_ASSERT_STDIZE_LOCALE;
+#define my_langinfo_c(item, category, locale, retbufp, retbuf_sizep, utf8ness) \
+ my_langinfo_i(item, category##_INDEX_, locale, retbufp, \
+ retbuf_sizep, utf8ness)
- if (s) {
- const char * const t = strchr(s, '.');
- okay = FALSE;
- if (t) {
- const char * const u = strchr(t, '\n');
- if (u && (u[1] == 0)) {
- const STRLEN len = u - s;
- Move(s + 1, locs, len, char);
- locs[len] = 0;
- okay = TRUE;
- }
- }
- }
+#ifdef USE_LOCALE
- if (!okay)
- Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs);
+# ifdef DEBUGGING
+# define setlocale_debug_string_i(index, locale, result) \
+ my_setlocale_debug_string_i(index, locale, result, __LINE__)
+# define setlocale_debug_string_c(category, locale, result) \
+ setlocale_debug_string_i(category##_INDEX_, locale, result)
+# define setlocale_debug_string_r(category, locale, result) \
+ setlocale_debug_string_i(get_category_index(category, locale), \
+ locale, result)
+# endif
- return locs;
-}
+# define toggle_locale_i(index, locale) \
+ S_toggle_locale_i(aTHX_ index, locale, __LINE__)
+# define toggle_locale_c(cat, locale) toggle_locale_i(cat##_INDEX_, locale)
+# define restore_toggled_locale_i(index, locale) \
+ S_restore_toggled_locale_i(aTHX_ index, locale, __LINE__)
+# define restore_toggled_locale_c(cat, locale) \
+ restore_toggled_locale_i(cat##_INDEX_, locale)
-/* Two parallel arrays; first the locale categories Perl uses on this system;
- * the second array is their names. These arrays are in mostly arbitrary
- * order. */
+/* Two parallel arrays indexed by our mapping of category numbers into small
+ * non-negative indexes; first the locale categories Perl uses on this system,
+ * used to do the inverse mapping. The second array is their names. These
+ * arrays are in mostly arbitrary order. */
-const int categories[] = {
+STATIC const int categories[] = {
-# ifdef USE_LOCALE_NUMERIC
- LC_NUMERIC,
-# endif
# ifdef USE_LOCALE_CTYPE
LC_CTYPE,
# endif
+# ifdef USE_LOCALE_NUMERIC
+ LC_NUMERIC,
+# endif
# ifdef USE_LOCALE_COLLATE
LC_COLLATE,
# endif
@@ -200,6 +287,9 @@ const int categories[] = {
# ifdef USE_LOCALE_TELEPHONE
LC_TELEPHONE,
# endif
+# ifdef USE_LOCALE_NAME
+ LC_NAME,
+# endif
# ifdef USE_LOCALE_SYNTAX
LC_SYNTAX,
# endif
@@ -209,21 +299,22 @@ const int categories[] = {
# ifdef LC_ALL
LC_ALL,
# endif
- -1 /* Placeholder because C doesn't allow a
- trailing comma, and it would get complicated
- with all the #ifdef's */
+
+ /* Placeholder as a precaution if code fails to check the return of
+ * get_category_index(), which returns this element to indicate an error */
+ -1
};
/* The top-most real element is LC_ALL */
-const char * const category_names[] = {
+STATIC const char * const category_names[] = {
-# ifdef USE_LOCALE_NUMERIC
- "LC_NUMERIC",
-# endif
# ifdef USE_LOCALE_CTYPE
"LC_CTYPE",
# endif
+# ifdef USE_LOCALE_NUMERIC
+ "LC_NUMERIC",
+# endif
# ifdef USE_LOCALE_COLLATE
"LC_COLLATE",
# endif
@@ -251,6 +342,9 @@ const char * const category_names[] = {
# ifdef USE_LOCALE_TELEPHONE
"LC_TELEPHONE",
# endif
+# ifdef USE_LOCALE_NAME
+ "LC_NAME",
+# endif
# ifdef USE_LOCALE_SYNTAX
"LC_SYNTAX",
# endif
@@ -260,211 +354,445 @@ const char * const category_names[] = {
# ifdef LC_ALL
"LC_ALL",
# endif
- NULL /* Placeholder */
- };
+
+ /* Placeholder as a precaution if code fails to check the return of
+ * get_category_index(), which returns this element to indicate an error */
+ NULL
+};
+
+/* A few categories require additional setup when they are changed. This table
+ * points to the functions that do that setup */
+STATIC void (*update_functions[]) (pTHX_ const char *, bool force) = {
+# ifdef USE_LOCALE_CTYPE
+ S_new_ctype,
+# endif
+# ifdef USE_LOCALE_NUMERIC
+ S_new_numeric,
+# endif
+# ifdef USE_LOCALE_COLLATE
+ S_new_collate,
+# endif
+# ifdef USE_LOCALE_TIME
+ NULL,
+# endif
+# ifdef USE_LOCALE_MESSAGES
+ NULL,
+# endif
+# ifdef USE_LOCALE_MONETARY
+ NULL,
+# endif
+# ifdef USE_LOCALE_ADDRESS
+ NULL,
+# endif
+# ifdef USE_LOCALE_IDENTIFICATION
+ NULL,
+# endif
+# ifdef USE_LOCALE_MEASUREMENT
+ NULL,
+# endif
+# ifdef USE_LOCALE_PAPER
+ NULL,
+# endif
+# ifdef USE_LOCALE_TELEPHONE
+ NULL,
+# endif
+# ifdef USE_LOCALE_NAME
+ NULL,
+# endif
+# ifdef USE_LOCALE_SYNTAX
+ NULL,
+# endif
+# ifdef USE_LOCALE_TOD
+ NULL,
+# endif
+ /* No harm done to have this even without an LC_ALL */
+ S_new_LC_ALL,
+
+ /* Placeholder as a precaution if code fails to check the return of
+ * get_category_index(), which returns this element to indicate an error */
+ NULL
+};
# ifdef LC_ALL
/* On systems with LC_ALL, it is kept in the highest index position. (-2
* to account for the final unused placeholder element.) */
# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2)
-
# else
/* On systems without LC_ALL, we pretend it is there, one beyond the real
* top element, hence in the unused placeholder element. */
# define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1)
-
# endif
/* Pretending there is an LC_ALL element just above allows us to avoid most
* special cases. Most loops through these arrays in the code below are
* written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work
* on either type of system. But the code must be written to not access the
- * element at 'LC_ALL_INDEX' except on platforms that have it. This can be
- * checked for at compile time by using the #define LC_ALL_INDEX which is only
+ * element at 'LC_ALL_INDEX_' except on platforms that have it. This can be
+ * checked for at compile time by using the #define LC_ALL_INDEX_ which is only
* defined if we do have LC_ALL. */
-STATIC const char *
-S_category_name(const int category)
+STATIC int
+S_get_category_index_nowarn(const int category)
{
- unsigned int i;
-
-#ifdef LC_ALL
+ /* Given a category, return the equivalent internal index we generally use
+ * instead, or negative if not found.
+ *
+ * Some sort of hash could be used instead of this loop, but the number of
+ * elements is so far at most 12 */
- if (category == LC_ALL) {
- return "LC_ALL";
- }
+ unsigned int i;
-#endif
+ PERL_ARGS_ASSERT_GET_CATEGORY_INDEX;
- for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+# ifdef LC_ALL
+ for (i = 0; i <= LC_ALL_INDEX_; i++)
+# else
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)
+# endif
+ {
if (category == categories[i]) {
- return category_names[i];
+ dTHX_DEBUGGING;
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "index of category %d (%s) is %d\n",
+ category, category_names[i], i));
+ return i;
}
}
- {
- const char suffix[] = " (unknown)";
- int temp = category;
- Size_t length = sizeof(suffix) + 1;
- char * unknown;
- dTHX;
-
- if (temp < 0) {
- length++;
- temp = - temp;
- }
+ return -1;
+}
- /* Calculate the number of digits */
- while (temp >= 10) {
- temp /= 10;
- length++;
- }
+STATIC unsigned int
+S_get_category_index(const int category, const char * locale)
+{
+ /* Given a category, return the equivalent internal index we generally use
+ * instead.
+ *
+ * 'locale' is for use in any generated diagnostics, and may be NULL
+ */
+
+ const char * conditional_warn_text = "; can't set it to ";
+ const int index = get_category_index_nowarn(category);
+
+ if (index >= 0) {
+ return index;
+ }
+
+ /* Here, we don't know about this category, so can't handle it. */
- Newx(unknown, length, char);
- my_snprintf(unknown, length, "%d%s", category, suffix);
- SAVEFREEPV(unknown);
- return unknown;
+ if (! locale) {
+ locale = "";
+ conditional_warn_text = "";
}
+
+ /* diag_listed_as: Unknown locale category %d; can't set it to %s */
+ Perl_warner_nocontext(packWARN(WARN_LOCALE),
+ "Unknown locale category %d%s%s",
+ category, conditional_warn_text, locale);
+
+ SET_EINVAL;
+
+ /* Return an out-of-bounds value */
+ return NOMINAL_LC_ALL_INDEX + 1;
}
-/* Now create LC_foo_INDEX #defines for just those categories on this system */
-# ifdef USE_LOCALE_NUMERIC
-# define LC_NUMERIC_INDEX 0
-# define _DUMMY_NUMERIC LC_NUMERIC_INDEX
-# else
-# define _DUMMY_NUMERIC -1
-# endif
-# ifdef USE_LOCALE_CTYPE
-# define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1
-# define _DUMMY_CTYPE LC_CTYPE_INDEX
-# else
-# define _DUMMY_CTYPE _DUMMY_NUMERIC
-# endif
-# ifdef USE_LOCALE_COLLATE
-# define LC_COLLATE_INDEX _DUMMY_CTYPE + 1
-# define _DUMMY_COLLATE LC_COLLATE_INDEX
-# else
-# define _DUMMY_COLLATE _DUMMY_CTYPE
-# endif
-# ifdef USE_LOCALE_TIME
-# define LC_TIME_INDEX _DUMMY_COLLATE + 1
-# define _DUMMY_TIME LC_TIME_INDEX
-# else
-# define _DUMMY_TIME _DUMMY_COLLATE
-# endif
-# ifdef USE_LOCALE_MESSAGES
-# define LC_MESSAGES_INDEX _DUMMY_TIME + 1
-# define _DUMMY_MESSAGES LC_MESSAGES_INDEX
-# else
-# define _DUMMY_MESSAGES _DUMMY_TIME
-# endif
-# ifdef USE_LOCALE_MONETARY
-# define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1
-# define _DUMMY_MONETARY LC_MONETARY_INDEX
-# else
-# define _DUMMY_MONETARY _DUMMY_MESSAGES
-# endif
-# ifdef USE_LOCALE_ADDRESS
-# define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1
-# define _DUMMY_ADDRESS LC_ADDRESS_INDEX
-# else
-# define _DUMMY_ADDRESS _DUMMY_MONETARY
-# endif
-# ifdef USE_LOCALE_IDENTIFICATION
-# define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1
-# define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX
-# else
-# define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS
-# endif
-# ifdef USE_LOCALE_MEASUREMENT
-# define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1
-# define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX
-# else
-# define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION
-# endif
-# ifdef USE_LOCALE_PAPER
-# define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1
-# define _DUMMY_PAPER LC_PAPER_INDEX
-# else
-# define _DUMMY_PAPER _DUMMY_MEASUREMENT
-# endif
-# ifdef USE_LOCALE_TELEPHONE
-# define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1
-# define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX
-# else
-# define _DUMMY_TELEPHONE _DUMMY_PAPER
-# endif
-# ifdef USE_LOCALE_SYNTAX
-# define LC_SYNTAX_INDEX _DUMMY_TELEPHONE + 1
-# define _DUMMY_SYNTAX LC_SYNTAX_INDEX
-# else
-# define _DUMMY_SYNTAX _DUMMY_TELEPHONE
-# endif
-# ifdef USE_LOCALE_TOD
-# define LC_TOD_INDEX _DUMMY_SYNTAX + 1
-# define _DUMMY_TOD LC_TOD_INDEX
-# else
-# define _DUMMY_TOD _DUMMY_SYNTAX
-# endif
-# ifdef LC_ALL
-# define LC_ALL_INDEX _DUMMY_TOD + 1
-# endif
#endif /* ifdef USE_LOCALE */
-/* Windows requres a customized base-level setlocale() */
+void
+Perl_force_locale_unlock()
+{
+
+#if defined(USE_LOCALE_THREADS)
+
+ dTHX;
+
+ /* If recursively locked, clear all at once */
+ if (PL_locale_mutex_depth > 1) {
+ PL_locale_mutex_depth = 1;
+ }
+
+ if (PL_locale_mutex_depth > 0) {
+ LOCALE_UNLOCK_;
+ }
+
+#endif
+
+}
+
+#ifdef USE_POSIX_2008_LOCALE
+
+STATIC locale_t
+S_use_curlocale_scratch(pTHX)
+{
+ /* This function is used to hide from the caller the case where the current
+ * locale_t object in POSIX 2008 is the global one, which is illegal in
+ * many of the P2008 API calls. This checks for that and, if necessary
+ * creates a proper P2008 object. Any prior object is deleted, as is any
+ * remaining object during global destruction. */
+
+ locale_t cur = uselocale((locale_t) 0);
+
+ if (cur != LC_GLOBAL_LOCALE) {
+ return cur;
+ }
+
+ if (PL_scratch_locale_obj) {
+ freelocale(PL_scratch_locale_obj);
+ }
+
+ PL_scratch_locale_obj = duplocale(LC_GLOBAL_LOCALE);
+ return PL_scratch_locale_obj;
+}
+
+#endif
+
+void
+Perl_locale_panic(const char * msg,
+ const char * file_name,
+ const line_t line,
+ const int errnum)
+{
+ dTHX;
+
+ PERL_ARGS_ASSERT_LOCALE_PANIC;
+
+ force_locale_unlock();
+
+#ifdef USE_C_BACKTRACE
+ dump_c_backtrace(Perl_debug_log, 20, 1);
+#endif
+
+ /* diag_listed_as: panic: %s */
+ Perl_croak(aTHX_ "%s: %" LINE_Tf ": panic: %s; errno=%d\n",
+ file_name, line, msg, errnum);
+}
+
+#define setlocale_failure_panic_c( \
+ cat, current, failed, caller_0_line, caller_1_line) \
+ setlocale_failure_panic_i(cat##_INDEX_, current, failed, \
+ caller_0_line, caller_1_line)
+
+/* posix_setlocale() presents a consistent POSIX-compliant interface to
+ * setlocale(). Windows requres a customized base-level setlocale(). Any
+ * necessary mutex locking needs to be done at a higher level */
#ifdef WIN32
-# define my_setlocale(cat, locale) win32_setlocale(cat, locale)
+# define posix_setlocale(cat, locale) win32_setlocale(cat, locale)
#else
-# define my_setlocale(cat, locale) setlocale(cat, locale)
+# define posix_setlocale(cat, locale) ((const char *) setlocale(cat, locale))
#endif
-#ifndef USE_POSIX_2008_LOCALE
+/* The next layer up is to catch vagaries and bugs in the libc setlocale return
+ * value. Again, any necessary mutex locking needs to be done at a higher
+ * level */
+#ifdef stdize_locale
+# define stdized_setlocale(cat, locale) \
+ stdize_locale(cat, posix_setlocale(cat, locale), \
+ &PL_stdize_locale_buf, &PL_stdize_locale_bufsize, __LINE__)
+#else
+# define stdized_setlocale(cat, locale) posix_setlocale(cat, locale)
+#endif
-/* "do_setlocale_c" is intended to be called when the category is a constant
- * known at compile time; "do_setlocale_r", not known until run time */
-# define do_setlocale_c(cat, locale) my_setlocale(cat, locale)
-# define do_setlocale_r(cat, locale) my_setlocale(cat, locale)
-# define FIX_GLIBC_LC_MESSAGES_BUG(i)
+/* The next many lines form a layer above the close-to-the-metal 'posix'
+ * and 'stdized' macros. They are used to present a uniform API to the rest of
+ * the code in this file in spite of the disparate underlying implementations.
+ * */
-#else /* Below uses POSIX 2008 */
+#if (! defined(USE_LOCALE_THREADS) && ! defined(USE_POSIX_2008_LOCALE)) \
+ || ( defined(WIN32) && defined(USE_THREAD_SAFE_LOCALE))
-/* We emulate setlocale with our own function. LC_foo is not valid for the
- * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array
- * lookup to convert to. At compile time we have defined LC_foo_INDEX as the
- * proper offset into the array 'category_masks[]'. At runtime, we have to
- * search through the array (as the actual numbers may not be small contiguous
- * positive integers which would lend themselves to array lookup). */
-# define do_setlocale_c(cat, locale) \
- emulate_setlocale(cat, locale, cat ## _INDEX, TRUE)
-# define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE)
+/* For non-threaded perls, the added layer just expands to the base-level
+ * functions, except if we are supposed to use the POSIX 2008 interface anyway.
+ * On perls where threading is invisible to us, the base-level functions are
+ * used regardless of threading. Currently this is only on later Windows
+ * versions.
+ *
+ * See the introductory comments in this file for the meaning of the suffixes
+ * '_c', '_r', '_i'. */
+
+# define setlocale_r(cat, locale) stdized_setlocale(cat, locale)
+# define setlocale_i(i, locale) setlocale_r(categories[i], locale)
+# define setlocale_c(cat, locale) setlocale_r(cat, locale)
+
+# define void_setlocale_i(i, locale) \
+ STMT_START { \
+ if (! posix_setlocale(categories[i], locale)) { \
+ setlocale_failure_panic_i(i, NULL, locale, __LINE__, 0); \
+ NOT_REACHED; /* NOTREACHED */ \
+ } \
+ } STMT_END
+# define void_setlocale_c(cat, locale) \
+ void_setlocale_i(cat##_INDEX_, locale)
+# define void_setlocale_r(cat, locale) \
+ void_setlocale_i(get_category_index(cat, locale), locale)
+
+# define bool_setlocale_r(cat, locale) cBOOL(posix_setlocale(cat, locale))
+# define bool_setlocale_i(i, locale) \
+ bool_setlocale_c(categories[i], locale)
+# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
+
+/* All the querylocale...() forms return a mortalized copy. If you need
+ * something stable across calls, you need to savepv() the result yourself */
+
+# define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
+# define querylocale_c(cat) querylocale_r(cat)
+# define querylocale_i(i) querylocale_c(categories[i])
+
+#elif defined(USE_LOCALE_THREADS) \
+ && ! defined(USE_THREAD_SAFE_LOCALE)
+
+ /* Here, there are threads, and there is no support for thread-safe
+ * operation. This is a dangerous situation, which perl is documented as
+ * not supporting, but it arises in practice. We can do a modicum of
+ * automatic mitigation by making sure there is a per-thread return from
+ * setlocale(), and that a mutex protects it from races */
+STATIC const char *
+S_less_dicey_setlocale_r(pTHX_ const int category, const char * locale)
+{
+ const char * retval;
-# if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES)
+ PERL_ARGS_ASSERT_LESS_DICEY_SETLOCALE_R;
-# define FIX_GLIBC_LC_MESSAGES_BUG(i)
+ POSIX_SETLOCALE_LOCK;
-# else /* Invalidate glibc cache of loaded translations, see [perl #134264] */
+ retval = stdized_setlocale(category, locale);
-# include <libintl.h>
-# define FIX_GLIBC_LC_MESSAGES_BUG(i) \
- STMT_START { \
- if ((i) == LC_MESSAGES_INDEX) { \
- textdomain(textdomain(NULL)); \
- } \
- } STMT_END
+ /* We reuse PL_stdize_locale_buf as it doesn't conflict, but the call may
+ * already have used it, in which case we don't have to do anything further
+ * */
+ retval = save_to_buffer(retval,
+ &PL_stdize_locale_buf, &PL_stdize_locale_bufsize);
+
+ POSIX_SETLOCALE_UNLOCK;
+
+ return retval;
+}
+
+# define setlocale_r(cat, locale) less_dicey_setlocale_r(cat, locale)
+# define setlocale_c(cat, locale) setlocale_r(cat, locale)
+# define setlocale_i(i, locale) setlocale_r(categories[i], locale)
+
+# define querylocale_r(cat) mortalized_pv_copy(setlocale_r(cat, NULL))
+# define querylocale_c(cat) querylocale_r(cat)
+# define querylocale_i(i) querylocale_r(categories[i])
+
+STATIC void
+S_less_dicey_void_setlocale_i(pTHX_ const unsigned cat_index,
+ const char * locale,
+ const line_t line)
+{
+ PERL_ARGS_ASSERT_LESS_DICEY_VOID_SETLOCALE_I;
+
+ POSIX_SETLOCALE_LOCK;
+ if (! posix_setlocale(categories[cat_index], locale)) {
+ POSIX_SETLOCALE_UNLOCK;
+ setlocale_failure_panic_i(cat_index, NULL, locale, __LINE__, line);
+ }
+ POSIX_SETLOCALE_UNLOCK;
+}
+
+# define void_setlocale_i(i, locale) \
+ less_dicey_void_setlocale_i(i, locale, __LINE__)
+# define void_setlocale_c(cat, locale) \
+ void_setlocale_i(cat##_INDEX_, locale)
+# define void_setlocale_r(cat, locale) \
+ void_setlocale_i(get_category_index(cat, locale), locale)
+
+# if 0 /* Not currently used */
+
+STATIC bool
+S_less_dicey_bool_setlocale_r(pTHX_ const int cat, const char * locale)
+{
+ bool retval;
+
+ PERL_ARGS_ASSERT_LESS_DICEY_BOOL_SETLOCALE_R;
+
+ POSIX_SETLOCALE_LOCK;
+ retval = cBOOL(posix_setlocale(cat, locale));
+ POSIX_SETLOCALE_UNLOCK;
+
+ return retval;
+}
# endif
+# define bool_setlocale_r(cat, locale) \
+ less_dicey_bool_setlocale_r(cat, locale)
+# define bool_setlocale_i(i, locale) \
+ bool_setlocale_r(categories[i], locale)
+# define bool_setlocale_c(cat, locale) bool_setlocale_r(cat, locale)
+#else
-/* A third array, parallel to the ones above to map from category to its
- * equivalent mask */
-const int category_masks[] = {
-# ifdef USE_LOCALE_NUMERIC
- LC_NUMERIC_MASK,
+/* Here, there is a completely different API to get thread-safe locales. We
+ * emulate the setlocale() API with our own function(s). setlocale categories,
+ * like LC_NUMERIC, are not valid here for the POSIX 2008 API. Instead, there
+ * are equivalents, like LC_NUMERIC_MASK, which we use instead, converting to
+ * by using get_category_index() followed by table lookup. */
+
+# define emulate_setlocale_c(cat, locale, recalc_LC_ALL, line) \
+ emulate_setlocale_i(cat##_INDEX_, locale, recalc_LC_ALL, line)
+
+ /* A wrapper for the macros below. */
+# define common_emulate_setlocale(i, locale) \
+ emulate_setlocale_i(i, locale, YES_RECALC_LC_ALL, __LINE__)
+
+# define setlocale_i(i, locale) \
+ save_to_buffer(common_emulate_setlocale(i, locale), \
+ &PL_stdize_locale_buf, \
+ &PL_stdize_locale_bufsize)
+# define setlocale_c(cat, locale) setlocale_i(cat##_INDEX_, locale)
+# define setlocale_r(cat, locale) \
+ setlocale_i(get_category_index(cat, locale), locale)
+
+# define void_setlocale_i(i, locale) \
+ ((void) common_emulate_setlocale(i, locale))
+# define void_setlocale_c(cat, locale) \
+ void_setlocale_i(cat##_INDEX_, locale)
+# define void_setlocale_r(cat, locale) ((void) setlocale_r(cat, locale))
+
+# define bool_setlocale_i(i, locale) \
+ cBOOL(common_emulate_setlocale(i, locale))
+# define bool_setlocale_c(cat, locale) \
+ bool_setlocale_i(cat##_INDEX_, locale)
+# define bool_setlocale_r(cat, locale) cBOOL(setlocale_r(cat, locale))
+
+# define querylocale_i(i) mortalized_pv_copy(my_querylocale_i(i))
+# define querylocale_c(cat) querylocale_i(cat##_INDEX_)
+# define querylocale_r(cat) querylocale_i(get_category_index(cat,NULL))
+
+# ifdef USE_QUERYLOCALE
+# define isSINGLE_BIT_SET(mask) isPOWER_OF_2(mask)
+
+ /* This code used to think querylocale() was valid on LC_ALL. Make sure
+ * all instances of that have been removed */
+# define QUERYLOCALE_ASSERT(index) \
+ __ASSERT_(isSINGLE_BIT_SET(category_masks[index]))
+# if ! defined(HAS_QUERYLOCALE) && ( defined(_NL_LOCALE_NAME) \
+ && defined(HAS_NL_LANGINFO_L))
+# define querylocale_l(index, locale_obj) \
+ (QUERYLOCALE_ASSERT(index) \
+ mortalized_pv_copy(nl_langinfo_l( \
+ _NL_LOCALE_NAME(categories[index]), locale_obj)))
+# else
+# define querylocale_l(index, locale_obj) \
+ (QUERYLOCALE_ASSERT(index) \
+ mortalized_pv_copy(querylocale(category_masks[index], locale_obj)))
+# endif
# endif
+# if defined(__GLIBC__) && defined(USE_LOCALE_MESSAGES)
+# define HAS_GLIBC_LC_MESSAGES_BUG
+# include <libintl.h>
+# endif
+
+/* A fourth array, parallel to the ones above to map from category to its
+ * equivalent mask */
+STATIC const int category_masks[] = {
# ifdef USE_LOCALE_CTYPE
LC_CTYPE_MASK,
# endif
+# ifdef USE_LOCALE_NUMERIC
+ LC_NUMERIC_MASK,
+# endif
# ifdef USE_LOCALE_COLLATE
LC_COLLATE_MASK,
# endif
@@ -492,6 +820,9 @@ const int category_masks[] = {
# ifdef USE_LOCALE_TELEPHONE
LC_TELEPHONE_MASK,
# endif
+# ifdef USE_LOCALE_NAME
+ LC_NAME_MASK,
+# endif
# ifdef USE_LOCALE_SYNTAX
LC_SYNTAX_MASK,
# endif
@@ -503,781 +834,1031 @@ const int category_masks[] = {
* here, so compile it in unconditionally.
* This could catch some glitches at compile
* time */
- LC_ALL_MASK
- };
+ LC_ALL_MASK,
+
+ /* Placeholder as a precaution if code fails to check the return of
+ * get_category_index(), which returns this element to indicate an error */
+ 0
+};
+
+# define my_querylocale_c(cat) my_querylocale_i(cat##_INDEX_)
STATIC const char *
-S_emulate_setlocale(const int category,
- const char * locale,
- unsigned int index,
- const bool is_index_valid
- )
+S_my_querylocale_i(pTHX_ const unsigned int index)
{
- /* This function effectively performs a setlocale() on just the current
- * thread; thus it is thread-safe. It does this by using the POSIX 2008
- * locale functions to emulate the behavior of setlocale(). Similar to
- * regular setlocale(), the return from this function points to memory that
- * can be overwritten by other system calls, so needs to be copied
- * immediately if you need to retain it. The difference here is that
- * system calls besides another setlocale() can overwrite it.
- *
- * By doing this, most locale-sensitive functions become thread-safe. The
- * exceptions are mostly those that return a pointer to static memory.
- *
- * This function takes the same parameters, 'category' and 'locale', that
- * the regular setlocale() function does, but it also takes two additional
- * ones. This is because the 2008 functions don't use a category; instead
- * they use a corresponding mask. Because this function operates in both
- * worlds, it may need one or the other or both. This function can
- * calculate the mask from the input category, but to avoid this
- * calculation, if the caller knows at compile time what the mask is, it
- * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask
- * parameter is ignored.
+ /* This function returns the name of the locale category given by the input
+ * index into our parallel tables of them.
*
* POSIX 2008, for some sick reason, chose not to provide a method to find
- * the category name of a locale. Some vendors have created a
- * querylocale() function to do just that. This function is a lot simpler
- * to implement on systems that have this. Otherwise, we have to keep
- * track of what the locale has been set to, so that we can return its
- * name to emulate setlocale(). It's also possible for C code in some
- * library to change the locale without us knowing it, though as of
+ * the category name of a locale, discarding a basic linguistic tenet that
+ * for any object, people will create a name for it. Some vendors have
+ * created a querylocale() function to do just that. This function is a
+ * lot simpler to implement on systems that have this. Otherwise, we have
+ * to keep track of what the locale has been set to, so that we can return
+ * its name so as to emulate setlocale(). It's also possible for C code in
+ * some library to change the locale without us knowing it, though as of
* September 2017, there are no occurrences in CPAN of uselocale(). Some
* libraries do use setlocale(), but that changes the global locale, and
- * threads using per-thread locales will just ignore those changes.
- * Another problem is that without querylocale(), we have to guess at what
- * was meant by setting a locale of "". We handle this by not actually
- * ever setting to "" (unless querylocale exists), but to emulate what we
- * think should happen for "".
- */
-
- int mask;
- locale_t old_obj;
- locale_t new_obj;
- const char * safelocale = locale ? locale : "(null)";
- dTHX;
+ * threads using per-thread locales will just ignore those changes. */
-# ifdef DEBUGGING
+ int category;
+ const locale_t cur_obj = uselocale((locale_t) 0);
+ const char * retval;
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), safelocale, index, is_index_valid);
- }
+ PERL_ARGS_ASSERT_MY_QUERYLOCALE_I;
+ assert(index <= NOMINAL_LC_ALL_INDEX);
-# endif
+ category = categories[index];
- /* If the input mask might be incorrect, calculate the correct one */
- if (! is_index_valid) {
- unsigned int i;
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "my_querylocale_i(%s) on %p\n",
+ category_names[index], cur_obj));
+ if (cur_obj == LC_GLOBAL_LOCALE) {
+ POSIX_SETLOCALE_LOCK;
+ retval = posix_setlocale(category, NULL);
+ POSIX_SETLOCALE_UNLOCK;
+ }
+ else {
-# ifdef DEBUGGING
+# ifdef USE_QUERYLOCALE
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category));
- }
+ /* We don't currently keep records when there is querylocale(), so have
+ * to get it anew each time */
+ retval = (index == LC_ALL_INDEX_)
+ ? calculate_LC_ALL(cur_obj)
+ : querylocale_l(index, cur_obj);
-# endif
+# else
- for (i = 0; i <= LC_ALL_INDEX; i++) {
- if (category == categories[i]) {
- index = i;
- goto found_index;
- }
+ /* But we do have up-to-date values when we keep our own records
+ * (except some times in initialization, where we get the value from
+ * the system. */
+ const char ** which = (index == LC_ALL_INDEX_)
+ ? &PL_cur_LC_ALL
+ : &PL_curlocales[index];
+ if (*which == NULL) {
+ retval = stdized_setlocale(category, NULL);
+ *which = savepv(retval);
}
-
- /* Here, we don't know about this category, so can't handle it.
- * Fallback to the early POSIX usages */
- Perl_warner(aTHX_ packWARN(WARN_LOCALE),
- "Unknown locale category %d; can't set it to %s\n",
- category, safelocale);
- return NULL;
-
- found_index: ;
-
-# ifdef DEBUGGING
-
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category));
+ else {
+ retval = *which;
}
# endif
}
- mask = category_masks[index];
-
-# ifdef DEBUGGING
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "my_querylocale_i(%s) returning '%s'\n",
+ category_names[index], retval));
+ assert(strNE(retval, ""));
+ return retval;
+}
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask);
- }
+# ifdef USE_PL_CURLOCALES
-# endif
+STATIC const char *
+S_update_PL_curlocales_i(pTHX_
+ const unsigned int index,
+ const char * new_locale,
+ recalc_lc_all_t recalc_LC_ALL)
+{
+ /* This is a helper function for emulate_setlocale_i(), mostly used to
+ * make that function easier to read. */
- /* If just querying what the existing locale is ... */
- if (locale == NULL) {
- locale_t cur_obj = uselocale((locale_t) 0);
+ PERL_ARGS_ASSERT_UPDATE_PL_CURLOCALES_I;
+ assert(index <= NOMINAL_LC_ALL_INDEX);
-# ifdef DEBUGGING
+ if (index == LC_ALL_INDEX_) {
+ unsigned int i;
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj);
+ /* For LC_ALL, we change all individual categories to correspond */
+ /* PL_curlocales is a parallel array, so has same
+ * length as 'categories' */
+ for (i = 0; i < LC_ALL_INDEX_; i++) {
+ Safefree(PL_curlocales[i]);
+ PL_curlocales[i] = savepv(new_locale);
}
-# endif
+ Safefree(PL_cur_LC_ALL);
+ PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
+ return PL_cur_LC_ALL;
+ }
- if (cur_obj == LC_GLOBAL_LOCALE) {
- return my_setlocale(category, NULL);
- }
+ /* Update the single category's record */
+ Safefree(PL_curlocales[index]);
+ PL_curlocales[index] = savepv(new_locale);
-# ifdef HAS_QUERYLOCALE
+ /* And also LC_ALL if the input says to, including if this is the final
+ * iteration of a loop updating all sub-categories */
+ if ( recalc_LC_ALL == YES_RECALC_LC_ALL
+ || ( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
+ && index == NOMINAL_LC_ALL_INDEX - 1))
+ {
+ Safefree(PL_cur_LC_ALL);
+ PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
+ }
- return (char *) querylocale(mask, cur_obj);
+ return PL_curlocales[index];
+}
-# else
+# endif /* Need PL_curlocales[] */
- /* If this assert fails, adjust the size of curlocales in intrpvar.h */
- STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX);
+STATIC const char *
+S_setlocale_from_aggregate_LC_ALL(pTHX_ const char * locale, const line_t line)
+{
+ /* This function parses the value of the LC_ALL locale, assuming glibc
+ * syntax, and sets each individual category on the system to the proper
+ * value.
+ *
+ * This is likely to only ever be called from one place, so exists to make
+ * the calling function easier to read by moving this ancillary code out of
+ * the main line.
+ *
+ * The locale for each category is independent of the other categories.
+ * Often, they are all the same, but certainly not always. Perl, in fact,
+ * usually keeps LC_NUMERIC in the C locale, regardless of the underlying
+ * locale. LC_ALL has to be able to represent the case of when there are
+ * varying locales. Platforms have differing ways of representing this.
+ * Because of this, the code in this file goes to lengths to avoid the
+ * issue, generally looping over the component categories instead of
+ * referring to them in the aggregate, wherever possible. However, there
+ * are cases where we have to parse our own constructed aggregates, which use
+ * the glibc syntax. */
+
+ const char * locale_on_entry = querylocale_c(LC_ALL);
+
+ PERL_ARGS_ASSERT_SETLOCALE_FROM_AGGREGATE_LC_ALL;
+
+ /* If the string that gives what to set doesn't include all categories,
+ * the omitted ones get set to "C". To get this behavior, first set
+ * all the individual categories to "C", and override the furnished
+ * ones below. FALSE => No need to recalculate LC_ALL, as this is a
+ * temporary state */
+ if (! emulate_setlocale_c(LC_ALL, "C", DONT_RECALC_LC_ALL, line)) {
+ setlocale_failure_panic_c(LC_ALL, locale_on_entry,
+ "C", __LINE__, line);
+ NOT_REACHED; /* NOTREACHED */
+ }
-# if defined(_NL_LOCALE_NAME) \
- && defined(DEBUGGING) \
- /* On systems that accept any locale name, the real underlying \
- * locale is often returned by this internal function, so we \
- * can't use it */ \
- && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME)
- {
- /* Internal glibc for querylocale(), but doesn't handle
- * empty-string ("") locale properly; who knows what other
- * glitches. Check for it now, under debug. */
-
- char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category),
- uselocale((locale_t) 0));
- /*
- PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL");
- PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index);
- PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]);
- */
- if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) {
- if ( strNE(PL_curlocales[index], temp_name)
- && ! ( isNAME_C_OR_POSIX(temp_name)
- && isNAME_C_OR_POSIX(PL_curlocales[index]))) {
-
-# ifdef USE_C_BACKTRACE
-
- dump_c_backtrace(Perl_debug_log, 20, 1);
+ const char * s = locale;
+ const char * e = locale + strlen(locale);
+ while (s < e) {
+ const char * p = s;
-# endif
+ /* Parse through the category */
+ while (isWORDCHAR(*p)) {
+ p++;
+ }
- Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is"
- " (%s) and what internal glibc thinks"
- " (%s)\n", category_names[index],
- PL_curlocales[index], temp_name);
- }
+ const char * category_end = p;
- return temp_name;
- }
+ if (*p++ != '=') {
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected character in locale category name '%s"
+ "<-- HERE",
+ get_displayable_string(s, p - 1, 0)));
}
-# endif
+ /* Parse through the locale name */
+ const char * name_start = p;
+ while (p < e && *p != ';') {
+ p++;
+ }
+ if (UNLIKELY( p < e && *p != ';')) {
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected character in locale name '%s<-- HERE",
+ get_displayable_string(s, p, 0)));
+ }
- /* Without querylocale(), we have to use our record-keeping we've
- * done. */
+ const char * name_end = p;
- if (category != LC_ALL) {
+ /* Space past the semi-colon */
+ if (p < e) {
+ p++;
+ }
-# ifdef DEBUGGING
+ /* Find the index of the category name in our lists */
+ for (PERL_UINT_FAST8_T i = 0; i < LC_ALL_INDEX_; i++) {
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]);
+ /* Keep going if this index doesn't point to the category being
+ * parsed. The strnNE() avoids a Perl_form(), but would fail if
+ * ever a category name could be a substring of another one, e.g.,
+ * if there were a "LC_TIME_DATE" */
+ if strnNE(s, category_names[i], category_end - s) {
+ continue;
}
-# endif
-
- return PL_curlocales[index];
- }
- else { /* For LC_ALL */
- unsigned int i;
- Size_t names_len = 0;
- char * all_string;
- bool are_all_categories_the_same_locale = TRUE;
+ /* Here i points to the category being parsed. Now isolate the
+ * locale it is being changed to */
+ const char * individ_locale = Perl_form(aTHX_ "%.*s",
+ (int) (name_end - name_start), name_start);
- /* If we have a valid LC_ALL value, just return it */
- if (PL_curlocales[LC_ALL_INDEX]) {
-
-# ifdef DEBUGGING
+ /* And do the change. Don't recalculate LC_ALL; we'll do it
+ * ourselves after the loop */
+ if (! emulate_setlocale_i(i, individ_locale,
+ DONT_RECALC_LC_ALL, line))
+ {
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]);
+ /* But if we have to back out, do fix up LC_ALL */
+ if (! emulate_setlocale_c(LC_ALL, locale_on_entry,
+ YES_RECALC_LC_ALL, line))
+ {
+ setlocale_failure_panic_i(i, individ_locale,
+ locale, __LINE__, line);
+ NOT_REACHED; /* NOTREACHED */
}
-# endif
-
- return PL_curlocales[LC_ALL_INDEX];
+ /* Reverting to the entry value succeeded, but the operation
+ * failed to go to the requested locale. */
+ return NULL;
}
- /* Otherwise, we need to construct a string of name=value pairs.
- * We use the glibc syntax, like
- * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
- * First calculate the needed size. Along the way, check if all
- * the locale names are the same */
- for (i = 0; i < LC_ALL_INDEX; i++) {
-
-# ifdef DEBUGGING
+ /* Found and handled the desired category. Quit the inner loop to
+ * try the next category */
+ break;
+ }
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
- }
+ /* Finished with this category; iterate to the next one in the input */
+ s = p;
+ }
-# endif
+# ifdef USE_PL_CURLOCALES
- names_len += strlen(category_names[i])
- + 1 /* '=' */
- + strlen(PL_curlocales[i])
- + 1; /* ';' */
+ /* Here we have set all the individual categories. Update the LC_ALL entry
+ * as well. We can't just use the input 'locale' as the value may omit
+ * categories whose locale is 'C'. khw thinks it's better to store a
+ * complete LC_ALL. So calculate it. */
+ const char * retval = savepv(calculate_LC_ALL(PL_curlocales));
+ Safefree(PL_cur_LC_ALL);
+ PL_cur_LC_ALL = retval;
- if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) {
- are_all_categories_the_same_locale = FALSE;
- }
- }
+# else
- /* If they are the same, we don't actually have to construct the
- * string; we just make the entry in LC_ALL_INDEX valid, and be
- * that single name */
- if (are_all_categories_the_same_locale) {
- PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]);
- return PL_curlocales[LC_ALL_INDEX];
- }
+ const char * retval = querylocale_c(LC_ALL);
- names_len++; /* Trailing '\0' */
- SAVEFREEPV(Newx(all_string, names_len, char));
- *all_string = '\0';
+# endif
- /* Then fill in the string */
- for (i = 0; i < LC_ALL_INDEX; i++) {
+ return retval;
+}
-# ifdef DEBUGGING
+STATIC const char *
+S_emulate_setlocale_i(pTHX_
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]);
- }
+ /* Our internal index of the 'category' setlocale is called with */
+ const unsigned int index,
-# endif
+ const char * new_locale, /* The locale to set the category to */
+ const recalc_lc_all_t recalc_LC_ALL, /* Explained below */
+ const line_t line /* Called from this line number */
+ )
+{
+ PERL_ARGS_ASSERT_EMULATE_SETLOCALE_I;
+ assert(index <= NOMINAL_LC_ALL_INDEX);
- my_strlcat(all_string, category_names[i], names_len);
- my_strlcat(all_string, "=", names_len);
- my_strlcat(all_string, PL_curlocales[i], names_len);
- my_strlcat(all_string, ";", names_len);
- }
+ /* Otherwise could have undefined behavior, as the return of this function
+ * may be copied to this buffer, which this function could change in the
+ * middle of its work */
+ assert(new_locale != PL_stdize_locale_buf);
-# ifdef DEBUGGING
+ /* This function effectively performs a setlocale() on just the current
+ * thread; thus it is thread-safe. It does this by using the POSIX 2008
+ * locale functions to emulate the behavior of setlocale(). Similar to
+ * regular setlocale(), the return from this function points to memory that
+ * can be overwritten by other system calls, so needs to be copied
+ * immediately if you need to retain it. The difference here is that
+ * system calls besides another setlocale() can overwrite it.
+ *
+ * By doing this, most locale-sensitive functions become thread-safe. The
+ * exceptions are mostly those that return a pointer to static memory.
+ *
+ * This function may be called in a tight loop that iterates over all
+ * categories. Because LC_ALL is not a "real" category, but merely the sum
+ * of all the other ones, such loops don't include LC_ALL. On systems that
+ * have querylocale() or similar, the current LC_ALL value is immediately
+ * retrievable; on systems lacking that feature, we have to keep track of
+ * LC_ALL ourselves. We could do that on each iteration, only to throw it
+ * away on the next, but the calculation is more than a trivial amount of
+ * work. Instead, the 'recalc_LC_ALL' parameter is set to
+ * RECALCULATE_LC_ALL_ON_FINAL_INTERATION to only do the calculation once.
+ * This function calls itself recursively in such a loop.
+ *
+ * When not in such a loop, the parameter is set to the other enum values
+ * DONT_RECALC_LC_ALL or YES_RECALC_LC_ALL. */
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string);
- }
+ int mask = category_masks[index];
+ const locale_t entry_obj = uselocale((locale_t) 0);
+ const char * locale_on_entry = querylocale_i(index);
- #endif
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "emulate_setlocale_i input=%d (%s), mask=0x%x,"
+ " new locale=\"%s\", current locale=\"%s\","
+ "index=%d, object=%p\n",
+ categories[index], category_names[index], mask,
+ ((new_locale == NULL) ? "(nil)" : new_locale),
+ locale_on_entry, index, entry_obj));
+
+ /* Return the already-calculated info if just querying what the existing
+ * locale is */
+ if (new_locale == NULL) {
+ return locale_on_entry;
+ }
- return all_string;
+ /* Here, trying to change the locale, but it is a no-op if the new boss is
+ * the same as the old boss. Except this routine is called when converting
+ * from the global locale, so in that case we will create a per-thread
+ * locale below (with the current values). It also seemed that newlocale()
+ * could free up the basis locale memory if we called it with the new and
+ * old being the same, but khw now thinks that this was due to some other
+ * bug, since fixed, as there are other places where newlocale() gets
+ * similarly called without problems. */
+ if ( entry_obj != LC_GLOBAL_LOCALE
+ && locale_on_entry
+ && strEQ(new_locale, locale_on_entry))
+ {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): emulate_setlocale_i"
+ " no-op to change to what it already was\n",
+ line));
+
+# ifdef USE_PL_CURLOCALES
+
+ /* On the final iteration of a loop that needs to recalculate LC_ALL, do
+ * so. If no iteration changed anything, LC_ALL also doesn't change,
+ * but khw believes the complexity needed to keep track of that isn't
+ * worth it. */
+ if (UNLIKELY( recalc_LC_ALL == RECALCULATE_LC_ALL_ON_FINAL_INTERATION
+ && index == NOMINAL_LC_ALL_INDEX - 1))
+ {
+ Safefree(PL_cur_LC_ALL);
+ PL_cur_LC_ALL = savepv(calculate_LC_ALL(PL_curlocales));
}
-# ifdef EINVAL
+# endif
- SETERRNO(EINVAL, LIB_INVARG);
+ return locale_on_entry;
+ }
-# endif
+# ifndef USE_QUERYLOCALE
- return NULL;
+ /* Without a querylocale() mechanism, we have to figure out ourselves what
+ * happens with setting a locale to "" */
+ if (strEQ(new_locale, "")) {
+ new_locale = find_locale_from_environment(index);
+ }
# endif
- } /* End of this being setlocale(LC_foo, NULL) */
+ /* So far, it has worked that a semi-colon in the locale name means that
+ * the category is LC_ALL and it subsumes categories which don't all have
+ * the same locale. This is the glibc syntax. */
+ if (strchr(new_locale, ';')) {
+ assert(index == LC_ALL_INDEX_);
+ return setlocale_from_aggregate_LC_ALL(new_locale, line);
+ }
- /* Here, we are switching locales. */
+# ifdef HAS_GLIBC_LC_MESSAGES_BUG
-# ifndef HAS_QUERYLOCALE
+ /* For this bug, if the LC_MESSAGES locale changes, we have to do an
+ * expensive workaround. Save the current value so we can later determine
+ * if it changed. */
+ const char * old_messages_locale = NULL;
+ if ( (index == LC_MESSAGES_INDEX_ || index == LC_ALL_INDEX_)
+ && LIKELY(PL_phase != PERL_PHASE_CONSTRUCT))
+ {
+ old_messages_locale = querylocale_c(LC_MESSAGES);
+ }
- if (strEQ(locale, "")) {
+# endif
- /* For non-querylocale() systems, we do the setting of "" ourselves to
- * be sure that we really know what's going on. We follow the Linux
- * documented behavior (but if that differs from the actual behavior,
- * this won't work exactly as the OS implements). We go out and
- * examine the environment based on our understanding of how the system
- * works, and use that to figure things out */
+ assert(PL_C_locale_obj);
- const char * const lc_all = PerlEnv_getenv("LC_ALL");
+ /* Now ready to switch to the input 'new_locale' */
- /* Use any "LC_ALL" environment variable, as it overrides everything
- * else. */
- if (lc_all && strNE(lc_all, "")) {
- locale = lc_all;
- }
- else {
+ /* Switching locales generally entails freeing the current one's space (at
+ * the C library's discretion), hence we can't be using that locale at the
+ * time of the switch (this wasn't obvious to khw from the man pages). So
+ * switch to a known locale object that we don't otherwise mess with. */
+ if (! uselocale(PL_C_locale_obj)) {
+
+ /* Not being able to change to the C locale is severe; don't keep
+ * going. */
+ setlocale_failure_panic_i(index, locale_on_entry, "C", __LINE__, line);
+ NOT_REACHED; /* NOTREACHED */
+ }
- /* Otherwise, we need to dig deeper. Unless overridden, the
- * default is the LANG environment variable; if it doesn't exist,
- * then "C" */
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): emulate_setlocale_i now using C"
+ " object=%p\n", line, PL_C_locale_obj));
- const char * default_name;
+ locale_t new_obj;
- default_name = PerlEnv_getenv("LANG");
+ /* We created a (never changing) object at start-up for LC_ALL being in the
+ * C locale. If this call is to switch to LC_ALL=>C, simply use that
+ * object. But in fact, we already have switched to it just above, in
+ * preparation for the general case. Since we're already there, no need to
+ * do further switching. */
+ if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(new_locale)) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "(%" LINE_Tf "):"
+ " emulate_setlocale_i will stay"
+ " in C object\n", line));
+ new_obj = PL_C_locale_obj;
- if (! default_name || strEQ(default_name, "")) {
- default_name = "C";
+ /* And free the old object if it isn't a special one */
+ if (entry_obj != LC_GLOBAL_LOCALE && entry_obj != PL_C_locale_obj) {
+ freelocale(entry_obj);
+ }
+ }
+ else { /* Here is the general case, not to LC_ALL=>C */
+ locale_t basis_obj = entry_obj;
+
+ /* Specially handle two objects */
+ if (basis_obj == LC_GLOBAL_LOCALE || basis_obj == PL_C_locale_obj) {
+
+ /* For these two objects, we make duplicates to hand to newlocale()
+ * below. For LC_GLOBAL_LOCALE, this is because newlocale()
+ * doesn't necessarily accept it as input (the results are
+ * undefined). For PL_C_locale_obj, it is so that it never gets
+ * modified, as otherwise newlocale() is free to do so */
+ basis_obj = duplocale(basis_obj);
+ if (! basis_obj) {
+ locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): duplocale failed",
+ line));
+ NOT_REACHED; /* NOTREACHED */
}
- if (category != LC_ALL) {
- const char * const name = PerlEnv_getenv(category_names[index]);
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): emulate_setlocale_i"
+ " created %p by duping the input\n",
+ line, basis_obj));
+ }
- /* Here we are setting a single category. Assume will have the
- * default name */
- locale = default_name;
+ /* Ready to create a new locale by modification of the existing one.
+ *
+ * NOTE: This code may incorrectly show up as a leak under the address
+ * sanitizer. We do not free this object under normal teardown, however
+ * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
+ */
+ new_obj = newlocale(mask, new_locale, basis_obj);
- /* But then look for an overriding environment variable */
- if (name && strNE(name, "")) {
- locale = name;
- }
+ if (! new_obj) {
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ " (%" LINE_Tf "): emulate_setlocale_i"
+ " creating new object from %p failed:"
+ " errno=%d\n",
+ line, basis_obj, GET_ERRNO));
+
+ /* Failed. Likely this is because the proposed new locale isn't
+ * valid on this system. But we earlier switched to the LC_ALL=>C
+ * locale in anticipation of it succeeding, Now have to switch
+ * back to the state upon entry */
+ if (! uselocale(entry_obj)) {
+ setlocale_failure_panic_i(index, "switching back to",
+ locale_on_entry, __LINE__, line);
+ NOT_REACHED; /* NOTREACHED */
}
- else {
- bool did_override = FALSE;
- unsigned int i;
-
- /* Here, we are getting LC_ALL. Any categories that don't have
- * a corresponding environment variable set should be set to
- * LANG, or to "C" if there is no LANG. If no individual
- * categories differ from this, we can just set LC_ALL. This
- * is buggy on systems that have extra categories that we don't
- * know about. If there is an environment variable that sets
- * that category, we won't know to look for it, and so our use
- * of LANG or "C" improperly overrides it. On the other hand,
- * if we don't do what is done here, and there is no
- * environment variable, the category's locale should be set to
- * LANG or "C". So there is no good solution. khw thinks the
- * best is to look at systems to see what categories they have,
- * and include them, and then to assume that we know the
- * complete set */
-
- for (i = 0; i < LC_ALL_INDEX; i++) {
- const char * const env_override
- = PerlEnv_getenv(category_names[i]);
- const char * this_locale = ( env_override
- && strNE(env_override, ""))
- ? env_override
- : default_name;
- if (! emulate_setlocale(categories[i], this_locale, i, TRUE))
- {
- return NULL;
- }
- if (strNE(this_locale, default_name)) {
- did_override = TRUE;
- }
- }
-
- /* If all the categories are the same, we can set LC_ALL to
- * that */
- if (! did_override) {
- locale = default_name;
- }
- else {
-
- /* Here, LC_ALL is no longer valid, as some individual
- * categories don't match it. We call ourselves
- * recursively, as that will execute the code that
- * generates the proper locale string for this situation.
- * We don't do the remainder of this function, as that is
- * to update our records, and we've just done that for the
- * individual categories in the loop above, and doing so
- * would cause LC_ALL to be done as well */
- return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE);
+# ifdef USE_PL_CURLOCALES
+
+ if (entry_obj == LC_GLOBAL_LOCALE) {
+
+ /* Here, we are back in the global locale. We may never have
+ * set PL_curlocales. If the locale change had succeeded, the
+ * code would have then set them up, but since it didn't, do so
+ * here. khw isn't sure if this prevents some issues or not,
+ * This will calculate LC_ALL's entry only on the final
+ * iteration */
+ POSIX_SETLOCALE_LOCK;
+ for (PERL_UINT_FAST8_T i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ update_PL_curlocales_i(i,
+ posix_setlocale(categories[i], NULL),
+ RECALCULATE_LC_ALL_ON_FINAL_INTERATION);
}
+ POSIX_SETLOCALE_UNLOCK;
}
+# endif
+
+ return NULL;
}
- } /* End of this being setlocale(LC_foo, "") */
- else if (strchr(locale, ';')) {
- /* LC_ALL may actually incude a conglomeration of various categories.
- * Without querylocale, this code uses the glibc (as of this writing)
- * syntax for representing that, but that is not a stable API, and
- * other platforms do it differently, so we have to handle all cases
- * ourselves */
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): emulate_setlocale_i created %p"
+ " while freeing %p\n", line, new_obj, basis_obj));
- unsigned int i;
- const char * s = locale;
- const char * e = locale + strlen(locale);
- const char * p = s;
- const char * category_end;
- const char * name_start;
- const char * name_end;
-
- /* If the string that gives what to set doesn't include all categories,
- * the omitted ones get set to "C". To get this behavior, first set
- * all the individual categories to "C", and override the furnished
- * ones below */
- for (i = 0; i < LC_ALL_INDEX; i++) {
- if (! emulate_setlocale(categories[i], "C", i, TRUE)) {
- return NULL;
- }
+ /* Here, successfully created an object representing the desired
+ * locale; now switch into it */
+ if (! uselocale(new_obj)) {
+ freelocale(new_obj);
+ locale_panic_(Perl_form(aTHX_ "(%" LINE_Tf "): emulate_setlocale_i"
+ " switching into new locale failed",
+ line));
}
+ }
- while (s < e) {
+ /* Here, we are using 'new_obj' which matches the input 'new_locale'. */
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): emulate_setlocale_i now using %p\n",
+ line, new_obj));
- /* Parse through the category */
- while (isWORDCHAR(*p)) {
- p++;
- }
- category_end = p;
+#ifdef MULTIPLICITY
+ PL_cur_locale_obj = new_obj;
+#endif
- if (*p++ != '=') {
- Perl_croak(aTHX_
- "panic: %s: %d: Unexpected character in locale name '%02X",
- __FILE__, __LINE__, *(p-1));
- }
+ /* We are done, except for updating our records (if the system doesn't keep
+ * them) and in the case of locale "", we don't actually know what the
+ * locale that got switched to is, as it came from the environment. So
+ * have to find it */
- /* Parse through the locale name */
- name_start = p;
- while (p < e && *p != ';') {
- if (! isGRAPH(*p)) {
- Perl_croak(aTHX_
- "panic: %s: %d: Unexpected character in locale name '%02X",
- __FILE__, __LINE__, *(p-1));
- }
- p++;
- }
- name_end = p;
+# ifdef USE_QUERYLOCALE
- /* Space past the semi-colon */
- if (p < e) {
- p++;
- }
+ if (strEQ(new_locale, "")) {
+ new_locale = querylocale_i(index);
+ }
- /* Find the index of the category name in our lists */
- for (i = 0; i < LC_ALL_INDEX; i++) {
- char * individ_locale;
+ PERL_UNUSED_ARG(recalc_LC_ALL);
- /* Keep going if this isn't the index. The strnNE() avoids a
- * Perl_form(), but would fail if ever a category name could be
- * a substring of another one, like if there were a
- * "LC_TIME_DATE" */
- if strnNE(s, category_names[i], category_end - s) {
- continue;
- }
+# else
- /* If this index is for the single category we're changing, we
- * have found the locale to set it to. */
- if (category == categories[i]) {
- locale = Perl_form(aTHX_ "%.*s",
- (int) (name_end - name_start),
- name_start);
- goto ready_to_set;
- }
+ new_locale = update_PL_curlocales_i(index, new_locale, recalc_LC_ALL);
- assert(category == LC_ALL);
- individ_locale = Perl_form(aTHX_ "%.*s",
- (int) (name_end - name_start), name_start);
- if (! emulate_setlocale(categories[i], individ_locale, i, TRUE))
- {
- return NULL;
- }
- }
+# endif
+# ifdef HAS_GLIBC_LC_MESSAGES_BUG
- s = p;
+ /* Invalidate the glibc cache of loaded translations if the locale has
+ * changed, see [perl #134264] */
+ if (old_messages_locale) {
+ if (strNE(old_messages_locale, my_querylocale_c(LC_MESSAGES))) {
+ textdomain(textdomain(NULL));
}
+ }
- /* Here we have set all the individual categories by recursive calls.
- * These collectively should have fixed up LC_ALL, so can just query
- * what that now is */
- assert(category == LC_ALL);
+# endif
- return do_setlocale_c(LC_ALL, NULL);
- } /* End of this being setlocale(LC_ALL,
- "LC_CTYPE=foo;LC_NUMERIC=bar;...") */
+ return new_locale;
+}
- ready_to_set: ;
+#endif /* End of the various implementations of the setlocale and
+ querylocale macros used in the remainder of this program */
- /* Here at the end of having to deal with the absence of querylocale().
- * Some cases have already been fully handled by recursive calls to this
- * function. But at this point, we haven't dealt with those, but are now
- * prepared to, knowing what the locale name to set this category to is.
- * This would have come for free if this system had had querylocale() */
+#ifdef USE_LOCALE
-# endif /* end of ! querylocale */
+/* So far, the locale strings returned by modern 2008-compliant systems have
+ * been fine */
- assert(PL_C_locale_obj);
+STATIC const char *
+S_stdize_locale(pTHX_ const int category,
+ const char *input_locale,
+ const char **buf,
+ Size_t *buf_size,
+ const line_t caller_line)
+{
+ /* The return value of setlocale() is opaque, but is required to be usable
+ * as input to a future setlocale() to create the same state.
+ * Unfortunately not all systems are compliant. But most often they are of
+ * a very restricted set of forms that this file has been coded to expect.
+ *
+ * There are some outliers, though, that this function tries to tame:
+ *
+ * 1) A new-line. This function chomps any \n characters
+ * 2) foo=bar. 'bar' is what is generally meant, and the foo= part is
+ * stripped. This form is legal for LC_ALL. When found in
+ * that category group, the function calls itself
+ * recursively on each possible component category to make
+ * sure the individual categories are ok.
+ *
+ * If no changes to the input were made, it is returned; otherwise the
+ * changed version is stored into memory at *buf, with *buf_size set to its
+ * new value, and *buf is returned.
+ */
- /* Switching locales generally entails freeing the current one's space (at
- * the C library's discretion). We need to stop using that locale before
- * the switch. So switch to a known locale object that we don't otherwise
- * mess with. This returns the locale object in effect at the time of the
- * switch. */
- old_obj = uselocale(PL_C_locale_obj);
+ const char * first_bad;
+ const char * retval;
-# ifdef DEBUGGING
+ PERL_ARGS_ASSERT_STDIZE_LOCALE;
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj);
+ if (input_locale == NULL) {
+ return NULL;
}
-# endif
+ first_bad = strpbrk(input_locale, "=\n");
- if (! old_obj) {
+ /* Most likely, there isn't a problem with the input */
+ if (LIKELY(! first_bad)) {
+ return input_locale;
+ }
-# ifdef DEBUGGING
+# ifdef LC_ALL
+
+ /* But if there is, and the category is LC_ALL, we have to look at each
+ * component category */
+ if (category == LC_ALL) {
+ const char * individ_locales[LC_ALL_INDEX_];
+ bool made_changes = FALSE;
+ unsigned int i;
- if (DEBUG_L_TEST || debug_initialization) {
- dSAVE_ERRNO;
- PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
- RESTORE_ERRNO;
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ Size_t this_size = 0;
+ individ_locales[i] = stdize_locale(categories[i],
+ posix_setlocale(categories[i],
+ NULL),
+ &individ_locales[i],
+ &this_size,
+ caller_line);
+
+ /* If the size didn't change, it means this category did not have
+ * to be adjusted, and individ_locales[i] points to the buffer
+ * returned by posix_setlocale(); we have to copy that before
+ * it's called again in the next iteration */
+ if (this_size == 0) {
+ individ_locales[i] = savepv(individ_locales[i]);
+ }
+ else {
+ made_changes = TRUE;
+ }
}
-# endif
+ /* If all the individual categories were ok as-is, this was a false
+ * alarm. We must have seen an '=' which was a legal occurrence in
+ * this combination locale */
+ if (! made_changes) {
+ retval = input_locale; /* The input can be returned unchanged */
+ }
+ else {
+ retval = save_to_buffer(querylocale_c(LC_ALL), buf, buf_size);
+ }
- return NULL;
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ Safefree(individ_locales[i]);
+ }
+
+ return retval;
}
-# ifdef DEBUGGING
+# else /* else no LC_ALL */
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: emulate_setlocale now using %p\n",
- __FILE__, __LINE__, PL_C_locale_obj);
- }
+ PERL_UNUSED_ARG(category);
+ PERL_UNUSED_ARG(caller_line);
-# endif
+# endif
- /* If this call is to switch to the LC_ALL C locale, it already exists, and
- * in fact, we already have switched to it (in preparation for what
- * normally is to come). But since we're already there, continue to use
- * it instead of trying to create a new locale */
- if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) {
+ /* Here, there was a problem in an individual category. This means that at
+ * least one adjustment will be necessary. Create a modifiable copy */
+ retval = save_to_buffer(input_locale, buf, buf_size);
-# ifdef DEBUGGING
+ if (*first_bad != '=') {
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: will stay in C object\n", __FILE__, __LINE__);
- }
+ /* Translate the found position into terms of the copy */
+ first_bad = retval + (first_bad - input_locale);
+ }
+ else { /* An '=' */
-# endif
+ /* It is unlikely that the return is so screwed-up that it contains
+ * multiple equals signs, but handle that case by stripping all of
+ * them. */
+ const char * final_equals = strrchr(retval, '=');
- new_obj = PL_C_locale_obj;
+ /* The length passed here causes the move to include the terminating
+ * NUL */
+ Move(final_equals + 1, retval, strlen(final_equals), char);
+
+ /* See if there are additional problems; if not, we're good to return.
+ * */
+ first_bad = strpbrk(retval, "\n");
- /* We already had switched to the C locale in preparation for freeing
- * 'old_obj' */
- if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) {
- freelocale(old_obj);
+ if (! first_bad) {
+ return retval;
}
}
- else {
- /* If we weren't in a thread safe locale, set so that newlocale() below
- * which uses 'old_obj', uses an empty one. Same for our reserved C
- * object. The latter is defensive coding, so that, even if there is
- * some bug, we will never end up trying to modify either of these, as
- * if passed to newlocale(), they can be. */
- if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) {
- old_obj = (locale_t) 0;
- }
- /* Ready to create a new locale by modification of the exising one */
- new_obj = newlocale(mask, locale, old_obj);
+ /* Here, the problem must be a \n. Get rid of it and what follows.
+ * (Originally, only a trailing \n was stripped. Unsure what to do if not
+ * trailing) */
+ *((char *) first_bad) = '\0';
+ return retval;
+}
- if (! new_obj) {
- dSAVE_ERRNO;
+#if defined(WIN32) || ( defined(USE_POSIX_2008_LOCALE) \
+ && ! defined(USE_QUERYLOCALE))
-# ifdef DEBUGGING
+STATIC const char *
+S_find_locale_from_environment(pTHX_ const unsigned int index)
+{
+ /* NB: This function may actually change the locale on Windows.
+ *
+ * On Windows systems, the concept of the POSIX ordering of environment
+ * variables is missing. To increase portability of programs across
+ * platforms, the POSIX ordering is emulated on Windows.
+ *
+ * And on POSIX 2008 systems without querylocale(), it is problematic
+ * getting the results of the POSIX 2008 equivalent of
+ * setlocale(category, "")
+ * (which gets the locale from the environment).
+ *
+ * To ensure that we know exactly what those values are, we do the setting
+ * ourselves, using the documented algorithm (assuming the documentation is
+ * correct) rather than use "" as the locale. This will lead to results
+ * that differ from native behavior if the native behavior differs from the
+ * standard documented value, but khw believes it is better to know what's
+ * going on, even if different from native, than to just guess.
+ *
+ * Another option for the POSIX 2008 case would be, in a critical section,
+ * to save the global locale's current value, and do a straight
+ * setlocale(LC_ALL, ""). That would return our desired values, destroying
+ * the global locale's, which we would then restore. But that could cause
+ * races with any other thread that is using the global locale and isn't
+ * using the mutex. And, the only reason someone would have done that is
+ * because they are calling a library function, like in gtk, that calls
+ * setlocale(), and which can't be changed to use the mutex. That wouldn't
+ * be a problem if this were to be done before any threads had switched,
+ * say during perl construction time. But this code would still be needed
+ * for the general case.
+ *
+ * The Windows and POSIX 2008 differ in that the ultimate fallback is "C"
+ * in POSIX, and is the system default locale in Windows. To get that
+ * system default value, we actually have to call setlocale() on Windows.
+ */
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: emulate_setlocale creating new object"
- " failed: %d\n", __FILE__, __LINE__, GET_ERRNO);
- }
+ /* We rely on PerlEnv_getenv() returning a mortalized copy */
+ const char * const lc_all = PerlEnv_getenv("LC_ALL");
-# endif
+ /* Use any "LC_ALL" environment variable, as it overrides everything
+ * else. */
+ if (lc_all && strNE(lc_all, "")) {
+ return lc_all;
+ }
- if (! uselocale(old_obj)) {
+ /* If setting an individual category, use its corresponding value found in
+ * the environment, if any */
+ if (index != LC_ALL_INDEX_) {
+ const char * const new_value = PerlEnv_getenv(category_names[index]);
-# ifdef DEBUGGING
+ if (new_value && strNE(new_value, "")) {
+ return new_value;
+ }
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: switching back failed: %d\n",
- __FILE__, __LINE__, GET_ERRNO);
- }
+ /* If no corresponding environment variable, see if LANG exists. If
+ * so, use it. */
+ const char * default_name = PerlEnv_getenv("LANG");
+ if (default_name && strNE(default_name, "")) {
+ return default_name;
+ }
+ /* If no LANG, use "C" on POSIX 2008, the system default on Windows */
+# ifndef WIN32
+ return "C";
+# else
+ return wrap_wsetlocale(categories[index], "");
# endif
- }
- RESTORE_ERRNO;
- return NULL;
- }
+ }
-# ifdef DEBUGGING
+ /* Here is LC_ALL, and no LC_ALL environment variable. LANG is used as a
+ * default, but overridden for individual categories that have
+ * corresponding environment variables. If no LANG exists, the default is
+ * "C" on POSIX 2008, or the system default for the category on Windows. */
+ const char * default_name = PerlEnv_getenv("LANG");
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: emulate_setlocale created %p",
- __FILE__, __LINE__, new_obj);
- if (old_obj) {
- PerlIO_printf(Perl_debug_log,
- "; should have freed %p", old_obj);
- }
- PerlIO_printf(Perl_debug_log, "\n");
+ /* Convert "" to NULL to save conditionals in the loop below */
+ if (default_name != NULL && strEQ(default_name, "")) {
+ default_name = NULL;
+ }
+
+ /* Loop through all the individual categories, setting each to any
+ * corresponding environment variable; or to the default if none exists for
+ * the category */
+ const char * locale_names[LC_ALL_INDEX_];
+ for (unsigned i = 0; i < LC_ALL_INDEX_; i++) {
+ const char * const env_override = PerlEnv_getenv(category_names[i]);
+
+ if (env_override && strNE(env_override, "")) {
+ locale_names[i] = env_override;
+ }
+ else if (default_name) {
+ locale_names[i] = default_name;
}
+ else {
+# ifndef WIN32
+ locale_names[i] = "C";
+# else
+ locale_names[i] = wrap_wsetlocale(categories[index], "");
# endif
- /* And switch into it */
- if (! uselocale(new_obj)) {
- dSAVE_ERRNO;
+ }
-# ifdef DEBUGGING
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "find_locale_from_environment i=%d, name=%s, locale=%s\n",
+ i, category_names[i], locale_names[i]));
+ }
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: emulate_setlocale switching to new object"
- " failed\n", __FILE__, __LINE__);
- }
+ return calculate_LC_ALL(locale_names);
+}
+#endif
+#if defined(WIN32) || defined(USE_POSIX_2008_LOCALE) || ! defined(LC_ALL)
+
+STATIC
+const char *
+
+# ifdef USE_QUERYLOCALE
+S_calculate_LC_ALL(pTHX_ const locale_t cur_obj)
+# else
+S_calculate_LC_ALL(pTHX_ const char ** individ_locales)
# endif
- if (! uselocale(old_obj)) {
+{
+ /* For POSIX 2008, we have to figure out LC_ALL ourselves when needed.
+ * querylocale(), on systems that have it, doesn't tend to work for LC_ALL.
+ * So we have to construct the answer ourselves based on the passed in
+ * data, which is either a locale_t object, for systems with querylocale(),
+ * or an array we keep updated to the proper values, otherwise.
+ *
+ * For Windows, we also may need to construct an LC_ALL when setting the
+ * locale to the system default.
+ *
+ * This function returns a mortalized string containing the locale name(s)
+ * of LC_ALL.
+ *
+ * If all individual categories are the same locale, we can just set LC_ALL
+ * to that locale. But if not, we have to create an aggregation of all the
+ * categories on the system. Platforms differ as to the syntax they use
+ * for these non-uniform locales for LC_ALL. Some use a '/' or other
+ * delimiter of the locales with a predetermined order of categories; a
+ * Configure probe would be needed to tell us how to decipher those. glibc
+ * and Windows use a series of name=value pairs, like
+ * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;...
+ * This function returns that syntax, which is suitable for input to the
+ * Windows setlocale(). It could also be suitable for glibc, but because
+ * the non-Windows code is common to systems that use a different syntax,
+ * we don't depend on it for glibc. Instead we take care not to use the
+ * native setlocale() function on whatever non-Windows style is chosen.
+ * But, it would be possible for someone to call Perl_setlocale() using a
+ * native style we don't understand. So far no one has complained.
+ *
+ * For systems that have categories we don't know about, the algorithm
+ * below won't know about those missing categories, leading to potential
+ * bugs for code that looks at them. If there is an environment variable
+ * that sets that category, we won't know to look for it, and so our use of
+ * LANG or "C" improperly overrides it. On the other hand, if we don't do
+ * what is done here, and there is no environment variable, the category's
+ * locale should be set to LANG or "C". So there is no good solution. khw
+ * thinks the best is to make sure we have a complete list of possible
+ * categories, adding new ones as they show up on obscure platforms.
+ */
+
+ unsigned int i;
+ Size_t names_len = 0;
+ bool are_all_categories_the_same_locale = TRUE;
+ char * aggregate_locale;
+ char * previous_start = NULL;
+ char * this_start = NULL;
+ Size_t entry_len = 0;
-# ifdef DEBUGGING
+ PERL_ARGS_ASSERT_CALCULATE_LC_ALL;
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: switching back failed: %d\n",
- __FILE__, __LINE__, GET_ERRNO);
- }
+ /* First calculate the needed size for the string listing the categories
+ * and their locales. */
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+# ifdef USE_QUERYLOCALE
+ const char * entry = querylocale_l(i, cur_obj);
+# else
+ const char * entry = individ_locales[i];
# endif
- }
- freelocale(new_obj);
- RESTORE_ERRNO;
- return NULL;
- }
+ names_len += strlen(category_names[i])
+ + 1 /* '=' */
+ + strlen(entry)
+ + 1; /* ';' */
}
-# ifdef DEBUGGING
+ names_len++; /* Trailing '\0' */
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: emulate_setlocale now using %p\n",
- __FILE__, __LINE__, new_obj);
- }
+ /* Allocate enough space for the aggregated string */
+ Newxz(aggregate_locale, names_len, char);
+ SAVEFREEPV(aggregate_locale);
+
+ /* Then fill it in */
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ Size_t new_len;
+# ifdef USE_QUERYLOCALE
+ const char * entry = querylocale_l(i, cur_obj);
+# else
+ const char * entry = individ_locales[i];
# endif
- /* We are done, except for updating our records (if the system doesn't keep
- * them) and in the case of locale "", we don't actually know what the
- * locale that got switched to is, as it came from the environment. So
- * have to find it */
+ new_len = my_strlcat(aggregate_locale, category_names[i], names_len);
+ assert(new_len <= names_len);
+ new_len = my_strlcat(aggregate_locale, "=", names_len);
+ assert(new_len <= names_len);
-# ifdef HAS_QUERYLOCALE
+ this_start = aggregate_locale + strlen(aggregate_locale);
+ entry_len = strlen(entry);
- if (strEQ(locale, "")) {
- locale = querylocale(mask, new_obj);
+ new_len = my_strlcat(aggregate_locale, entry, names_len);
+ assert(new_len <= names_len);
+ new_len = my_strlcat(aggregate_locale, ";", names_len);
+ assert(new_len <= names_len);
+ PERL_UNUSED_VAR(new_len); /* Only used in DEBUGGING */
+
+ if ( i > 0
+ && are_all_categories_the_same_locale
+ && memNE(previous_start, this_start, entry_len + 1))
+ {
+ are_all_categories_the_same_locale = FALSE;
+ }
+ else {
+ previous_start = this_start;
+ }
}
-# else
+ /* If they are all the same, just return any one of them */
+ if (are_all_categories_the_same_locale) {
+ aggregate_locale = this_start;
+ aggregate_locale[entry_len] = '\0';
+ }
- /* Here, 'locale' is the return value */
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "calculate_LC_ALL returning '%s'\n",
+ aggregate_locale));
- /* Without querylocale(), we have to update our records */
+ return aggregate_locale;
+}
- if (category == LC_ALL) {
- unsigned int i;
+#endif
+#if defined(USE_LOCALE) && ( defined(DEBUGGING) \
+ || defined(USE_PERL_SWITCH_LOCALE_CONTEXT))
- /* For LC_ALL, we change all individual categories to correspond */
- /* PL_curlocales is a parallel array, so has same
- * length as 'categories' */
- for (i = 0; i <= LC_ALL_INDEX; i++) {
- Safefree(PL_curlocales[i]);
- PL_curlocales[i] = savepv(locale);
- }
+STATIC const char *
+S_get_LC_ALL_display(pTHX)
+{
- FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX);
- }
- else {
+# ifdef LC_ALL
- /* For a single category, if it's not the same as the one in LC_ALL, we
- * nullify LC_ALL */
+ return querylocale_c(LC_ALL);
- if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) {
- Safefree(PL_curlocales[LC_ALL_INDEX]);
- PL_curlocales[LC_ALL_INDEX] = NULL;
- }
+# else
- /* Then update the category's record */
- Safefree(PL_curlocales[index]);
- PL_curlocales[index] = savepv(locale);
+ const char * curlocales[NOMINAL_LC_ALL_INDEX];
- FIX_GLIBC_LC_MESSAGES_BUG(index);
+ for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ curlocales[i] = querylocale_i(i);
}
+ return calculate_LC_ALL(curlocales);
+
# endif
- return locale;
}
-#endif /* USE_POSIX_2008_LOCALE */
-
-#ifdef USE_LOCALE
+#endif
STATIC void
-S_set_numeric_radix(pTHX_ const bool use_locale)
+S_setlocale_failure_panic_i(pTHX_
+ const unsigned int cat_index,
+ const char * current,
+ const char * failed,
+ const line_t caller_0_line,
+ const line_t caller_1_line)
{
- /* If 'use_locale' is FALSE, set to use a dot for the radix character. If
- * TRUE, use the radix character derived from the current locale */
-
-#if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \
- || defined(HAS_NL_LANGINFO))
+ dSAVE_ERRNO;
+ const int cat = categories[cat_index];
+ const char * name = category_names[cat_index];
- const char * radix = (use_locale)
- ? my_nl_langinfo(RADIXCHAR, FALSE)
- /* FALSE => already in dest locale */
- : ".";
+ PERL_ARGS_ASSERT_SETLOCALE_FAILURE_PANIC_I;
- sv_setpv(PL_numeric_radix_sv, radix);
-
- /* If this is valid UTF-8 that isn't totally ASCII, and we are in
- * a UTF-8 locale, then mark the radix as being in UTF-8 */
- if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv),
- SvCUR(PL_numeric_radix_sv))
- && _is_cur_LC_category_utf8(LC_NUMERIC))
- {
- SvUTF8_on(PL_numeric_radix_sv);
+ if (current == NULL) {
+ current = querylocale_i(cat_index);
}
-# ifdef DEBUGGING
-
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n",
- SvPVX(PL_numeric_radix_sv),
- cBOOL(SvUTF8(PL_numeric_radix_sv)));
- }
+ Perl_locale_panic(Perl_form(aTHX_ "(%" LINE_Tf
+ "): Can't change locale for %s(%d)"
+ " from '%s' to '%s'",
+ caller_1_line, name, cat,
+ current, failed),
+ __FILE__, caller_0_line, GET_ERRNO);
+ NOT_REACHED; /* NOTREACHED */
+}
+/* Any of these will allow us to find the RADIX */
+# if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_SOME_LANGINFO) \
+ || defined(HAS_LOCALECONV) \
+ || defined(HAS_SNPRINTF))
+# define CAN_CALCULATE_RADIX
# endif
-#else
-
- PERL_UNUSED_ARG(use_locale);
-
-#endif /* USE_LOCALE_NUMERIC and can find the radix char */
-
-}
+# ifdef USE_LOCALE_NUMERIC
STATIC void
-S_new_numeric(pTHX_ const char *newnum)
+S_new_numeric(pTHX_ const char *newnum, bool force)
{
-
-#ifndef USE_LOCALE_NUMERIC
-
- PERL_UNUSED_ARG(newnum);
-
-#else
+ PERL_ARGS_ASSERT_NEW_NUMERIC;
/* Called after each libc setlocale() call affecting LC_NUMERIC, to tell
- * core Perl this and that 'newnum' is the name of the new locale.
- * It installs this locale as the current underlying default.
+ * core Perl this and that 'newnum' is the name of the new locale, and we
+ * are switched into it. It installs this locale as the current underlying
+ * default, and then switches to the C locale, if necessary, so that the
+ * code that has traditionally expected the radix character to be a dot may
+ * continue to do so.
*
* The default locale and the C locale can be toggled between by use of the
* set_numeric_underlying() and set_numeric_standard() functions, which
@@ -1285,8 +1866,8 @@ S_new_numeric(pTHX_ const char *newnum)
* SET_NUMERIC_STANDARD() in perl.h.
*
* The toggling is necessary mainly so that a non-dot radix decimal point
- * character can be output, while allowing internal calculations to use a
- * dot.
+ * character can be input and output, while allowing internal calculations
+ * to use a dot.
*
* This sets several interpreter-level variables:
* PL_numeric_name The underlying locale's name: a copy of 'newnum'
@@ -1305,101 +1886,155 @@ S_new_numeric(pTHX_ const char *newnum)
* variables are true at the same time. (Toggling is a
* no-op under these circumstances.) This variable is
* used to avoid having to recalculate.
+ * PL_numeric_radix_sv Contains the string that code should use for the
+ * decimal point. It is set to either a dot or the
+ * program's underlying locale's radix character string,
+ * depending on the situation.
+ * PL_underlying_radix_sv Contains the program's underlying locale's radix
+ * character string. This is copied into
+ * PL_numeric_radix_sv when the situation warrants. It
+ * exists to avoid having to recalculate it when toggling.
+ * PL_underlying_numeric_obj = (only on POSIX 2008 platforms) An object
+ * with everything set up properly so as to avoid work on
+ * such platforms.
*/
- char *save_newnum;
+ DEBUG_L( PerlIO_printf(Perl_debug_log,
+ "Called new_numeric with %s, PL_numeric_name=%s\n",
+ newnum, PL_numeric_name));
+
+ /* If not forcing this procedure, and there isn't actually a change from
+ * our records, do nothing. (Our records can be wrong when sync'ing to the
+ * locale set up by an external library, hence the 'force' parameter) */
+ if (! force && strEQ(PL_numeric_name, newnum)) {
+ return;
+ }
+
+ Safefree(PL_numeric_name);
+ PL_numeric_name = savepv(newnum);
- if (! newnum) {
- Safefree(PL_numeric_name);
- PL_numeric_name = NULL;
+ /* Handle the trivial case. Since this is called at process
+ * initialization, be aware that this bit can't rely on much being
+ * available. */
+ if (isNAME_C_OR_POSIX(PL_numeric_name)) {
PL_numeric_standard = TRUE;
- PL_numeric_underlying = TRUE;
PL_numeric_underlying_is_standard = TRUE;
+ PL_numeric_underlying = TRUE;
+ sv_setpv(PL_numeric_radix_sv, C_decimal_point);
+ sv_setpv(PL_underlying_radix_sv, C_decimal_point);
return;
}
- save_newnum = stdize_locale(savepv(newnum));
+ /* We are in the underlying locale until changed at the end of this
+ * function */
PL_numeric_underlying = TRUE;
- PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum);
-#ifndef TS_W32_BROKEN_LOCALECONV
+# ifdef USE_POSIX_2008_LOCALE
- /* If its name isn't C nor POSIX, it could still be indistinguishable from
- * them. But on broken Windows systems calling my_nl_langinfo() for
- * THOUSEP can currently (but rarely) cause a race, so avoid doing that,
- * and just always change the locale if not C nor POSIX on those systems */
- if (! PL_numeric_standard) {
- PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR,
- FALSE /* Don't toggle locale */ ))
- && strEQ("", my_nl_langinfo(THOUSEP, FALSE)));
- }
+ /* We keep a special object for easy switching to.
+ *
+ * NOTE: This code may incorrectly show up as a leak under the address
+ * sanitizer. We do not free this object under normal teardown, however
+ * you can set PERL_DESTRUCT_LEVEL=2 to cause it to be freed.
+ */
+ PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
+ PL_numeric_name,
+ PL_underlying_numeric_obj);
-#endif
+# endif
- /* Save the new name if it isn't the same as the previous one, if any */
- if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) {
- Safefree(PL_numeric_name);
- PL_numeric_name = save_newnum;
- }
- else {
- Safefree(save_newnum);
- }
+ const char * radix = NULL;
+ utf8ness_t utf8ness = UTF8NESS_IMMATERIAL;
- PL_numeric_underlying_is_standard = PL_numeric_standard;
+ /* Find and save this locale's radix character. */
+ my_langinfo_c(RADIXCHAR, LC_NUMERIC, PL_numeric_name,
+ &radix, NULL, &utf8ness);
+ sv_setpv(PL_underlying_radix_sv, radix);
-# ifdef HAS_POSIX_2008_LOCALE
+ if (utf8ness == UTF8NESS_YES) {
+ SvUTF8_on(PL_underlying_radix_sv);
+ }
- PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK,
- PL_numeric_name,
- PL_underlying_numeric_obj);
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Locale radix is '%s', ?UTF-8=%d\n",
+ SvPVX(PL_underlying_radix_sv),
+ cBOOL(SvUTF8(PL_underlying_radix_sv))));
-#endif
+ /* This locale is indistinguishable from C (for numeric purposes) if both
+ * the radix character and the thousands separator are the same as C's.
+ * Start with the radix. */
+ PL_numeric_underlying_is_standard = strEQ(C_decimal_point, radix);
+ Safefree(radix);
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name);
- }
+# ifndef TS_W32_BROKEN_LOCALECONV
- /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't
- * have to worry about the radix being a non-dot. (Core operations that
- * need the underlying locale change to it temporarily). */
- if (PL_numeric_standard) {
- set_numeric_radix(0);
+ /* If the radix isn't the same as C's, we know it is distinguishable from
+ * C; otherwise check the thousands separator too. Only if both are the
+ * same as C's is the locale indistinguishable from C.
+ *
+ * But on earlier Windows versions, there is a potential race. This code
+ * knows that localeconv() (elsewhere in this file) will be used to extract
+ * the needed value, and localeconv() was buggy for quite a while, and that
+ * code in this file hence uses a workaround. And that workaround may have
+ * an (unlikely) race. Gathering the radix uses a different workaround on
+ * Windows that doesn't involve a race. It might be possible to do the
+ * same for this (patches welcome).
+ *
+ * Until then khw doesn't think it's worth even the small risk of a race to
+ * get this value, which doesn't appear to be used in any of the Microsoft
+ * library routines anyway. */
+
+ const char * scratch_buffer = NULL;
+ if (PL_numeric_underlying_is_standard) {
+ PL_numeric_underlying_is_standard = strEQ(C_thousands_sep,
+ my_langinfo_c(THOUSEP, LC_NUMERIC,
+ PL_numeric_name,
+ &scratch_buffer,
+ NULL, NULL));
}
- else {
+ Safefree(scratch_buffer);
+
+# else
+ PERL_UNUSED_VAR(C_thousands_sep);
+# endif
+
+ PL_numeric_standard = PL_numeric_underlying_is_standard;
+
+ /* Keep LC_NUMERIC so that it has the C locale radix and thousands
+ * separator. This is for XS modules, so they don't have to worry about
+ * the radix being a non-dot. (Core operations that need the underlying
+ * locale change to it temporarily). */
+ if (! PL_numeric_standard) {
set_numeric_standard();
}
-#endif /* USE_LOCALE_NUMERIC */
-
}
+# endif
+
void
Perl_set_numeric_standard(pTHX)
{
-#ifdef USE_LOCALE_NUMERIC
-
- /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like
- * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The
- * macro avoids calling this routine if toggling isn't necessary according
- * to our records (which could be wrong if some XS code has changed the
- * locale behind our back) */
-
-# ifdef DEBUGGING
+# ifdef USE_LOCALE_NUMERIC
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "Setting LC_NUMERIC locale to standard C\n");
- }
+ /* Unconditionally toggle the LC_NUMERIC locale to the C locale
+ *
+ * Most code should use the macro SET_NUMERIC_STANDARD() in perl.h
+ * instead of calling this directly. The macro avoids calling this routine
+ * if toggling isn't necessary according to our records (which could be
+ * wrong if some XS code has changed the locale behind our back) */
-# endif
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Setting LC_NUMERIC locale to standard C\n"));
- do_setlocale_c(LC_NUMERIC, "C");
+ void_setlocale_c(LC_NUMERIC, "C");
PL_numeric_standard = TRUE;
+ sv_setpv(PL_numeric_radix_sv, C_decimal_point);
+
PL_numeric_underlying = PL_numeric_underlying_is_standard;
- set_numeric_radix(0);
-#endif /* USE_LOCALE_NUMERIC */
+# endif /* USE_LOCALE_NUMERIC */
}
@@ -1407,65 +2042,50 @@ void
Perl_set_numeric_underlying(pTHX)
{
-#ifdef USE_LOCALE_NUMERIC
+# ifdef USE_LOCALE_NUMERIC
- /* Toggle the LC_NUMERIC locale to the current underlying default. Most
- * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h
+ /* Unconditionally toggle the LC_NUMERIC locale to the current underlying
+ * default.
+ *
+ * Most code should use the macro SET_NUMERIC_UNDERLYING() in perl.h
* instead of calling this directly. The macro avoids calling this routine
* if toggling isn't necessary according to our records (which could be
* wrong if some XS code has changed the locale behind our back) */
-# ifdef DEBUGGING
-
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "Setting LC_NUMERIC locale to %s\n",
- PL_numeric_name);
- }
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "Setting LC_NUMERIC locale to %s\n",
+ PL_numeric_name));
-# endif
+ void_setlocale_c(LC_NUMERIC, PL_numeric_name);
+ PL_numeric_underlying = TRUE;
+ sv_setsv_nomg(PL_numeric_radix_sv, PL_underlying_radix_sv);
- do_setlocale_c(LC_NUMERIC, PL_numeric_name);
PL_numeric_standard = PL_numeric_underlying_is_standard;
- PL_numeric_underlying = TRUE;
- set_numeric_radix(! PL_numeric_standard);
-#endif /* USE_LOCALE_NUMERIC */
+# endif /* USE_LOCALE_NUMERIC */
}
-/*
- * Set up for a new ctype locale.
- */
+# ifdef USE_LOCALE_CTYPE
+
STATIC void
-S_new_ctype(pTHX_ const char *newctype)
+S_new_ctype(pTHX_ const char *newctype, bool force)
{
-
-#ifndef USE_LOCALE_CTYPE
-
- PERL_UNUSED_ARG(newctype);
- PERL_UNUSED_CONTEXT;
-
-#else
+ PERL_ARGS_ASSERT_NEW_CTYPE;
+ PERL_UNUSED_ARG(force);
/* Called after each libc setlocale() call affecting LC_CTYPE, to tell
* core Perl this and that 'newctype' is the name of the new locale.
*
* This function sets up the folding arrays for all 256 bytes, assuming
* that tofold() is tolc() since fold case is not a concept in POSIX,
- *
- * Any code changing the locale (outside this file) should use
- * Perl_setlocale or POSIX::setlocale, which call this function. Therefore
- * this function should be called directly only from this file and from
- * POSIX::setlocale() */
+ */
- unsigned int i;
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering new_ctype(%s)\n", newctype));
- /* Don't check for problems if we are suppressing the warnings */
- bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
- bool maybe_utf8_turkic = FALSE;
-
- PERL_ARGS_ASSERT_NEW_CTYPE;
+ /* No change means no-op */
+ if (strEQ(PL_ctype_name, newctype)) {
+ return;
+ }
/* We will replace any bad locale warning with 1) nothing if the new one is
* ok; or 2) a new warning for the bad new locale */
@@ -1474,52 +2094,196 @@ S_new_ctype(pTHX_ const char *newctype)
PL_warn_locale = NULL;
}
- PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE);
+ /* Clear cache */
+ Safefree(PL_ctype_name);
+ PL_ctype_name = "";
+
+ PL_in_utf8_turkic_locale = FALSE;
+
+ /* For the C locale, just use the standard folds, and we know there are no
+ * glitches possible, so return early. Since this is called at process
+ * initialization, be aware that this bit can't rely on much being
+ * available. */
+ if (isNAME_C_OR_POSIX(newctype)) {
+ Copy(PL_fold, PL_fold_locale, 256, U8);
+ PL_ctype_name = savepv(newctype);
+ PL_in_utf8_CTYPE_locale = FALSE;
+ return;
+ }
+
+ /* The cache being cleared signals this function to compute a new value */
+ PL_in_utf8_CTYPE_locale = is_locale_utf8(newctype);
+
+ PL_ctype_name = savepv(newctype);
+ bool maybe_utf8_turkic = FALSE;
+
+ /* Don't check for problems if we are suppressing the warnings */
+ bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST);
- /* A UTF-8 locale gets standard rules. But note that code still has to
- * handle this specially because of the three problematic code points */
if (PL_in_utf8_CTYPE_locale) {
+
+ /* A UTF-8 locale gets standard rules. But note that code still has to
+ * handle this specially because of the three problematic code points
+ * */
Copy(PL_fold_latin1, PL_fold_locale, 256, U8);
/* UTF-8 locales can have special handling for 'I' and 'i' if they are
- * Turkic. Make sure these two are the only anomalies. (We don't use
- * towupper and towlower because they aren't in C89.) */
+ * Turkic. Make sure these two are the only anomalies. (We don't
+ * require towupper and towlower because they aren't in C89.) */
-#if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
+# if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER)
- if (towupper('i') == 0x130 && towlower('I') == 0x131) {
+ if (towupper('i') == 0x130 && towlower('I') == 0x131)
-#else
+# else
- if (toupper('i') == 'i' && tolower('I') == 'I') {
+ if (toU8_UPPER_LC('i') == 'i' && toU8_LOWER_LC('I') == 'I')
-#endif
+# endif
+
+ {
+ /* This is how we determine it really is Turkic */
check_for_problems = TRUE;
maybe_utf8_turkic = TRUE;
}
}
+ else { /* Not a canned locale we know the values for. Compute them */
+
+# ifdef DEBUGGING
+
+ bool has_non_ascii_fold = FALSE;
+ bool found_unexpected = FALSE;
+
+ /* Under -DLv, see if there are any folds outside the ASCII range.
+ * This factoid is used below */
+ if (DEBUG_Lv_TEST) {
+ for (unsigned i = 128; i < 256; i++) {
+ int j = LATIN1_TO_NATIVE(i);
+ if (toU8_LOWER_LC(j) != j || toU8_UPPER_LC(j) != j) {
+ has_non_ascii_fold = TRUE;
+ break;
+ }
+ }
+ }
+
+# endif
+
+ for (unsigned i = 0; i < 256; i++) {
+ if (isU8_UPPER_LC(i))
+ PL_fold_locale[i] = (U8) toU8_LOWER_LC(i);
+ else if (isU8_LOWER_LC(i))
+ PL_fold_locale[i] = (U8) toU8_UPPER_LC(i);
+ else
+ PL_fold_locale[i] = (U8) i;
+
+# ifdef DEBUGGING
+
+ /* Most locales these days are supersets of ASCII. When debugging
+ * with -DLv, it is helpful to know what the exceptions to that are
+ * in this locale */
+ if (DEBUG_Lv_TEST) {
+ bool unexpected = FALSE;
+
+ if (isUPPER_L1(i)) {
+ if (isUPPER_A(i)) {
+ if (PL_fold_locale[i] != toLOWER_A(i)) {
+ unexpected = TRUE;
+ }
+ }
+ else if (has_non_ascii_fold) {
+ if (PL_fold_locale[i] != toLOWER_L1(i)) {
+ unexpected = TRUE;
+ }
+ }
+ else if (PL_fold_locale[i] != i) {
+ unexpected = TRUE;
+ }
+ }
+ else if ( isLOWER_L1(i)
+ && i != LATIN_SMALL_LETTER_SHARP_S
+ && i != MICRO_SIGN)
+ {
+ if (isLOWER_A(i)) {
+ if (PL_fold_locale[i] != toUPPER_A(i)) {
+ unexpected = TRUE;
+ }
+ }
+ else if (has_non_ascii_fold) {
+ if (PL_fold_locale[i] != toUPPER_LATIN1_MOD(i)) {
+ unexpected = TRUE;
+ }
+ }
+ else if (PL_fold_locale[i] != i) {
+ unexpected = TRUE;
+ }
+ }
+ else if (PL_fold_locale[i] != i) {
+ unexpected = TRUE;
+ }
+
+ if (unexpected) {
+ found_unexpected = TRUE;
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "For %s, fold of %02x is %02x\n",
+ newctype, i, PL_fold_locale[i]));
+ }
+ }
+ }
+
+ if (found_unexpected) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "All bytes not mentioned above either fold to"
+ " themselves or are the expected ASCII or"
+ " Latin1 ones\n"));
+ }
+ else {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "No nonstandard folds were found\n"));
+# endif
+
+ }
+ }
+
+# ifdef MB_CUR_MAX
+
+ /* We only handle single-byte locales (outside of UTF-8 ones); so if this
+ * locale requires more than one byte, there are going to be BIG problems.
+ * */
+
+ if (MB_CUR_MAX > 1 && ! PL_in_utf8_CTYPE_locale
+
+ /* Some platforms return MB_CUR_MAX > 1 for even the "C" locale.
+ * Just assume that the implementation for them (plus for POSIX) is
+ * correct and the > 1 value is spurious. (Since these are
+ * specially handled to never be considered UTF-8 locales, as long
+ * as this is the only problem, everything should work fine */
+ && ! isNAME_C_OR_POSIX(newctype))
+ {
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Unsupported, MB_CUR_MAX=%d\n", (int) MB_CUR_MAX));
+
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_LOCALE),
+ "Locale '%s' is unsupported, and may crash the"
+ " interpreter.\n",
+ newctype);
+ }
+
+# endif
+
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "check_for_problems=%d\n",
+ check_for_problems));
/* We don't populate the other lists if a UTF-8 locale, but do check that
* everything works as expected, unless checking turned off */
- if (check_for_problems || ! PL_in_utf8_CTYPE_locale) {
+ if (check_for_problems) {
/* Assume enough space for every character being bad. 4 spaces each
* for the 94 printable characters that are output like "'x' "; and 5
* spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating
* NUL */
char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' };
- bool multi_byte_locale = FALSE; /* Assume is a single-byte locale
- to start */
unsigned int bad_count = 0; /* Count of bad characters */
- for (i = 0; i < 256; i++) {
- if (! PL_in_utf8_CTYPE_locale) {
- if (isupper(i))
- PL_fold_locale[i] = (U8) tolower(i);
- else if (islower(i))
- PL_fold_locale[i] = (U8) toupper(i);
- else
- PL_fold_locale[i] = (U8) i;
- }
+ for (unsigned i = 0; i < 256; i++) {
/* If checking for locale problems, see if the native ASCII-range
* printables plus \n and \t are in their expected categories in
@@ -1530,9 +2294,7 @@ S_new_ctype(pTHX_ const char *newctype)
* nowadays. It isn't a problem for most controls to be changed
* into something else; we check only \n and \t, though perhaps \r
* could be an issue as well. */
- if ( check_for_problems
- && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n'))
- {
+ if (isGRAPH_A(i) || isBLANK_A(i) || i == '\n') {
bool is_bad = FALSE;
char name[4] = { '\0' };
@@ -1553,77 +2315,79 @@ S_new_ctype(pTHX_ const char *newctype)
}
/* Check each possibe class */
- if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_ALPHANUMERIC_LC(i)) !=
+ cBOOL(isALPHANUMERIC_A(i))))
+ {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isalnum('%s') unexpectedly is %d\n",
- name, cBOOL(isalnum(i))));
+ "isalnum('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_ALPHANUMERIC_LC(i))));
}
- if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_ALPHA_LC(i)) != cBOOL(isALPHA_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isalpha('%s') unexpectedly is %d\n",
- name, cBOOL(isalpha(i))));
+ "isalpha('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_ALPHA_LC(i))));
}
- if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_DIGIT_LC(i)) != cBOOL(isDIGIT_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isdigit('%s') unexpectedly is %d\n",
- name, cBOOL(isdigit(i))));
+ "isdigit('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_DIGIT_LC(i))));
}
- if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_GRAPH_LC(i)) != cBOOL(isGRAPH_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isgraph('%s') unexpectedly is %d\n",
- name, cBOOL(isgraph(i))));
+ "isgraph('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_GRAPH_LC(i))));
}
- if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_LOWER_LC(i)) != cBOOL(isLOWER_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "islower('%s') unexpectedly is %d\n",
- name, cBOOL(islower(i))));
+ "islower('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_LOWER_LC(i))));
}
- if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_PRINT_LC(i)) != cBOOL(isPRINT_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isprint('%s') unexpectedly is %d\n",
- name, cBOOL(isprint(i))));
+ "isprint('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_PRINT_LC(i))));
}
- if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_PUNCT_LC(i)) != cBOOL(isPUNCT_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "ispunct('%s') unexpectedly is %d\n",
- name, cBOOL(ispunct(i))));
+ "ispunct('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_PUNCT_LC(i))));
}
- if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_SPACE_LC(i)) != cBOOL(isSPACE_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isspace('%s') unexpectedly is %d\n",
- name, cBOOL(isspace(i))));
+ "isspace('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_SPACE_LC(i))));
}
- if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_UPPER_LC(i)) != cBOOL(isUPPER_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isupper('%s') unexpectedly is %d\n",
- name, cBOOL(isupper(i))));
+ "isupper('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_UPPER_LC(i))));
}
- if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) {
+ if (UNLIKELY(cBOOL(isU8_XDIGIT_LC(i))!= cBOOL(isXDIGIT_A(i)))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "isxdigit('%s') unexpectedly is %d\n",
- name, cBOOL(isxdigit(i))));
+ "isxdigit('%s') unexpectedly is %x\n",
+ name, cBOOL(isU8_XDIGIT_LC(i))));
}
- if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) {
+ if (UNLIKELY(toU8_LOWER_LC(i) != (int) toLOWER_A(i))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
"tolower('%s')=0x%x instead of the expected 0x%x\n",
- name, tolower(i), (int) toLOWER_A(i)));
+ name, toU8_LOWER_LC(i), (int) toLOWER_A(i)));
}
- if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) {
+ if (UNLIKELY(toU8_UPPER_LC(i) != (int) toUPPER_A(i))) {
is_bad = TRUE;
DEBUG_L(PerlIO_printf(Perl_debug_log,
"toupper('%s')=0x%x instead of the expected 0x%x\n",
- name, toupper(i), (int) toUPPER_A(i)));
+ name, toU8_UPPER_LC(i), (int) toUPPER_A(i)));
}
if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) {
is_bad = TRUE;
@@ -1645,46 +2409,25 @@ S_new_ctype(pTHX_ const char *newctype)
if (bad_count == 2 && maybe_utf8_turkic) {
bad_count = 0;
*bad_chars_list = '\0';
- PL_fold_locale['I'] = 'I';
- PL_fold_locale['i'] = 'i';
- PL_in_utf8_turkic_locale = TRUE;
- DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n",
- __FILE__, __LINE__, newctype));
- }
- else {
- PL_in_utf8_turkic_locale = FALSE;
- }
-# ifdef MB_CUR_MAX
-
- /* We only handle single-byte locales (outside of UTF-8 ones; so if
- * this locale requires more than one byte, there are going to be
- * problems. */
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n",
- __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX));
-
- if ( check_for_problems && MB_CUR_MAX > 1
- && ! PL_in_utf8_CTYPE_locale
-
- /* Some platforms return MB_CUR_MAX > 1 for even the "C"
- * locale. Just assume that the implementation for them (plus
- * for POSIX) is correct and the > 1 value is spurious. (Since
- * these are specially handled to never be considered UTF-8
- * locales, as long as this is the only problem, everything
- * should work fine */
- && strNE(newctype, "C") && strNE(newctype, "POSIX"))
- {
- multi_byte_locale = TRUE;
+ /* The casts are because otherwise some compilers warn:
+ gcc.gnu.org/bugzilla/show_bug.cgi?id=99950
+ gcc.gnu.org/bugzilla/show_bug.cgi?id=94182
+ */
+ PL_fold_locale[ (U8) 'I' ] = 'I';
+ PL_fold_locale[ (U8) 'i' ] = 'i';
+ PL_in_utf8_turkic_locale = TRUE;
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "%s is turkic\n", newctype));
}
-# endif
-
/* If we found problems and we want them output, do so */
- if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale))
+ if ( (UNLIKELY(bad_count))
&& (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST)))
{
- if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) {
+ /* WARNING. If you change the wording of these; be sure to update
+ * t/loc_tools.pl correspondingly */
+
+ if (PL_in_utf8_CTYPE_locale) {
PL_warn_locale = Perl_newSVpvf(aTHX_
"Locale '%s' contains (at least) the following characters"
" which have\nunexpected meanings: %s\nThe Perl program"
@@ -1692,29 +2435,24 @@ S_new_ctype(pTHX_ const char *newctype)
newctype, bad_chars_list);
}
else {
- PL_warn_locale = Perl_newSVpvf(aTHX_
- "Locale '%s' may not work well.%s%s%s\n",
- newctype,
- (multi_byte_locale)
- ? " Some characters in it are not recognized by"
- " Perl."
- : "",
- (bad_count)
- ? "\nThe following characters (and maybe others)"
- " may not have the same meaning as the Perl"
- " program expects:\n"
- : "",
- (bad_count)
- ? bad_chars_list
- : ""
+ PL_warn_locale =
+ Perl_newSVpvf(aTHX_
+ "\nThe following characters (and maybe"
+ " others) may not have the same meaning as"
+ " the Perl program expects: %s\n",
+ bad_chars_list
);
}
-# ifdef HAS_NL_LANGINFO
+# ifdef HAS_SOME_LANGINFO
+ const char * scratch_buffer = NULL;
Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s",
- /* parameter FALSE is a don't care here */
- my_nl_langinfo(CODESET, FALSE));
+ my_langinfo_c(CODESET, LC_CTYPE,
+ newctype,
+ &scratch_buffer, NULL,
+ NULL));
+ Safefree(scratch_buffer);
# endif
@@ -1728,7 +2466,8 @@ S_new_ctype(pTHX_ const char *newctype)
if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) {
/* The '0' below suppresses a bogus gcc compiler warning */
- Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0);
+ Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale),
+ 0);
if (IN_LC(LC_CTYPE)) {
SvREFCNT_dec_NN(PL_warn_locale);
@@ -1737,22 +2476,21 @@ S_new_ctype(pTHX_ const char *newctype)
}
}
}
-
-#endif /* USE_LOCALE_CTYPE */
-
}
+# endif /* USE_LOCALE_CTYPE */
+
void
Perl__warn_problematic_locale()
{
-#ifdef USE_LOCALE_CTYPE
+# ifdef USE_LOCALE_CTYPE
dTHX;
/* Internal-to-core function that outputs the message in PL_warn_locale,
* and then NULLS it. Should be called only through the macro
- * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */
+ * CHECK_AND_WARN_PROBLEMATIC_LOCALE_ */
if (PL_warn_locale) {
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -1762,26 +2500,39 @@ Perl__warn_problematic_locale()
PL_warn_locale = NULL;
}
-#endif
+# endif
}
STATIC void
-S_new_collate(pTHX_ const char *newcoll)
+S_new_LC_ALL(pTHX_ const char *unused, bool force)
{
+ PERL_ARGS_ASSERT_NEW_LC_ALL;
+ PERL_UNUSED_ARG(unused);
-#ifndef USE_LOCALE_COLLATE
+ /* LC_ALL updates all the things we care about. */
+
+ for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ if (update_functions[i]) {
+ const char * this_locale = querylocale_i(i);
+ update_functions[i](aTHX_ this_locale, force);
+ }
+ }
+}
- PERL_UNUSED_ARG(newcoll);
- PERL_UNUSED_CONTEXT;
+# ifdef USE_LOCALE_COLLATE
-#else
+STATIC void
+S_new_collate(pTHX_ const char *newcoll, bool force)
+{
+ PERL_ARGS_ASSERT_NEW_COLLATE;
+ PERL_UNUSED_ARG(force);
/* Called after each libc setlocale() call affecting LC_COLLATE, to tell
* core Perl this and that 'newcoll' is the name of the new locale.
*
* The design of locale collation is that every locale change is given an
- * index 'PL_collation_ix'. The first time a string particpates in an
+ * index 'PL_collation_ix'. The first time a string participates in an
* operation that requires collation while locale collation is active, it
* is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That
* magic includes the collation index, and the transformation of the string
@@ -1794,14 +2545,23 @@ S_new_collate(pTHX_ const char *newcoll)
* that a transformation would improperly be considered valid, leading to
* an unlikely bug */
- if (! newcoll) {
- if (PL_collation_name) {
- ++PL_collation_ix;
- Safefree(PL_collation_name);
- PL_collation_name = NULL;
- }
- PL_collation_standard = TRUE;
- is_standard_collation:
+ /* Return if the locale isn't changing */
+ if (strEQ(PL_collation_name, newcoll)) {
+ return;
+ }
+
+ Safefree(PL_collation_name);
+ PL_collation_name = savepv(newcoll);
+ ++PL_collation_ix;
+
+ /* Set the new one up if trivial. Since this is called at process
+ * initialization, be aware that this bit can't rely on much being
+ * available. */
+ PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
+ if (PL_collation_standard) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "Setting PL_collation name='%s'\n",
+ PL_collation_name));
PL_collxfrm_base = 0;
PL_collxfrm_mult = 2;
PL_in_utf8_COLLATE_locale = FALSE;
@@ -1810,240 +2570,105 @@ S_new_collate(pTHX_ const char *newcoll)
return;
}
- /* If this is not the same locale as currently, set the new one up */
- if (! PL_collation_name || strNE(PL_collation_name, newcoll)) {
- ++PL_collation_ix;
- Safefree(PL_collation_name);
- PL_collation_name = stdize_locale(savepv(newcoll));
- PL_collation_standard = isNAME_C_OR_POSIX(newcoll);
- if (PL_collation_standard) {
- goto is_standard_collation;
- }
+ /* Flag that the remainder of the set up is being deferred until first
+ * need. */
+ PL_collxfrm_mult = 0;
+ PL_collxfrm_base = 0;
- PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE);
- PL_strxfrm_NUL_replacement = '\0';
- PL_strxfrm_max_cp = 0;
-
- /* A locale collation definition includes primary, secondary, tertiary,
- * etc. weights for each character. To sort, the primary weights are
- * used, and only if they compare equal, then the secondary weights are
- * used, and only if they compare equal, then the tertiary, etc.
- *
- * strxfrm() works by taking the input string, say ABC, and creating an
- * output transformed string consisting of first the primary weights,
- * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the
- * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters
- * may not have weights at every level. In our example, let's say B
- * doesn't have a tertiary weight, and A doesn't have a secondary
- * weight. The constructed string is then going to be
- * A¹B¹C¹ B²C² A³C³ ....
- * This has the desired effect that strcmp() will look at the secondary
- * or tertiary weights only if the strings compare equal at all higher
- * priority weights. The spaces shown here, like in
- * "A¹B¹C¹ A²B²C² "
- * are not just for readability. In the general case, these must
- * actually be bytes, which we will call here 'separator weights'; and
- * they must be smaller than any other weight value, but since these
- * are C strings, only the terminating one can be a NUL (some
- * implementations may include a non-NUL separator weight just before
- * the NUL). Implementations tend to reserve 01 for the separator
- * weights. They are needed so that a shorter string's secondary
- * weights won't be misconstrued as primary weights of a longer string,
- * etc. By making them smaller than any other weight, the shorter
- * string will sort first. (Actually, if all secondary weights are
- * smaller than all primary ones, there is no need for a separator
- * weight between those two levels, etc.)
- *
- * The length of the transformed string is roughly a linear function of
- * the input string. It's not exactly linear because some characters
- * don't have weights at all levels. When we call strxfrm() we have to
- * allocate some memory to hold the transformed string. The
- * calculations below try to find coefficients 'm' and 'b' for this
- * locale so that m*x + b equals how much space we need, given the size
- * of the input string in 'x'. If we calculate too small, we increase
- * the size as needed, and call strxfrm() again, but it is better to
- * get it right the first time to avoid wasted expensive string
- * transformations. */
+}
- {
- /* We use the string below to find how long the tranformation of it
- * is. Almost all locales are supersets of ASCII, or at least the
- * ASCII letters. We use all of them, half upper half lower,
- * because if we used fewer, we might hit just the ones that are
- * outliers in a particular locale. Most of the strings being
- * collated will contain a preponderance of letters, and even if
- * they are above-ASCII, they are likely to have the same number of
- * weight levels as the ASCII ones. It turns out that digits tend
- * to have fewer levels, and some punctuation has more, but those
- * are relatively sparse in text, and khw believes this gives a
- * reasonable result, but it could be changed if experience so
- * dictates. */
- const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
- char * x_longer; /* Transformed 'longer' */
- Size_t x_len_longer; /* Length of 'x_longer' */
-
- char * x_shorter; /* We also transform a substring of 'longer' */
- Size_t x_len_shorter;
-
- /* _mem_collxfrm() is used get the transformation (though here we
- * are interested only in its length). It is used because it has
- * the intelligence to handle all cases, but to work, it needs some
- * values of 'm' and 'b' to get it started. For the purposes of
- * this calculation we use a very conservative estimate of 'm' and
- * 'b'. This assumes a weight can be multiple bytes, enough to
- * hold any UV on the platform, and there are 5 levels, 4 weight
- * bytes, and a trailing NUL. */
- PL_collxfrm_base = 5;
- PL_collxfrm_mult = 5 * sizeof(UV);
-
- /* Find out how long the transformation really is */
- x_longer = _mem_collxfrm(longer,
- sizeof(longer) - 1,
- &x_len_longer,
-
- /* We avoid converting to UTF-8 in the
- * called function by telling it the
- * string is in UTF-8 if the locale is a
- * UTF-8 one. Since the string passed
- * here is invariant under UTF-8, we can
- * claim it's UTF-8 even though it isn't.
- * */
- PL_in_utf8_COLLATE_locale);
- Safefree(x_longer);
-
- /* Find out how long the transformation of a substring of 'longer'
- * is. Together the lengths of these transformations are
- * sufficient to calculate 'm' and 'b'. The substring is all of
- * 'longer' except the first character. This minimizes the chances
- * of being swayed by outliers */
- x_shorter = _mem_collxfrm(longer + 1,
- sizeof(longer) - 2,
- &x_len_shorter,
- PL_in_utf8_COLLATE_locale);
- Safefree(x_shorter);
-
- /* If the results are nonsensical for this simple test, the whole
- * locale definition is suspect. Mark it so that locale collation
- * is not active at all for it. XXX Should we warn? */
- if ( x_len_shorter == 0
- || x_len_longer == 0
- || x_len_shorter >= x_len_longer)
- {
- PL_collxfrm_mult = 0;
- PL_collxfrm_base = 0;
- }
- else {
- SSize_t base; /* Temporary */
-
- /* We have both: m * strlen(longer) + b = x_len_longer
- * m * strlen(shorter) + b = x_len_shorter;
- * subtracting yields:
- * m * (strlen(longer) - strlen(shorter))
- * = x_len_longer - x_len_shorter
- * But we have set things up so that 'shorter' is 1 byte smaller
- * than 'longer'. Hence:
- * m = x_len_longer - x_len_shorter
- *
- * But if something went wrong, make sure the multiplier is at
- * least 1.
- */
- if (x_len_longer > x_len_shorter) {
- PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
- }
- else {
- PL_collxfrm_mult = 1;
- }
+# endif /* USE_LOCALE_COLLATE */
+#endif /* USE_LOCALE */
- /* mx + b = len
- * so: b = len - mx
- * but in case something has gone wrong, make sure it is
- * non-negative */
- base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
- if (base < 0) {
- base = 0;
- }
+#ifdef WIN32
- /* Add 1 for the trailing NUL */
- PL_collxfrm_base = base + 1;
- }
+wchar_t *
+S_Win_byte_string_to_wstring(const UINT code_page, const char * byte_string)
+{
+ /* Caller must arrange to free the returned string */
-# ifdef DEBUGGING
+ int req_size = MultiByteToWideChar(code_page, 0, byte_string, -1, NULL, 0);
+ if (! req_size) {
+ SET_EINVAL;
+ return NULL;
+ }
- if (DEBUG_L_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, "
- "x_len_longer=%zu,"
- " collate multipler=%zu, collate base=%zu\n",
- __FILE__, __LINE__,
- PL_in_utf8_COLLATE_locale,
- x_len_shorter, x_len_longer,
- PL_collxfrm_mult, PL_collxfrm_base);
- }
-# endif
+ wchar_t *wstring;
+ Newx(wstring, req_size, wchar_t);
- }
+ if (! MultiByteToWideChar(code_page, 0, byte_string, -1, wstring, req_size))
+ {
+ Safefree(wstring);
+ SET_EINVAL;
+ return NULL;
}
-#endif /* USE_LOCALE_COLLATE */
-
+ return wstring;
}
-#endif
+#define Win_utf8_string_to_wstring(s) Win_byte_string_to_wstring(CP_UTF8, (s))
-#ifdef WIN32
+char *
+S_Win_wstring_to_byte_string(const UINT code_page, const wchar_t * wstring)
+{
+ /* Caller must arrange to free the returned string */
-#define USE_WSETLOCALE
+ int req_size =
+ WideCharToMultiByte(code_page, 0, wstring, -1, NULL, 0, NULL, NULL);
-#ifdef USE_WSETLOCALE
+ char *byte_string;
+ Newx(byte_string, req_size, char);
-STATIC char *
-S_wrap_wsetlocale(pTHX_ int category, const char *locale) {
- wchar_t *wlocale;
- wchar_t *wresult;
- char *result;
+ if (! WideCharToMultiByte(code_page, 0, wstring, -1, byte_string,
+ req_size, NULL, NULL))
+ {
+ Safefree(byte_string);
+ SET_EINVAL;
+ return NULL;
+ }
- if (locale) {
- int req_size =
- MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0);
+ return byte_string;
+}
- if (!req_size) {
- errno = EINVAL;
- return NULL;
- }
+#define Win_wstring_to_utf8_string(ws) Win_wstring_to_byte_string(CP_UTF8, (ws))
+
+STATIC const char *
+S_wrap_wsetlocale(pTHX_ const int category, const char *locale)
+{
+ PERL_ARGS_ASSERT_WRAP_WSETLOCALE;
+
+ /* Calls _wsetlocale(), converting the parameters/return to/from
+ * Perl-expected forms as if plain setlocale() were being called instead.
+ */
+
+ const wchar_t * wlocale = NULL;
- Newx(wlocale, req_size, wchar_t);
- if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) {
- Safefree(wlocale);
- errno = EINVAL;
+ if (locale) {
+ wlocale = Win_utf8_string_to_wstring(locale);
+ if (! wlocale) {
return NULL;
}
}
- else {
- wlocale = NULL;
- }
- wresult = _wsetlocale(category, wlocale);
+
+ WSETLOCALE_LOCK;
+ const wchar_t * wresult = _wsetlocale(category, wlocale);
Safefree(wlocale);
- if (wresult) {
- int req_size =
- WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL);
- Newx(result, req_size, char);
- SAVEFREEPV(result); /* is there something better we can do here? */
- if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1,
- result, req_size, NULL, NULL)) {
- errno = EINVAL;
- return NULL;
- }
- }
- else {
- result = NULL;
+
+ if (! wresult) {
+ WSETLOCALE_UNLOCK;
+ return NULL;
}
+ const char * result = Win_wstring_to_utf8_string(wresult);
+ WSETLOCALE_UNLOCK;
+
+ SAVEFREEPV(result); /* is there something better we can do here? Answer:
+ Without restructuring, returning a unique value each
+ call is required. See GH #20434 */
return result;
}
-#endif
-
-STATIC char *
+STATIC const char *
S_win32_setlocale(pTHX_ int category, const char* locale)
{
/* This, for Windows, emulates POSIX setlocale() behavior. There is no
@@ -2060,90 +2685,32 @@ S_win32_setlocale(pTHX_ int category, const char* locale)
* use the particular category's variable if set; otherwise to use the LANG
* variable. */
- bool override_LC_ALL = FALSE;
- char * result;
- unsigned int i;
-
- if (locale && strEQ(locale, "")) {
-
-# ifdef LC_ALL
-
- locale = PerlEnv_getenv("LC_ALL");
- if (! locale) {
- if (category == LC_ALL) {
- override_LC_ALL = TRUE;
- }
- else {
-
-# endif
-
- for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
- if (category == categories[i]) {
- locale = PerlEnv_getenv(category_names[i]);
- goto found_locale;
- }
- }
-
- locale = PerlEnv_getenv("LANG");
- if (! locale) {
- locale = "";
- }
-
- found_locale: ;
+ if (locale == NULL) {
+ return wrap_wsetlocale(category, NULL);
+ }
-# ifdef LC_ALL
+ if (strEQ(locale, "")) {
+ /* Note this function may change the locale, but that's ok because we
+ * are about to change it anyway */
+ locale = find_locale_from_environment(get_category_index(category, ""));
+ }
- }
- }
+ const char * result = wrap_wsetlocale(category, locale);
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
+ setlocale_debug_string_r(category, locale, result)));
-# endif
+# ifdef USE_PL_CUR_LC_ALL
+ /* If we need to keep track of LC_ALL, update it to the new value. */
+ Safefree(PL_cur_LC_ALL);
+ if (category == LC_ALL) {
+ PL_cur_LC_ALL = savepv(result);
}
-
-#ifdef USE_WSETLOCALE
- result = S_wrap_wsetlocale(aTHX_ category, locale);
-#else
- result = setlocale(category, locale);
-#endif
- DEBUG_L(STMT_START {
- dSAVE_ERRNO;
- PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__,
- setlocale_debug_string(category, locale, result));
- RESTORE_ERRNO;
- } STMT_END);
-
- if (! override_LC_ALL) {
- return result;
- }
-
- /* Here the input category was LC_ALL, and we have set it to what is in the
- * LANG variable or the system default if there is no LANG. But these have
- * lower priority than the other LC_foo variables, so override it for each
- * one that is set. (If they are set to "", it means to use the same thing
- * we just set LC_ALL to, so can skip) */
-
- for (i = 0; i < LC_ALL_INDEX; i++) {
- result = PerlEnv_getenv(category_names[i]);
- if (result && strNE(result, "")) {
-#ifdef USE_WSETLOCALE
- S_wrap_wsetlocale(aTHX_ categories[i], result);
-#else
- setlocale(categories[i], result);
-#endif
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
- __FILE__, __LINE__,
- setlocale_debug_string(categories[i], result, "not captured")));
- }
+ else {
+ PL_cur_LC_ALL = savepv(wrap_wsetlocale(LC_ALL, NULL));
}
- result = setlocale(LC_ALL, NULL);
- DEBUG_L(STMT_START {
- dSAVE_ERRNO;
- PerlIO_printf(Perl_debug_log, "%s:%d: %s\n",
- __FILE__, __LINE__,
- setlocale_debug_string(LC_ALL, NULL, result));
- RESTORE_ERRNO;
- } STMT_END);
+# endif
return result;
}
@@ -2173,11 +2740,16 @@ Finally, C<Perl_setlocale> works under all circumstances, whereas plain
C<setlocale> can be completely ineffective on some platforms under some
configurations.
-C<Perl_setlocale> should not be used to change the locale except on systems
-where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems,
-the system C<setlocale()> is ineffective, returning the wrong information, and
-failing to actually change the locale. C<Perl_setlocale>, however works
-properly in all circumstances.
+Changing the locale is not a good idea when more than one thread is running,
+except on systems where the predefined variable C<${^SAFE_LOCALES}> is 1.
+This is because on such systems the locale is global to the whole process and
+not local to just the thread calling the function. So changing it in one
+thread instantaneously changes it in all. On some such systems, the system
+C<setlocale()> is ineffective, returning the wrong information, and failing to
+actually change the locale. z/OS refuses to try to change the locale once a
+second thread is created. C<Perl_setlocale>, should give you accurate results
+of what actually happened on these problematic platforms, returning NULL if the
+system forbade the locale change.
The return points to a per-thread static buffer, which is overwritten the next
time C<Perl_setlocale> is called from the same thread.
@@ -2186,6 +2758,14 @@ time C<Perl_setlocale> is called from the same thread.
*/
+#ifndef USE_LOCALE_NUMERIC
+# define affects_LC_NUMERIC(cat) 0
+#elif defined(LC_ALL)
+# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC || cat == LC_ALL)
+#else
+# define affects_LC_NUMERIC(cat) (cat == LC_NUMERIC)
+#endif
+
const char *
Perl_setlocale(const int category, const char * locale)
{
@@ -2201,137 +2781,302 @@ Perl_setlocale(const int category, const char * locale)
#else
const char * retval;
- const char * newlocale;
- dSAVEDERRNO;
dTHX;
- DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
-#ifdef USE_LOCALE_NUMERIC
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Entering Perl_setlocale(%d, \"%s\")\n",
+ category, locale));
- /* A NULL locale means only query what the current one is. We have the
- * LC_NUMERIC name saved, because we are normally switched into the C
- * (or equivalent) locale for it. For an LC_ALL query, switch back to get
- * the correct results. All other categories don't require special
- * handling */
+ /* A NULL locale means only query what the current one is. */
if (locale == NULL) {
+
+# ifndef USE_LOCALE_NUMERIC
+
+ /* Without LC_NUMERIC, it's trivial; we just return the value */
+ return save_to_buffer(querylocale_r(category),
+ &PL_setlocale_buf, &PL_setlocale_bufsize);
+# else
+
+ /* We have the LC_NUMERIC name saved, because we are normally switched
+ * into the C locale (or equivalent) for it. */
if (category == LC_NUMERIC) {
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Perl_setlocale(LC_NUMERIC, NULL) returning stashed '%s'\n",
+ PL_numeric_name));
/* We don't have to copy this return value, as it is a per-thread
* variable, and won't change until a future setlocale */
return PL_numeric_name;
}
-# ifdef LC_ALL
+# ifndef LC_ALL
- else if (category == LC_ALL) {
- STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
+ /* Without LC_ALL, just return the value */
+ return save_to_buffer(querylocale_r(category),
+ &PL_setlocale_buf, &PL_setlocale_bufsize);
+
+# else
+
+ /* Here, LC_ALL is available on this platform. It's the one
+ * complicating category (because it can contain a toggled LC_NUMERIC
+ * value), for all the remaining ones (we took care of LC_NUMERIC
+ * above), just return the value */
+ if (category != LC_ALL) {
+ return save_to_buffer(querylocale_r(category),
+ &PL_setlocale_buf, &PL_setlocale_bufsize);
}
+ bool toggled = FALSE;
+
+ /* For an LC_ALL query, switch back to the underlying numeric locale
+ * (if we aren't there already) so as to get the correct results. Our
+ * records for all the other categories are valid without switching */
+ if (! PL_numeric_underlying) {
+ set_numeric_underlying();
+ toggled = TRUE;
+ }
+
+ retval = querylocale_c(LC_ALL);
+
+ if (toggled) {
+ set_numeric_standard();
+ }
+
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
+ setlocale_debug_string_r(category, locale, retval)));
+
+ return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
+
+# endif /* Has LC_ALL */
+# endif /* Has LC_NUMERIC */
+
+ } /* End of querying the current locale */
+
+
+ unsigned int cat_index = get_category_index(category, NULL);
+ retval = querylocale_i(cat_index);
+
+ /* If the new locale is the same as the current one, nothing is actually
+ * being changed, so do nothing. */
+ if ( strEQ(retval, locale)
+ && ( ! affects_LC_NUMERIC(category)
+
+# ifdef USE_LOCALE_NUMERIC
+
+ || strEQ(locale, PL_numeric_name)
+
# endif
+ )) {
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Already in requested locale: no action taken\n"));
+ return save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
}
-#endif
+ /* Here, an actual change is being requested. Do it */
+ retval = setlocale_i(cat_index, locale);
- retval = save_to_buffer(do_setlocale_r(category, locale),
- &PL_setlocale_buf, &PL_setlocale_bufsize, 0);
- SAVE_ERRNO;
+ if (! retval) {
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n",
+ setlocale_debug_string_i(cat_index, locale, "NULL")));
+ return NULL;
+ }
-#if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL)
+ assert(strNE(retval, ""));
+ retval = save_to_buffer(retval, &PL_setlocale_buf, &PL_setlocale_bufsize);
- if (locale == NULL && category == LC_ALL) {
- RESTORE_LC_NUMERIC();
+ /* Now that have changed locales, we have to update our records to
+ * correspond. Only certain categories have extra work to update. */
+ if (update_functions[cat_index]) {
+ update_functions[cat_index](aTHX_ retval, false);
}
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "returning '%s'\n", retval));
+
+ return retval;
+
#endif
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "%s:%d: %s\n", __FILE__, __LINE__,
- setlocale_debug_string(category, locale, retval)));
+}
- RESTORE_ERRNO;
+STATIC utf8ness_t
+S_get_locale_string_utf8ness_i(pTHX_ const char * string,
+ const locale_utf8ness_t known_utf8,
+ const char * locale,
+ const unsigned cat_index)
+{
+ PERL_ARGS_ASSERT_GET_LOCALE_STRING_UTF8NESS_I;
- if (! retval) {
- return NULL;
+#ifndef USE_LOCALE
+
+ return UTF8NESS_NO;
+ PERL_UNUSED_ARG(string);
+ PERL_UNUSED_ARG(known_utf8);
+ PERL_UNUSED_ARG(locale);
+ PERL_UNUSED_ARG(cat_index);
+
+#else
+
+ assert(cat_index <= NOMINAL_LC_ALL_INDEX);
+
+ /* Return to indicate if 'string' in the locale given by the input
+ * arguments should be considered UTF-8 or not.
+ *
+ * If the input 'locale' is not NULL, use that for the locale; otherwise
+ * use the current locale for the category specified by 'cat_index'.
+ */
+
+ if (string == NULL) {
+ return UTF8NESS_NO;
}
- /* If locale == NULL, we are just querying the state */
- if (locale == NULL) {
- return retval;
+ if (IN_BYTES) { /* respect 'use bytes' */
+ return UTF8NESS_NO;
}
- /* Now that have switched locales, we have to update our records to
- * correspond. */
+ Size_t len = strlen(string);
- switch (category) {
+ /* UTF8ness is immaterial if the representation doesn't vary */
+ const U8 * first_variant = NULL;
+ if (is_utf8_invariant_string_loc((U8 *) string, len, &first_variant)) {
+ return UTF8NESS_IMMATERIAL;
+ }
-#ifdef USE_LOCALE_CTYPE
+ /* Can't be UTF-8 if invalid */
+ if (! is_utf8_string((U8 *) first_variant,
+ len - ((char *) first_variant - string)))
+ {
+ return UTF8NESS_NO;
+ }
- case LC_CTYPE:
- new_ctype(retval);
- break;
+ /* Here and below, we know the string is legal UTF-8, containing at least
+ * one character requiring a sequence of two or more bytes. It is quite
+ * likely to be UTF-8. But it pays to be paranoid and do further checking.
+ *
+ * If we already know the UTF-8ness of the locale, then we immediately know
+ * what the string is */
+ if (UNLIKELY(known_utf8 != LOCALE_UTF8NESS_UNKNOWN)) {
+ if (known_utf8 == LOCALE_IS_UTF8) {
+ return UTF8NESS_YES;
+ }
+ else {
+ return UTF8NESS_NO;
+ }
+ }
-#endif
-#ifdef USE_LOCALE_COLLATE
+# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
- case LC_COLLATE:
- new_collate(retval);
- break;
+ /* Here, we have available the libc functions that can be used to
+ * accurately determine the UTF8ness of the underlying locale. If it is a
+ * UTF-8 locale, the string is UTF-8; otherwise it was coincidental that
+ * the string is legal UTF-8
+ *
+ * However, if the perl is compiled to not pay attention to the category
+ * being passed in, you might think that that locale is essentially always
+ * the C locale, so it would make sense to say it isn't UTF-8. But to get
+ * here, the string has to contain characters unknown in the C locale. And
+ * in fact, Windows boxes are compiled without LC_MESSAGES, as their
+ * message catalog isn't really a part of the locale system. But those
+ * messages really could be UTF-8, and given that the odds are rather small
+ * of something not being UTF-8 but being syntactically valid UTF-8, khw
+ * has decided to call such strings as UTF-8. */
-#endif
-#ifdef USE_LOCALE_NUMERIC
+ if (locale == NULL) {
+ locale = querylocale_i(cat_index);
+ }
+ if (is_locale_utf8(locale)) {
+ return UTF8NESS_YES;
+ }
- case LC_NUMERIC:
- new_numeric(retval);
- break;
+ return UTF8NESS_NO;
-#endif
-#ifdef LC_ALL
+# else
- case LC_ALL:
+ /* Here, we have a valid UTF-8 string containing non-ASCII characters, and
+ * don't have access to functions to check if the locale is UTF-8 or not.
+ * Assume that it is. khw tried adding a check that the string is entirely
+ * in a single Unicode script, but discovered the strftime() timezone is
+ * user-settable through the environment, which may be in a different
+ * script than the locale-expected value. */
+ PERL_UNUSED_ARG(locale);
+ PERL_UNUSED_ARG(cat_index);
- /* LC_ALL updates all the things we care about. The values may not
- * be the same as 'retval', as the locale "" may have set things
- * individually */
+ return UTF8NESS_YES;
-# ifdef USE_LOCALE_CTYPE
+# endif
+#endif
- newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
- new_ctype(newlocale);
- Safefree(newlocale);
+}
-# endif /* USE_LOCALE_CTYPE */
-# ifdef USE_LOCALE_COLLATE
+STATIC bool
+S_is_locale_utf8(pTHX_ const char * locale)
+{
+ /* Returns TRUE if the locale 'locale' is UTF-8; FALSE otherwise. It uses
+ * my_langinfo(), which employs various methods to get this information
+ * if nl_langinfo() isn't available, using heuristics as a last resort, in
+ * which case, the result will very likely be correct for locales for
+ * languages that have commonly used non-ASCII characters, but for notably
+ * English, it comes down to if the locale's name ends in something like
+ * "UTF-8". It errs on the side of not being a UTF-8 locale. */
- newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
- new_collate(newlocale);
- Safefree(newlocale);
+# if ! defined(USE_LOCALE) \
+ || ! defined(USE_LOCALE_CTYPE) \
+ || defined(EBCDIC) /* There aren't any real UTF-8 locales at this time */
-# endif
-# ifdef USE_LOCALE_NUMERIC
+ PERL_UNUSED_ARG(locale);
- newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
- new_numeric(newlocale);
- Safefree(newlocale);
+ return FALSE;
-# endif /* USE_LOCALE_NUMERIC */
-#endif /* LC_ALL */
+# else
- default:
- break;
+ const char * scratch_buffer = NULL;
+ const char * codeset;
+ bool retval;
+
+ PERL_ARGS_ASSERT_IS_LOCALE_UTF8;
+
+ if (strEQ(locale, PL_ctype_name)) {
+ return PL_in_utf8_CTYPE_locale;
}
+ codeset = my_langinfo_c(CODESET, LC_CTYPE, locale,
+ &scratch_buffer, NULL, NULL);
+ retval = is_codeset_name_UTF8(codeset);
+
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "found codeset=%s, is_utf8=%d\n", codeset, retval));
+
+ Safefree(scratch_buffer);
return retval;
-#endif
+# endif
}
-PERL_STATIC_INLINE const char *
-S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset)
+#ifdef USE_LOCALE
+
+STATIC const char *
+S_save_to_buffer(const char * string, const char **buf, Size_t *buf_size)
{
- /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size',
- * growing it if necessary */
+ /* Copy the NUL-terminated 'string' to a buffer whose address before this
+ * call began at *buf, and whose available length before this call was
+ * *buf_size.
+ *
+ * If the length of 'string' is greater than the space available, the
+ * buffer is grown accordingly, which may mean that it gets relocated.
+ * *buf and *buf_size will be updated to reflect this.
+ *
+ * Regardless, the function returns a pointer to where 'string' is now
+ * stored.
+ *
+ * 'string' may be NULL, which means no action gets taken, and NULL is
+ * returned.
+ *
+ * If *buf or 'buf_size' are NULL or *buf_size is 0, the buffer is assumed
+ * empty, and memory is malloc'd. 'buf-size' being NULL is to be used
+ * when this is a single use buffer, which will shortly be freed by the
+ * caller.
+ */
Size_t string_size;
@@ -2341,9 +3086,17 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t
return NULL;
}
- string_size = strlen(string) + offset + 1;
+ /* No-op to copy over oneself */
+ if (string == *buf) {
+ return string;
+ }
+
+ string_size = strlen(string) + 1;
- if (*buf_size == 0) {
+ if (buf_size == NULL) {
+ Newx(*buf, string_size, char);
+ }
+ else if (*buf_size == 0) {
Newx(*buf, string_size, char);
*buf_size = string_size;
}
@@ -2352,36 +3105,787 @@ S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t
*buf_size = string_size;
}
- Copy(string, *buf + offset, string_size - offset, char);
+ {
+ dTHX_DEBUGGING;
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "Copying '%s' to %p\n",
+ ((is_utf8_string((U8 *) string, 0))
+ ? string
+ :_byte_dump_string((U8 *) string, strlen(string), 0)),
+ *buf));
+ }
+
+# ifdef DEBUGGING
+
+ /* Catch glitches. Usually this is because LC_CTYPE needs to be the same
+ * locale as whatever is being worked on */
+ if (UNLIKELY(instr(string, REPLACEMENT_CHARACTER_UTF8))) {
+ dTHX_DEBUGGING;
+
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected REPLACEMENT_CHARACTER in '%s'\n%s",
+ string, get_LC_ALL_display()));
+ }
+
+# endif
+
+ Copy(string, *buf, string_size, char);
return *buf;
}
+# ifdef WIN32
+
+bool
+Perl_get_win32_message_utf8ness(pTHX_ const char * string)
+{
+ /* NULL => locale irrelevant, 0 => category irrelevant
+ * so returns based on the UTF-8 legality of the input string, ignoring the
+ * locale and category completely.
+ *
+ * This is because Windows doesn't have LC_MESSAGES */
+ return get_locale_string_utf8ness_i(string, LOCALE_IS_UTF8, NULL, 0);
+}
+
+# endif
+#endif /* USE_LOCALE */
+
+
+int
+Perl_mbtowc_(pTHX_ const wchar_t * pwc, const char * s, const Size_t len)
+{
+
+#if ! defined(HAS_MBRTOWC) && ! defined(HAS_MBTOWC)
+
+ PERL_UNUSED_ARG(pwc);
+ PERL_UNUSED_ARG(s);
+ PERL_UNUSED_ARG(len);
+ return -1;
+
+#else /* Below we have some form of mbtowc() */
+# if defined(HAS_MBRTOWC) \
+ && (defined(USE_LOCALE_THREADS) || ! defined(HAS_MBTOWC))
+# define USE_MBRTOWC
+# else
+# undef USE_MBRTOWC
+# endif
+
+ int retval = -1;
+
+ if (s == NULL) { /* Initialize the shift state to all zeros in
+ PL_mbrtowc_ps. */
+
+# if defined(USE_MBRTOWC)
+
+ memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
+ return 0;
+
+# else
+
+ MBTOWC_LOCK_;
+ SETERRNO(0, 0);
+ retval = mbtowc(NULL, NULL, 0);
+ MBTOWC_UNLOCK_;
+ return retval;
+
+# endif
+
+ }
+
+# if defined(USE_MBRTOWC)
+
+ SETERRNO(0, 0);
+ retval = (SSize_t) mbrtowc((wchar_t *) pwc, s, len, &PL_mbrtowc_ps);
+
+# else
+
+ /* Locking prevents races, but locales can be switched out without locking,
+ * so this isn't a cure all */
+ MBTOWC_LOCK_;
+ SETERRNO(0, 0);
+ retval = mbtowc((wchar_t *) pwc, s, len);
+ MBTOWC_UNLOCK_;
+
+# endif
+
+ return retval;
+
+#endif
+
+}
+
/*
+=for apidoc Perl_localeconv
-=for apidoc Perl_langinfo
+This is a thread-safe version of the libc L<localeconv(3)>. It is the same as
+L<POSIX::localeconv|POSIX/localeconv> (returning a hash of the C<localeconv()>
+fields), but directly callable from XS code.
-This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>,
-taking the same C<item> parameter values, and returning the same information.
-But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks
-of Perl's locale handling from your code, and can be used on systems that lack
-a native C<nl_langinfo>.
+=cut
+*/
-Expanding on these:
+HV *
+Perl_localeconv(pTHX)
+{
+
+#if ! defined(HAS_LOCALECONV)
+
+ return newHV();
+
+#else
+
+ return my_localeconv(0);
+
+#endif
+
+}
+
+#if defined(HAS_LOCALECONV)
+
+HV *
+S_my_localeconv(pTHX_ const int item)
+{
+ PERL_ARGS_ASSERT_MY_LOCALECONV;
+
+ /* This returns a mortalized hash containing all or one of the elements
+ * returned by localeconv(). It is used by Perl_localeconv() and
+ * POSIX::localeconv() and is thread-safe.
+ *
+ * There are two use cases:
+ * 1) Called from POSIX::locale_conv(). This returns the lconv structure
+ * copied to a hash, based on the current underlying locales for
+ * LC_NUMERIC and LC_MONETARY. An input item==0 signifies this case, or
+ * on many platforms it is the only use case compiled.
+ * 2) Certain items that nl_langinfo() provides are also derivable from
+ * the return of localeconv(). Windows notably doesn't have
+ * nl_langinfo(), so on that, and actually any platform lacking it,
+ * my_localeconv() is used also to emulate it for those particular
+ * items. The code to do this is compiled only on such platforms.
+ * Rather than going to the expense of creating a full hash when only
+ * one item is needed, the returned hash has just the desired item in
+ * it.
+ *
+ * To access all the localeconv() struct lconv fields, there is a data
+ * structure that contains every commonly documented field in it. (Maybe
+ * some minority platforms have extra fields. Those could be added here
+ * without harm; they would just be ignored on platforms lacking them.)
+ *
+ * Our structure is compiled to make looping through the fields easier by
+ * pointing each name to its value's offset within lconv, e.g.,
+ { "thousands_sep", STRUCT_OFFSET(struct lconv, thousands_sep) }
+ */
+# define LCONV_ENTRY(name) \
+ {STRINGIFY(name), STRUCT_OFFSET(struct lconv, name)}
+
+ /* These synonyms are just for clarity, and to make it easier in case
+ * something needs to change in the future */
+# define LCONV_NUMERIC_ENTRY(name) LCONV_ENTRY(name)
+# define LCONV_MONETARY_ENTRY(name) LCONV_ENTRY(name)
+
+ /* There are just a few fields for NUMERIC strings */
+ const lconv_offset_t lconv_numeric_strings[] = {
+# ifndef NO_LOCALECONV_GROUPING
+ LCONV_NUMERIC_ENTRY(grouping),
+# endif
+ LCONV_NUMERIC_ENTRY(thousands_sep),
+ LCONV_NUMERIC_ENTRY(decimal_point),
+ {NULL, 0}
+ };
+
+ /* When used to implement nl_langinfo(), we save time by only populating
+ * the hash with the field(s) needed. Thus we would need a data structure
+ * of just:
+ * LCONV_NUMERIC_ENTRY(decimal_point),
+ * {NULL, 0}
+ *
+ * By placing the decimal_point field last in the full structure, we can
+ * use just the tail for this bit of it, saving space. This macro yields
+ * the address of the sub structure. */
+# define DECIMAL_POINT_ADDRESS \
+ &lconv_numeric_strings[(C_ARRAY_LENGTH(lconv_numeric_strings) - 2)]
+
+ /* And the MONETARY string fields */
+ const lconv_offset_t lconv_monetary_strings[] = {
+ LCONV_MONETARY_ENTRY(int_curr_symbol),
+ LCONV_MONETARY_ENTRY(mon_decimal_point),
+# ifndef NO_LOCALECONV_MON_THOUSANDS_SEP
+ LCONV_MONETARY_ENTRY(mon_thousands_sep),
+# endif
+# ifndef NO_LOCALECONV_MON_GROUPING
+ LCONV_MONETARY_ENTRY(mon_grouping),
+# endif
+ LCONV_MONETARY_ENTRY(positive_sign),
+ LCONV_MONETARY_ENTRY(negative_sign),
+ LCONV_MONETARY_ENTRY(currency_symbol),
+ {NULL, 0}
+ };
+
+ /* Like above, this field being last can be used as a sub structure */
+# define CURRENCY_SYMBOL_ADDRESS \
+ &lconv_monetary_strings[(C_ARRAY_LENGTH(lconv_monetary_strings) - 2)]
+
+ /* Finally there are integer fields, all are for monetary purposes */
+ const lconv_offset_t lconv_integers[] = {
+ LCONV_ENTRY(int_frac_digits),
+ LCONV_ENTRY(frac_digits),
+ LCONV_ENTRY(p_sep_by_space),
+ LCONV_ENTRY(n_cs_precedes),
+ LCONV_ENTRY(n_sep_by_space),
+ LCONV_ENTRY(p_sign_posn),
+ LCONV_ENTRY(n_sign_posn),
+# ifdef HAS_LC_MONETARY_2008
+ LCONV_ENTRY(int_p_cs_precedes),
+ LCONV_ENTRY(int_p_sep_by_space),
+ LCONV_ENTRY(int_n_cs_precedes),
+ LCONV_ENTRY(int_n_sep_by_space),
+ LCONV_ENTRY(int_p_sign_posn),
+ LCONV_ENTRY(int_n_sign_posn),
+# endif
+ LCONV_ENTRY(p_cs_precedes),
+ {NULL, 0}
+ };
+
+ /* Like above, this field being last can be used as a sub structure */
+# define P_CS_PRECEDES_ADDRESS \
+ &lconv_integers[(C_ARRAY_LENGTH(lconv_integers) - 2)]
+
+ /* If we aren't paying attention to a given category, use LC_CTYPE instead;
+ * If not paying attention to that either, the code below should end up not
+ * using this. Make sure that things blow up if that avoidance gets lost,
+ * by setting the category to -1 */
+ unsigned int numeric_index;
+ unsigned int monetary_index;
+
+# ifdef USE_LOCALE_NUMERIC
+ numeric_index = LC_NUMERIC_INDEX_;
+# elif defined(USE_LOCALE_CTYPE)
+ numeric_index = LC_CTYPE_INDEX_;
+# else
+ numeric_index = (unsigned) -1;
+# endif
+# ifdef USE_LOCALE_MONETARY
+ monetary_index = LC_MONETARY_INDEX_;
+# elif defined(USE_LOCALE_CTYPE)
+ monetary_index = LC_CTYPE_INDEX_;
+# else
+ monetary_index = (unsigned) -1;
+# endif
+
+ /* Some platforms, for correct non-mojibake results, require LC_CTYPE's
+ * locale to match LC_NUMERIC's for the numeric fields, and LC_MONETARY's
+ * for the monetary ones. What happens if LC_NUMERIC and LC_MONETARY
+ * aren't compatible? Wrong results. To avoid that, we call localeconv()
+ * twice, once for each locale, setting LC_CTYPE to match the category.
+ * But if the locales of both categories are the same, there is no need for
+ * a second call. Assume this is the case unless overridden below */
+ bool requires_2nd_localeconv = false;
+
+ /* The actual hash populating is done by S_populate_hash_from_localeconv().
+ * It gets passed an array of length two containing the data structure it
+ * is supposed to use to get the key names to fill the hash with. One
+ * element is alwasy for the NUMERIC strings (or NULL if none to use), and
+ * the other element similarly for the MONETARY ones. */
+# define NUMERIC_STRING_OFFSET 0
+# define MONETARY_STRING_OFFSET 1
+ const lconv_offset_t * strings[2] = { NULL, NULL };
+
+ /* This is a mask, with one bit to tell S_populate_hash_from_localeconv to
+ * populate the NUMERIC items; another bit for the MONETARY ones. This way
+ * it can choose which (or both) to populate from */
+ U32 index_bits = 0;
+
+ /* This converts from a locale index to its bit position in the above mask.
+ * */
+# define INDEX_TO_BIT(i) (1 << (i))
+
+ /* The two categories can have disparate locales. Initialize them to C and
+ * override later whichever one(s) we pay attention to */
+ const char * numeric_locale = "C";
+ const char * monetary_locale = "C";
+
+ /* This will be either 'numeric_locale' or 'monetary_locale' depending on
+ * what we are working on at the moment */
+ const char * locale;
+
+ /* The LC_MONETARY category also has some integer-valued fields, whose
+ * information is kept in a separate list */
+ const lconv_offset_t * integers;
+
+# ifdef HAS_SOME_LANGINFO
+
+ /* If the only use-case for this is the full localeconv(), the 'item'
+ * parameter is ignored. */
+ PERL_UNUSED_ARG(item);
+
+# else
+
+ /* This only gets compiled for the use-case of using localeconv() to
+ * emulate an nl_langinfo() missing from the platform.
+ *
+ * We need this substructure to only return this field for the THOUSEP
+ * item. The other items also need substructures, but they were handled
+ * above by placing the substructure's item at the end of the full one, so
+ * the data structure could do double duty. However, both this and
+ * RADIXCHAR would need to be in the final position of the same full
+ * structure; an impossibility. So make this into a separate structure */
+ const lconv_offset_t thousands_sep_string[] = {
+ LCONV_NUMERIC_ENTRY(thousands_sep),
+ {NULL, 0}
+ };
+
+ /* End of all the initialization of datastructures. Now for actual code.
+ *
+ * Without nl_langinfo(), the call to my_localeconv() could be for just one
+ * of the following 3 items to emulate nl_langinfo(). This is compiled
+ * only when using perl_langinfo.h, which we control, and it has been
+ * constructed so that no item is numbered 0.
+ *
+ * For each, setup the appropriate parameters for the call below to
+ * S_populate_hash_from_localeconv() */
+ if (item != 0) switch (item) {
+ default:
+ locale_panic_(Perl_form(aTHX_
+ "Unexpected item passed to my_localeconv: %d", item));
+ break;
+
+# ifdef USE_LOCALE_NUMERIC
+
+ case RADIXCHAR:
+ locale = numeric_locale = PL_numeric_name;
+ index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
+ strings[NUMERIC_STRING_OFFSET] = DECIMAL_POINT_ADDRESS;
+ integers = NULL;
+ break;
+
+ case THOUSEP:
+ index_bits = INDEX_TO_BIT(LC_NUMERIC_INDEX_);
+ locale = numeric_locale = PL_numeric_name;
+ strings[NUMERIC_STRING_OFFSET] = thousands_sep_string;
+ integers = NULL;
+ break;
+
+# endif
+# ifdef USE_LOCALE_MONETARY
+
+ case CRNCYSTR:
+ index_bits = INDEX_TO_BIT(LC_MONETARY_INDEX_);
+ locale = monetary_locale = querylocale_i(LC_MONETARY_INDEX_);
+
+ /* This item needs the values for both the currency symbol, and another
+ * one used to construct the nl_langino()-compatible return */
+ strings[MONETARY_STRING_OFFSET] = CURRENCY_SYMBOL_ADDRESS;
+ integers = P_CS_PRECEDES_ADDRESS;
+ break;
+
+# endif
+
+ } /* End of switch() */
+
+ else /* End of for just one item to emulate nl_langinfo() */
+
+# endif
+
+ { /* Here, the call is for all of localeconv(). It has a bunch of
+ * items. As in the individual item case, set up the parameters for
+ * S_populate_hash_from_localeconv(); */
+
+# ifdef USE_LOCALE_NUMERIC
+ numeric_locale = PL_numeric_name;
+# elif defined(USE_LOCALE_CTYPE)
+ numeric_locale = querylocale_i(numeric_index);
+# endif
+# if defined(USE_LOCALE_MONETARY) || defined(USE_LOCALE_CTYPE)
+ monetary_locale = querylocale_i(monetary_index);
+# endif
+
+ /* The first call to S_populate_hash_from_localeconv() will be for the
+ * MONETARY values */
+ index_bits = INDEX_TO_BIT(monetary_index);
+ locale = monetary_locale;
+
+ /* And if the locales for the two categories are the same, we can also
+ * do the NUMERIC values in the same call */
+ if (strEQ(numeric_locale, monetary_locale)) {
+ index_bits |= INDEX_TO_BIT(numeric_index);
+ }
+ else {
+ requires_2nd_localeconv = true;
+ }
+
+ /* We always pass both sets of strings. 'index_bits' tells
+ * S_populate_hash_from_localeconv which to actually look at */
+ strings[NUMERIC_STRING_OFFSET] = lconv_numeric_strings;
+ strings[MONETARY_STRING_OFFSET] = lconv_monetary_strings;
+
+ /* And pass the integer values to populate; again 'index_bits' will
+ * say to use them or not */
+ integers = lconv_integers;
+
+ } /* End of call is for localeconv() */
+
+ /* The code above has determined the parameters to
+ S_populate_hash_from_localeconv() for both cases of an individual item
+ and for the entire structure. Below is code common to both */
+
+ HV * hv = newHV(); /* The returned hash, initially empty */
+ sv_2mortal((SV*)hv);
+
+ /* Call localeconv() and copy its results into the hash. All the
+ * parameters have been initialized above */
+ populate_hash_from_localeconv(hv,
+ locale,
+ index_bits,
+ strings,
+ integers
+ );
+
+ /* The above call may have done all the hash fields, but not always, as
+ * already explained. If we need a second call it is always for the
+ * NUMERIC fields */
+ if (requires_2nd_localeconv) {
+ populate_hash_from_localeconv(hv,
+ numeric_locale,
+ INDEX_TO_BIT(numeric_index),
+ strings,
+ NULL /* There are No NUMERIC integer
+ fields */
+ );
+ }
+
+ /* Here, the hash has been completely populated.
+ *
+ * Now go through all the items and:
+ * a) For string items, see if they should be marked as UTF-8 or not.
+ * This would have been more convenient and faster to do while
+ * populating the hash in the first place, but that operation has to be
+ * done within a critical section, keeping other threads from
+ * executing, so only the minimal amount of work necessary is done at
+ * that time.
+ * b) For integer items, convert the C CHAR_MAX value into -1. Again,
+ * this could have been done in the critical section, but was deferred
+ * to here to keep to the bare minimum amount the time spent owning the
+ * processor. CHAR_MAX is a C concept for an 8-bit character type.
+ * Perl has no such type; the closest fit is a -1.
+ *
+ * XXX On unthreaded perls, this code could be #ifdef'd out, and the
+ * corrections determined at hash population time, at an extra maintenance
+ * cost which khw doesn't think is worth it
+ */
+ for (unsigned int i = 0; i < 2; i++) { /* Try both types of strings */
+ if (! strings[i]) { /* Skip if no strings of this type */
+ continue;
+ }
+
+ locale = (i == NUMERIC_STRING_OFFSET)
+ ? numeric_locale
+ : monetary_locale;
+
+ locale_utf8ness_t locale_is_utf8 = LOCALE_UTF8NESS_UNKNOWN;
+
+# ifdef HAS_RELIABLE_UTF8NESS_DETERMINATION
+
+ /* It saves time in the loop below to have predetermined the UTF8ness
+ * of the locale. But only do so if the platform reliably has this
+ * information; otherwise to do it, this could recurse indefinitely.
+ *
+ * When we don't do it here, it will be done on a per-element basis in
+ * the loop. The per-element check is intelligent enough to not
+ * recurse */
+
+ locale_is_utf8 = (is_locale_utf8(locale))
+ ? LOCALE_IS_UTF8
+ : LOCALE_NOT_UTF8;
+
+ if (locale_is_utf8 == LOCALE_NOT_UTF8) {
+ continue; /* No string can be UTF-8 if the locale isn't */
+ }
+
+# endif
+
+ /* Examine each string */
+ while (1) {
+ const char * name = strings[i]->name;
+
+ if (! name) { /* Reached the end */
+ break;
+ }
+
+ /* 'value' will contain the string that may need to be marked as
+ * UTF-8 */
+ SV ** value = hv_fetch(hv, name, strlen(name), true);
+ if (! value) {
+ continue;
+ }
+
+ /* Determine if the string should be marked as UTF-8. */
+ if (UTF8NESS_YES == (get_locale_string_utf8ness_i(SvPVX(*value),
+ locale_is_utf8,
+ NULL, 0)))
+ {
+ SvUTF8_on(*value);
+ }
+
+ strings[i]++; /* Iterate */
+ }
+ } /* End of fixing up UTF8ness */
+
+
+ /* Examine each integer */
+ if (integers) while (1) {
+ const char * name = integers->name;
+
+ if (! name) { /* Reached the end */
+ break;
+ }
+
+ SV ** value = hv_fetch(hv, name, strlen(name), true);
+ if (! value) {
+ continue;
+ }
+
+ /* Change CHAR_MAX to -1 */
+ if (SvIV(*value) == CHAR_MAX) {
+ sv_setiv(*value, -1);
+ }
+
+ integers++; /* Iterate */
+ }
+
+ return hv;
+}
+
+STATIC void
+S_populate_hash_from_localeconv(pTHX_ HV * hv,
+
+ /* Switch to this locale to run
+ * localeconv() from */
+ const char * locale,
+
+ /* bit mask of which categories to
+ * populate */
+ const U32 which_mask,
+
+ /* strings[0] points the numeric
+ * string fields; [1] to the monetary */
+ const lconv_offset_t * strings[2],
+
+ /* And to the monetary integer fields */
+ const lconv_offset_t * integers)
+{
+ PERL_ARGS_ASSERT_POPULATE_HASH_FROM_LOCALECONV;
+ PERL_UNUSED_ARG(which_mask); /* Some configurations don't use this;
+ complicated to figure out which */
+
+ /* Run localeconv() and copy some or all of its results to the input 'hv'
+ * hash. Most localeconv() implementations return the values in a global
+ * static buffer, so the operation must be performed in a critical section,
+ * ending only after the copy is completed. There are so many locks
+ * because localeconv() deals with two categories, and returns in a single
+ * global static buffer. Some locks might be no-ops on this platform, but
+ * not others. We need to lock if any one isn't a no-op. */
+
+# ifdef USE_LOCALE_CTYPE
+
+ /* Some platforms require LC_CTYPE to be congruent with the category we are
+ * looking for */
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
+
+# endif
+# ifdef USE_LOCALE_NUMERIC
+
+ /* We need to toggle to the underlying NUMERIC locale if we are getting
+ * NUMERIC strings */
+ const char * orig_NUMERIC_locale = NULL;
+ if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ LC_NUMERIC_LOCK(0);
+ orig_NUMERIC_locale = toggle_locale_i(LC_NUMERIC_INDEX_, locale);
+ }
+
+# endif
+
+ /* Finally ready to do the actual localeconv(). Lock to prevent other
+ * accesses until we have made a copy of its returned static buffer */
+ gwLOCALE_LOCK;
+
+# ifdef TS_W32_BROKEN_LOCALECONV
+
+ /* This is a workaround for another bug in Windows. localeconv() was
+ * broken with thread-safe locales prior to VS 15. It looks at the global
+ * locale instead of the thread one. As a work-around, we toggle to the
+ * global locale; populate the return; then toggle back. We have to use
+ * LC_ALL instead of the individual categories because of yet another bug
+ * in Windows. And this all has to be done in a critical section.
+ *
+ * This introduces a potential race with any other thread that has also
+ * converted to use the global locale, and doesn't protect its locale calls
+ * with mutexes. khw can't think of any reason for a thread to do so on
+ * Windows, as the locale API is the same regardless of thread-safety, except
+ * if the code is ported from working on another platform where there might
+ * be some reason to do this. But this is typically due to some
+ * alien-to-Perl library that thinks it owns locale setting. Such a
+ * library isn't likely to exist on Windows, so such an application is
+ * unlikely to be run on Windows
+ */
+ bool restore_per_thread = FALSE;
+
+ /* Save the per-thread locale state */
+ const char * save_thread = querylocale_c(LC_ALL);
+
+ /* Change to the global locale, and note if we already were there */
+ if (_configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
+ != _DISABLE_PER_THREAD_LOCALE)
+ {
+ restore_per_thread = TRUE;
+ }
+
+ /* Save the state of the global locale; then convert to our desired
+ * state. */
+ const char * save_global = querylocale_c(LC_ALL);
+ void_setlocale_c(LC_ALL, save_thread);
+
+# endif /* TS_W32_BROKEN_LOCALECONV */
+
+ /* Finally, do the actual localeconv */
+ const char *lcbuf_as_string = (const char *) localeconv();
+
+ /* Fill in the string fields of the HV* */
+ for (unsigned int i = 0; i < 2; i++) {
+
+# ifdef USE_LOCALE_NUMERIC
+
+ /* One iteration is only for the numeric string fields */
+ if ( i == NUMERIC_STRING_OFFSET
+ && (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) == 0)
+ {
+ continue;
+ }
+
+# endif
+# ifdef USE_LOCALE_MONETARY
+
+ /* The other iteration is only for the monetary string fields */
+ if ( i == MONETARY_STRING_OFFSET
+ && (which_mask & INDEX_TO_BIT(LC_MONETARY_INDEX_)) == 0)
+ {
+ continue;
+ }
+
+# endif
+
+ /* For each field for the given category ... */
+ const lconv_offset_t * category_strings = strings[i];
+ while (1) {
+ const char * name = category_strings->name;
+ if (! name) { /* Quit at the end */
+ break;
+ }
+
+ /* we have set things up so that we know where in the returned
+ * structure, when viewed as a string, the corresponding value is.
+ * */
+ const char *value = *((const char **)( lcbuf_as_string
+ + category_strings->offset));
+
+ /* Set to get next string on next iteration */
+ category_strings++;
+
+ /* Skip if this platform doesn't have this field. */
+ if (! value) {
+ continue;
+ }
+
+ /* Copy to the hash */
+ (void) hv_store(hv,
+ name, strlen(name),
+ newSVpv(value, strlen(value)),
+ 0);
+ }
+
+ /* Add any int fields to the HV* */
+ if (i == MONETARY_STRING_OFFSET && integers) {
+ while (integers->name) {
+ const char value = *((const char *)( lcbuf_as_string
+ + integers->offset));
+ (void) hv_store(hv, integers->name,
+ strlen(integers->name), newSViv(value), 0);
+ integers++;
+ }
+ }
+ } /* End of loop through the fields */
+
+ /* Done with copying to the hash. Can unwind the critical section locks */
+
+# ifdef TS_W32_BROKEN_LOCALECONV
+
+ /* Restore the global locale's prior state */
+ void_setlocale_c(LC_ALL, save_global);
+
+ /* And back to per-thread locales */
+ if (restore_per_thread) {
+ _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
+ }
+
+ /* Restore the per-thread locale state */
+ void_setlocale_c(LC_ALL, save_thread);
+
+# endif /* TS_W32_BROKEN_LOCALECONV */
+
+ gwLOCALE_UNLOCK; /* Finished with the critical section of a
+ globally-accessible buffer */
+
+# ifdef USE_LOCALE_NUMERIC
+
+ restore_toggled_locale_i(LC_NUMERIC_INDEX_, orig_NUMERIC_locale);
+ if (which_mask & INDEX_TO_BIT(LC_NUMERIC_INDEX_)) {
+ LC_NUMERIC_UNLOCK;
+ }
+
+# endif
+# ifdef USE_LOCALE_CTYPE
+
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+
+# endif
+
+}
+
+#endif /* defined(HAS_LOCALECONV) */
+#ifndef HAS_SOME_LANGINFO
+
+typedef int nl_item; /* Substitute 'int' for emulated nl_langinfo() */
+
+#endif
+
+/*
+
+=for apidoc Perl_langinfo
+=for apidoc_item Perl_langinfo8
+
+C<Perl_langinfo> is an (almost) drop-in replacement for the system
+C<L<nl_langinfo(3)>>, taking the same C<item> parameter values, and returning
+the same information. But it is more thread-safe than regular
+C<nl_langinfo()>, and hides the quirks of Perl's locale handling from your
+code, and can be used on systems that lack a native C<nl_langinfo>.
+
+However, you should instead use the improved version of this:
+L</Perl_langinfo8>, which behaves identically except for an additional
+parameter, a pointer to a variable declared as L</C<utf8ness_t>>, into which it
+returns to you how you should treat the returned string with regards to it
+being encoded in UTF-8 or not.
+
+Concerning the differences between these and plain C<nl_langinfo()>:
=over
-=item *
+=item a.
-The reason it isn't quite a drop-in replacement is actually an advantage. The
-only difference is that it returns S<C<const char *>>, whereas plain
-C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation)
-forbidden to write into the buffer. By declaring this C<const>, the compiler
-enforces this restriction, so if it is violated, you know at compilation time,
-rather than getting segfaults at runtime.
+C<Perl_langinfo8> has an extra parameter, described above. Besides this, the
+other reason they aren't quite a drop-in replacement is actually an advantage.
+The C<const>ness of the return allows the compiler to catch attempts to write
+into the returned buffer, which is illegal and could cause run-time crashes.
-=item *
+=item b.
-It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
+They deliver the correct results for the C<RADIXCHAR> and C<THOUSEP> items,
without you having to write extra code. The reason for the extra code would be
because these are from the C<LC_NUMERIC> locale category, which is normally
kept set by Perl so that the radix is a dot, and the separator is the empty
@@ -2393,739 +3897,1096 @@ the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C
(or equivalent) locale would break a lot of CPAN, which is expecting the radix
(decimal point) character to be a dot.)
-=item *
+=item c.
-The system function it replaces can have its static return buffer trashed,
+The system function they replace can have its static return buffer trashed,
not only by a subsequent call to that function, but by a C<freelocale>,
-C<setlocale>, or other locale change. The returned buffer of this function is
-not changed until the next call to it, so the buffer is never in a trashed
-state.
+C<setlocale>, or other locale change. The returned buffer of these functions
+is not changed until the next call to one or the other, so the buffer is never
+in a trashed state.
-=item *
+=item d.
-Its return buffer is per-thread, so it also is never overwritten by a call to
-this function from another thread; unlike the function it replaces.
+The return buffer is per-thread, so it also is never overwritten by a call to
+these functions from another thread; unlike the function it replaces.
-=item *
+=item e.
-But most importantly, it works on systems that don't have C<nl_langinfo>, such
-as Windows, hence makes your code more portable. Of the fifty-some possible
+But most importantly, they work on systems that don't have C<nl_langinfo>, such
+as Windows, hence making your code more portable. Of the fifty-some possible
items specified by the POSIX 2008 standard,
L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>,
only one is completely unimplemented, though on non-Windows platforms, another
-significant one is also not implemented). It uses various techniques to
+significant one is not fully implemented). They use various techniques to
recover the other items, including calling C<L<localeconv(3)>>, and
C<L<strftime(3)>>, both of which are specified in C89, so should be always be
-available. Later C<strftime()> versions have additional capabilities; C<""> is
-returned for those not available on your system.
+available. Later C<strftime()> versions have additional capabilities; What the
+C locale yields or C<""> is returned for any item not available on your system.
-It is important to note that when called with an item that is recovered by
+It is important to note that, when called with an item that is recovered by
using C<localeconv>, the buffer from any previous explicit call to
-C<localeconv> will be overwritten. This means you must save that buffer's
-contents if you need to access them after a call to this function. (But note
-that you might not want to be using C<localeconv()> directly anyway, because of
-issues like the ones listed in the second item of this list (above) for
-C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to
-call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to
-unpack).
+C<L<localeconv(3)>> will be overwritten. But you shouldn't be using
+C<localeconv> anyway because it is is very much not thread-safe, and suffers
+from the same problems outlined in item 'b.' above for the fields it returns that
+are controlled by the LC_NUMERIC locale category. Instead, avoid all of those
+problems by calling L</Perl_localeconv>, which is thread-safe; or by using the
+methods given in L<perlcall> to call
+L<C<POSIX::localeconv()>|POSIX/localeconv>, which is also thread-safe.
+
+=back
The details for those items which may deviate from what this emulation returns
and what a native C<nl_langinfo()> would return are specified in
L<I18N::Langinfo>.
-=back
-
-When using C<Perl_langinfo> on systems that don't have a native
-C<nl_langinfo()>, you must
+When using C<Perl_langinfo8> (or plain C<Perl_langinfo>) on systems that don't
+have a native C<nl_langinfo()>, you must
#include "perl_langinfo.h"
-before the C<perl.h> C<#include>. You can replace your C<langinfo.h>
+before the C<perl.h> C<#include>. You can replace your F<langinfo.h>
C<#include> with this one. (Doing it this way keeps out the symbols that plain
-C<langinfo.h> would try to import into the namespace for code that doesn't need
+F<langinfo.h> would try to import into the namespace for code that doesn't need
it.)
-The original impetus for C<Perl_langinfo()> was so that code that needs to
-find out the current currency symbol, floating point radix character, or digit
-grouping separator can use, on all systems, the simpler and more
-thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a
-pain to make thread-friendly. For other fields returned by C<localeconv>, it
-is better to use the methods given in L<perlcall> to call
-L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly.
-
=cut
*/
const char *
-#ifdef HAS_NL_LANGINFO
Perl_langinfo(const nl_item item)
-#else
-Perl_langinfo(const int item)
-#endif
{
- return my_nl_langinfo(item, TRUE);
+ return Perl_langinfo8(item, NULL);
}
-STATIC const char *
-#ifdef HAS_NL_LANGINFO
-S_my_nl_langinfo(const nl_item item, bool toggle)
-#else
-S_my_nl_langinfo(const int item, bool toggle)
-#endif
+const char *
+Perl_langinfo8(const nl_item item, utf8ness_t * utf8ness)
{
dTHX;
- const char * retval;
+ unsigned cat_index;
-#ifdef USE_LOCALE_NUMERIC
+ PERL_ARGS_ASSERT_PERL_LANGINFO8;
- /* We only need to toggle into the underlying LC_NUMERIC locale for these
- * two items, and only if not already there */
- if (toggle && (( item != RADIXCHAR && item != THOUSEP)
- || PL_numeric_underlying))
+ if (utf8ness) { /* Assume for now */
+ *utf8ness = UTF8NESS_IMMATERIAL;
+ }
-#endif /* No toggling needed if not using LC_NUMERIC */
+ /* Find the locale category that controls the input 'item'. If we are not
+ * paying attention to that category, instead return a default value. Also
+ * return the default value if there is no way for us to figure out the
+ * correct value. If we have some form of nl_langinfo(), we can always
+ * figure it out, but lacking that, there may be alternative methods that
+ * can be used to recover most of the possible items. Some of those
+ * methods need libc functions, which may or may not be available. If
+ * unavailable, we can't compute the correct value, so must here return the
+ * default. */
+ switch (item) {
- toggle = FALSE;
+ case CODESET:
-#if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
-# if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \
- || ! defined(HAS_POSIX_2008_LOCALE)
+#ifdef USE_LOCALE_CTYPE
- /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC
- * for those items dependent on it. This must be copied to a buffer before
- * switching back, as some systems destroy the buffer when setlocale() is
- * called */
+ cat_index = LC_CTYPE_INDEX_;
+ break;
- {
- DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
+#else
+ return C_codeset;
+#endif
+#if defined(USE_LOCALE_MESSAGES) && defined(HAS_SOME_LANGINFO)
- if (toggle) {
- STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
- }
+ case YESEXPR: case YESSTR: case NOEXPR: case NOSTR:
+ cat_index = LC_MESSAGES_INDEX_;
+ break;
+#else
+ case YESEXPR: return "^[+1yY]";
+ case YESSTR: return "yes";
+ case NOEXPR: return "^[-0nN]";
+ case NOSTR: return "no";
+#endif
- /* Prevent interference from another thread executing this code
- * section. */
- NL_LANGINFO_LOCK;
+ case CRNCYSTR:
- /* Copy to a per-thread buffer, which is also one that won't be
- * destroyed by a subsequent setlocale(), such as the
- * RESTORE_LC_NUMERIC may do just below. */
- retval = save_to_buffer(nl_langinfo(item),
- &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
- NL_LANGINFO_UNLOCK;
+#if defined(USE_LOCALE_MONETARY) \
+ && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
- if (toggle) {
- RESTORE_LC_NUMERIC();
- }
- }
+ cat_index = LC_MONETARY_INDEX_;
+ break;
+#else
+ return "-";
+#endif
-# else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */
+ case RADIXCHAR:
- {
- bool do_free = FALSE;
- locale_t cur = uselocale((locale_t) 0);
+#ifdef CAN_CALCULATE_RADIX
- if (cur == LC_GLOBAL_LOCALE) {
- cur = duplocale(LC_GLOBAL_LOCALE);
- do_free = TRUE;
- }
+ cat_index = LC_NUMERIC_INDEX_;
+ break;
+#else
+ return C_decimal_point;
+#endif
-# ifdef USE_LOCALE_NUMERIC
+ case THOUSEP:
- if (toggle) {
- if (PL_underlying_numeric_obj) {
- cur = PL_underlying_numeric_obj;
- }
- else {
- cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur);
- do_free = TRUE;
- }
- }
+#if defined(USE_LOCALE_NUMERIC) \
+ && (defined(HAS_SOME_LANGINFO) || defined(HAS_LOCALECONV))
-# endif
+ cat_index = LC_NUMERIC_INDEX_;
+ break;
+#else
+ return C_thousands_sep;
+#endif
- /* We have to save it to a buffer, because the freelocale() just below
- * can invalidate the internal one */
- retval = save_to_buffer(nl_langinfo_l(item, cur),
- &PL_langinfo_buf, &PL_langinfo_bufsize, 0);
+/* The other possible items are all in LC_TIME. */
+#ifdef USE_LOCALE_TIME
- if (do_free) {
- freelocale(cur);
- }
+ default:
+ cat_index = LC_TIME_INDEX_;
+ break;
+
+#endif
+#if ! defined(USE_LOCALE_TIME) || ! defined(HAS_SOME_LANGINFO)
+
+ /* If not using LC_TIME, hard code the rest. Or, if there is no
+ * nl_langinfo(), we use strftime() as an alternative, and it is missing
+ * functionality to get every single one, so hard-code those */
+
+ case ERA: return ""; /* Unimplemented; for use with strftime() %E
+ modifier */
+
+ /* These formats are defined by C89, so we assume that strftime supports
+ * them, and so are returned unconditionally; they may not be what the
+ * locale actually says, but should give good enough results for someone
+ * using them as formats (as opposed to trying to parse them to figure
+ * out what the locale says). The other format items are actually tested
+ * to verify they work on the platform */
+ case D_FMT: return "%x";
+ case T_FMT: return "%X";
+ case D_T_FMT: return "%c";
+
+# if defined(WIN32) || ! defined(USE_LOCALE_TIME)
+
+ /* strftime() on Windows doesn't have the POSIX (beyond C89) extensions
+ * that would allow it to recover these */
+ case ERA_D_FMT: return "%x";
+ case ERA_T_FMT: return "%X";
+ case ERA_D_T_FMT: return "%c";
+ case ALT_DIGITS: return "0";
+
+# endif
+# ifndef USE_LOCALE_TIME
+
+ case T_FMT_AMPM: return "%r";
+ case ABDAY_1: return "Sun";
+ case ABDAY_2: return "Mon";
+ case ABDAY_3: return "Tue";
+ case ABDAY_4: return "Wed";
+ case ABDAY_5: return "Thu";
+ case ABDAY_6: return "Fri";
+ case ABDAY_7: return "Sat";
+ case AM_STR: return "AM";
+ case PM_STR: return "PM";
+ case ABMON_1: return "Jan";
+ case ABMON_2: return "Feb";
+ case ABMON_3: return "Mar";
+ case ABMON_4: return "Apr";
+ case ABMON_5: return "May";
+ case ABMON_6: return "Jun";
+ case ABMON_7: return "Jul";
+ case ABMON_8: return "Aug";
+ case ABMON_9: return "Sep";
+ case ABMON_10: return "Oct";
+ case ABMON_11: return "Nov";
+ case ABMON_12: return "Dec";
+ case DAY_1: return "Sunday";
+ case DAY_2: return "Monday";
+ case DAY_3: return "Tuesday";
+ case DAY_4: return "Wednesday";
+ case DAY_5: return "Thursday";
+ case DAY_6: return "Friday";
+ case DAY_7: return "Saturday";
+ case MON_1: return "January";
+ case MON_2: return "February";
+ case MON_3: return "March";
+ case MON_4: return "April";
+ case MON_5: return "May";
+ case MON_6: return "June";
+ case MON_7: return "July";
+ case MON_8: return "August";
+ case MON_9: return "September";
+ case MON_10: return "October";
+ case MON_11: return "November";
+ case MON_12: return "December";
+
+# endif
+#endif
+
+ } /* End of switch on item */
+
+#ifndef USE_LOCALE
+
+ Perl_croak_nocontext("panic: Unexpected nl_langinfo() item %d", item);
+ NOT_REACHED; /* NOTREACHED */
+ PERL_UNUSED_VAR(cat_index);
+
+#else
+# ifdef USE_LOCALE_NUMERIC
+
+ /* Use either the underlying numeric, or the other underlying categories */
+ if (cat_index == LC_NUMERIC_INDEX_) {
+ return my_langinfo_c(item, LC_NUMERIC, PL_numeric_name,
+ &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
}
+ else
# endif
- if (strEQ(retval, "")) {
- if (item == YESSTR) {
- return "yes";
- }
- if (item == NOSTR) {
- return "no";
- }
+ {
+ return my_langinfo_i(item, cat_index, querylocale_i(cat_index),
+ &PL_langinfo_buf, &PL_langinfo_bufsize, utf8ness);
}
- return retval;
+#endif
-#else /* Below, emulate nl_langinfo as best we can */
+}
- {
+#ifdef USE_LOCALE
-# ifdef HAS_LOCALECONV
+/* There are several implementations of my_langinfo, depending on the
+ * Configuration. They all share the same beginning of the function */
+STATIC const char *
+S_my_langinfo_i(pTHX_
+ const nl_item item, /* The item to look up */
+ const unsigned int cat_index, /* The locale category that
+ controls it */
+ /* The locale to look up 'item' in. */
+ const char * locale,
+
+ /* Where to store the result, and where the size of that buffer
+ * is stored, updated on exit. retbuf_sizep may be NULL for an
+ * empty-on-entry, single use buffer whose size we don't need
+ * to keep track of */
+ const char ** retbufp,
+ Size_t * retbuf_sizep,
+
+ /* If not NULL, the location to store the UTF8-ness of 'item's
+ * value, as documented */
+ utf8ness_t * utf8ness)
+{
+ const char * retval = NULL;
- const struct lconv* lc;
- const char * temp;
- DECLARATION_FOR_LC_NUMERIC_MANIPULATION;
+ PERL_ARGS_ASSERT_MY_LANGINFO_I;
+ assert(cat_index < NOMINAL_LC_ALL_INDEX);
-# ifdef TS_W32_BROKEN_LOCALECONV
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "Entering my_langinfo item=%ld, using locale %s\n",
+ (long) item, locale));
+/*--------------------------------------------------------------------------*/
+/* Above is the common beginning to all the implementations of my_langinfo().
+ * Below are the various completions.
+ *
+ * Some platforms don't deal well with non-ASCII strings in locale X when
+ * LC_CTYPE is not in X. (Actually it is probably when X is UTF-8 and LC_CTYPE
+ * isn't, or vice versa). There is explicit code to bring the categories into
+ * sync. This doesn't seem to be a problem with nl_langinfo(), so that
+ * implementation doesn't currently worry about it. But it is a problem on
+ * Windows boxes, which don't have nl_langinfo(). */
+
+/*--------------------------------------------------------------------------*/
+# if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */
+# ifdef USE_LOCALE_CTYPE
- const char * save_global;
- const char * save_thread;
- int needed_size;
- char * ptr;
- char * e;
- char * item_start;
+ /* Ths function sorts out if things actually have to be switched or not,
+ * for both calls. */
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
# endif
-# endif
-# ifdef HAS_STRFTIME
- struct tm tm;
- bool return_format = FALSE; /* Return the %format, not the value */
- const char * format;
+ const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
-# endif
+ gwLOCALE_LOCK;
+ retval = save_to_buffer(nl_langinfo(item), retbufp, retbuf_sizep);
+ gwLOCALE_UNLOCK;
- /* We copy the results to a per-thread buffer, even if not
- * multi-threaded. This is in part to simplify this code, and partly
- * because we need a buffer anyway for strftime(), and partly because a
- * call of localeconv() could otherwise wipe out the buffer, and the
- * programmer would not be expecting this, as this is a nl_langinfo()
- * substitute after all, so s/he might be thinking their localeconv()
- * is safe until another localeconv() call. */
+ if (utf8ness) {
+ *utf8ness = get_locale_string_utf8ness_i(retval,
+ LOCALE_UTF8NESS_UNKNOWN,
+ locale, cat_index);
+ }
- switch (item) {
- Size_t len;
+ restore_toggled_locale_i(cat_index, orig_switched_locale);
- /* This is unimplemented */
- case ERA: /* For use with strftime() %E modifier */
+# ifdef USE_LOCALE_CTYPE
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+# endif
- default:
- return "";
+ return retval;
+/*--------------------------------------------------------------------------*/
+# else /* Below, emulate nl_langinfo as best we can */
- /* We use only an English set, since we don't know any more */
- case YESEXPR: return "^[+1yY]";
- case YESSTR: return "yes";
- case NOEXPR: return "^[-0nN]";
- case NOSTR: return "no";
+ /* And the third and final completion is where we have to emulate
+ * nl_langinfo(). There are various possibilities depending on the
+ * Configuration */
- case CODESET:
+# ifdef USE_LOCALE_CTYPE
-# ifndef WIN32
+ const char * orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, locale);
- /* On non-windows, this is unimplemented, in part because of
- * inconsistencies between vendors. The Darwin native
- * nl_langinfo() implementation simply looks at everything past
- * any dot in the name, but that doesn't work for other
- * vendors. Many Linux locales that don't have UTF-8 in their
- * names really are UTF-8, for example; z/OS locales that do
- * have UTF-8 in their names, aren't really UTF-8 */
- return "";
+# endif
-# else
+ const char * orig_switched_locale = toggle_locale_i(cat_index, locale);
- { /* But on Windows, the name does seem to be consistent, so
- use that. */
- const char * p;
- const char * first;
- Size_t offset = 0;
- const char * name = my_setlocale(LC_CTYPE, NULL);
+ /* Here, we are in the locale we want information about */
- if (isNAME_C_OR_POSIX(name)) {
- return "ANSI_X3.4-1968";
- }
+ /* Almost all the items will have ASCII return values. Set that here, and
+ * override if necessary */
+ utf8ness_t is_utf8 = UTF8NESS_IMMATERIAL;
- /* Find the dot in the locale name */
- first = (const char *) strchr(name, '.');
- if (! first) {
- first = name;
- goto has_nondigit;
- }
+ switch (item) {
+ default:
+ assert(item < 0); /* Make sure using perl_langinfo.h */
+ retval = "";
+ break;
- /* Look at everything past the dot */
- first++;
- p = first;
+ case RADIXCHAR:
- while (*p) {
- if (! isDIGIT(*p)) {
- goto has_nondigit;
- }
+# if defined(HAS_SNPRINTF) \
+ && (! defined(HAS_LOCALECONV) || defined(TS_W32_BROKEN_LOCALECONV))
- p++;
- }
+ {
+ /* snprintf() can be used to find the radix character by outputting
+ * a known simple floating point number to a buffer, and parsing
+ * it, inferring the radix as the bytes separating the integer and
+ * fractional parts. But localeconv() is more direct, not
+ * requiring inference, so use it instead of the code just below,
+ * if (likely) it is available and works ok */
+
+ char * floatbuf = NULL;
+ const Size_t initial_size = 10;
+
+ Newx(floatbuf, initial_size, char);
+
+ /* 1.5 is exactly representable on binary computers */
+ Size_t needed_size = snprintf(floatbuf, initial_size, "%.1f", 1.5);
+
+ /* If our guess wasn't big enough, increase and try again, based on
+ * the real number that snprintf() is supposed to return */
+ if (UNLIKELY(needed_size >= initial_size)) {
+ needed_size++; /* insurance */
+ Renew(floatbuf, needed_size, char);
+ Size_t new_needed = snprintf(floatbuf, needed_size, "%.1f", 1.5);
+ assert(new_needed <= needed_size);
+ needed_size = new_needed;
+ }
- /* Here everything past the dot is a digit. Treat it as a
- * code page */
- retval = save_to_buffer("CP", &PL_langinfo_buf,
- &PL_langinfo_bufsize, 0);
- offset = STRLENs("CP");
+ char * s = floatbuf;
+ char * e = floatbuf + needed_size;
- has_nondigit:
+ /* Find the '1' */
+ while (s < e && *s != '1') {
+ s++;
+ }
- retval = save_to_buffer(first, &PL_langinfo_buf,
- &PL_langinfo_bufsize, offset);
+ if (LIKELY(s < e)) {
+ s++;
+ }
+
+ /* Find the '5' */
+ char * item_start = s;
+ while (s < e && *s != '5') {
+ s++;
+ }
+
+ /* Everything in between is the radix string */
+ if (LIKELY(s < e)) {
+ *s = '\0';
+ retval = save_to_buffer(item_start, retbufp, retbuf_sizep);
+ Safefree(floatbuf);
+
+ if (utf8ness) {
+ is_utf8 = get_locale_string_utf8ness_i(retval,
+ LOCALE_UTF8NESS_UNKNOWN,
+ locale, cat_index);
}
break;
+ }
-# endif
-# ifdef HAS_LOCALECONV
+ Safefree(floatbuf);
+ }
- case CRNCYSTR:
+# ifdef HAS_LOCALECONV /* snprintf() failed; drop down to use
+ localeconv() */
- /* We don't bother with localeconv_l() because any system that
- * has it is likely to also have nl_langinfo() */
+ /* FALLTHROUGH */
- LOCALECONV_LOCK; /* Prevent interference with other threads
- using localeconv() */
+# else /* snprintf() failed and no localeconv() */
-# ifdef TS_W32_BROKEN_LOCALECONV
+ retval = C_decimal_point;
+ break;
- /* This is a workaround for a Windows bug prior to VS 15.
- * What we do here is, while locked, switch to the global
- * locale so localeconv() works; then switch back just before
- * the unlock. This can screw things up if some thread is
- * already using the global locale while assuming no other is.
- * A different workaround would be to call GetCurrencyFormat on
- * a known value, and parse it; patches welcome
- *
- * We have to use LC_ALL instead of LC_MONETARY because of
- * another bug in Windows */
+# endif
+# endif
+# ifdef HAS_LOCALECONV
- save_thread = savepv(my_setlocale(LC_ALL, NULL));
- _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
- save_global= savepv(my_setlocale(LC_ALL, NULL));
- my_setlocale(LC_ALL, save_thread);
+ /* These items are available from localeconv(). (To avoid using
+ * TS_W32_BROKEN_LOCALECONV, one could use GetNumberFormat and
+ * GetCurrencyFormat; patches welcome) */
-# endif
+# define P_CS_PRECEDES "p_cs_precedes"
+# define CURRENCY_SYMBOL "currency_symbol"
- lc = localeconv();
- if ( ! lc
- || ! lc->currency_symbol
- || strEQ("", lc->currency_symbol))
- {
- LOCALECONV_UNLOCK;
- return "";
- }
+ /* case RADIXCHAR: // May drop down to here in some configurations */
+ case THOUSEP:
+ case CRNCYSTR:
+ {
- /* Leave the first spot empty to be filled in below */
- retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf,
- &PL_langinfo_bufsize, 1);
- if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, ""))
- { /* khw couldn't figure out how the localedef specifications
- would show that the $ should replace the radix; this is
- just a guess as to how it might work.*/
- PL_langinfo_buf[0] = '.';
- }
- else if (lc->p_cs_precedes) {
- PL_langinfo_buf[0] = '-';
+ /* The hash gets populated with just the field(s) related to 'item'. */
+ HV * result_hv = my_localeconv(item);
+
+ SV* string;
+ if (item != CRNCYSTR) {
+
+ /* These items have been populated with just one key => value */
+ (void) hv_iterinit(result_hv);
+ HE * entry = hv_iternext(result_hv);
+ string = hv_iterval(result_hv, entry);
+ }
+ else {
+
+ /* But CRNCYSTR localeconv() returns a slightly different value
+ * than the nl_langinfo() API calls for, so have to modify this one
+ * to conform. We need another value from localeconv() to know
+ * what to change it to. my_localeconv() has populated the hash
+ * with exactly both fields. Delete this one, leaving just the
+ * CRNCYSTR one in the hash */
+ SV* precedes = hv_delete(result_hv,
+ P_CS_PRECEDES, STRLENs(P_CS_PRECEDES),
+ 0);
+ if (! precedes) {
+ locale_panic_("my_localeconv() unexpectedly didn't return"
+ " a value for " P_CS_PRECEDES);
+ }
+
+ /* The modification is to prefix the localeconv() return with a
+ * single byte, calculated as follows: */
+ char prefix = (LIKELY(SvIV(precedes) != -1))
+ ? ((precedes != 0) ? '-' : '+')
+
+ /* khw couldn't find any documentation that
+ * CHAR_MAX (which we modify to -1) is the signal,
+ * but cygwin uses it thusly, and it makes sense
+ * given that CHAR_MAX indicates the value isn't
+ * used, so it neither precedes nor succeeds */
+ : '.';
+
+ /* Now get CRNCYSTR */
+ (void) hv_iterinit(result_hv);
+ HE * entry = hv_iternext(result_hv);
+ string = hv_iterval(result_hv, entry);
+
+ /* And perform the modification */
+ Perl_sv_setpvf(aTHX_ string, "%c%s", prefix, SvPV_nolen(string));
+ }
+
+ /* Here, 'string' contains the value we want to return */
+ retval = save_to_buffer(SvPV_nolen(string), retbufp, retbuf_sizep);
+
+ if (utf8ness) {
+ is_utf8 = (SvUTF8(string))
+ ? UTF8NESS_YES
+ : (is_utf8_invariant_string( (U8 *) retval,
+ strlen(retval)))
+ ? UTF8NESS_IMMATERIAL
+ : UTF8NESS_NO;
+ }
+
+ break;
+
+ }
+
+# endif /* Some form of localeconv */
+# ifdef HAS_STRFTIME
+
+ /* These formats are only available in later strftime's */
+ case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
+
+ /* The rest can be gotten from most versions of strftime(). */
+ case ABDAY_1: case ABDAY_2: case ABDAY_3:
+ case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
+ case ALT_DIGITS:
+ case AM_STR: case PM_STR:
+ case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
+ case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
+ case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
+ case DAY_1: case DAY_2: case DAY_3: case DAY_4:
+ case DAY_5: case DAY_6: case DAY_7:
+ case MON_1: case MON_2: case MON_3: case MON_4:
+ case MON_5: case MON_6: case MON_7: case MON_8:
+ case MON_9: case MON_10: case MON_11: case MON_12:
+ {
+ const char * format;
+ bool return_format = FALSE;
+ int mon = 0;
+ int mday = 1;
+ int hour = 6;
+
+ GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
+
+ switch (item) {
+ default:
+ locale_panic_(Perl_form(aTHX_ "switch case: %d problem", item));
+ NOT_REACHED; /* NOTREACHED */
+
+ case PM_STR: hour = 18;
+ case AM_STR:
+ format = "%p";
+ break;
+ case ABDAY_7: mday++;
+ case ABDAY_6: mday++;
+ case ABDAY_5: mday++;
+ case ABDAY_4: mday++;
+ case ABDAY_3: mday++;
+ case ABDAY_2: mday++;
+ case ABDAY_1:
+ format = "%a";
+ break;
+ case DAY_7: mday++;
+ case DAY_6: mday++;
+ case DAY_5: mday++;
+ case DAY_4: mday++;
+ case DAY_3: mday++;
+ case DAY_2: mday++;
+ case DAY_1:
+ format = "%A";
+ break;
+ case ABMON_12: mon++;
+ case ABMON_11: mon++;
+ case ABMON_10: mon++;
+ case ABMON_9: mon++;
+ case ABMON_8: mon++;
+ case ABMON_7: mon++;
+ case ABMON_6: mon++;
+ case ABMON_5: mon++;
+ case ABMON_4: mon++;
+ case ABMON_3: mon++;
+ case ABMON_2: mon++;
+ case ABMON_1:
+ format = "%b";
+ break;
+ case MON_12: mon++;
+ case MON_11: mon++;
+ case MON_10: mon++;
+ case MON_9: mon++;
+ case MON_8: mon++;
+ case MON_7: mon++;
+ case MON_6: mon++;
+ case MON_5: mon++;
+ case MON_4: mon++;
+ case MON_3: mon++;
+ case MON_2: mon++;
+ case MON_1:
+ format = "%B";
+ break;
+ case T_FMT_AMPM:
+ format = "%r";
+ return_format = TRUE;
+ break;
+ case ERA_D_FMT:
+ format = "%Ex";
+ return_format = TRUE;
+ break;
+ case ERA_T_FMT:
+ format = "%EX";
+ return_format = TRUE;
+ break;
+ case ERA_D_T_FMT:
+ format = "%Ec";
+ return_format = TRUE;
+ break;
+ case ALT_DIGITS:
+ format = "%Ow"; /* Find the alternate digit for 0 */
+ break;
+ }
+
+ GCC_DIAG_RESTORE_STMT;
+
+ /* The year was deliberately chosen so that January 1 is on the
+ * first day of the week. Since we're only getting one thing at a
+ * time, it all works */
+ const char * temp = my_strftime8_temp(format, 30, 30, hour, mday, mon,
+ 2011, 0, 0, 0, &is_utf8);
+ retval = save_to_buffer(temp, retbufp, retbuf_sizep);
+ Safefree(temp);
+
+ /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
+ * format for wday 0. If the value is the same as the normal 0,
+ * there isn't an alternate, so clear the buffer.
+ *
+ * (wday was chosen because its range is all a single digit.
+ * Things like tm_sec have two digits as the minimum: '00'.) */
+ if (item == ALT_DIGITS && strEQ(*retbufp, "0")) {
+ retval = "";
+ break;
+ }
+
+ /* ALT_DIGITS is problematic. Experiments on it showed that
+ * strftime() did not always work properly when going from alt-9 to
+ * alt-10. Only a few locales have this item defined, and in all
+ * of them on Linux that khw was able to find, nl_langinfo() merely
+ * returned the alt-0 character, possibly doubled. Most Unicode
+ * digits are in blocks of 10 consecutive code points, so that is
+ * sufficient information for such scripts, as we can infer alt-1,
+ * alt-2, .... But for a Japanese locale, a CJK ideographic 0 is
+ * returned, and the CJK digits are not in code point order, so you
+ * can't really infer anything. The localedef for this locale did
+ * specify the succeeding digits, so that strftime() works properly
+ * on them, without needing to infer anything. But the
+ * nl_langinfo() return did not give sufficient information for the
+ * caller to understand what's going on. So until there is
+ * evidence that it should work differently, this returns the alt-0
+ * string for ALT_DIGITS. */
+
+ if (return_format) {
+
+ /* If to return the format, not the value, overwrite the buffer
+ * with it. But some strftime()s will keep the original format
+ * if illegal, so change those to "" */
+ if (strEQ(*retbufp, format)) {
+ retval = "";
}
else {
- PL_langinfo_buf[0] = '+';
+ retval = format;
}
-# ifdef TS_W32_BROKEN_LOCALECONV
+ /* A format is always in ASCII */
+ is_utf8 = UTF8NESS_IMMATERIAL;
+ }
- my_setlocale(LC_ALL, save_global);
- _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
- my_setlocale(LC_ALL, save_thread);
- Safefree(save_global);
- Safefree(save_thread);
+ break;
+ }
# endif
- LOCALECONV_UNLOCK;
- break;
+ case CODESET:
-# ifdef TS_W32_BROKEN_LOCALECONV
+ /* The trivial case */
+ if (isNAME_C_OR_POSIX(locale)) {
+ retval = C_codeset;
+ break;
+ }
- case RADIXCHAR:
+# ifdef WIN32
- /* For this, we output a known simple floating point number to
- * a buffer, and parse it, looking for the radix */
+ /* This function retrieves the code page. It is subject to change, but
+ * is documented and has been stable for many releases */
+ UINT ___lc_codepage_func(void);
- if (toggle) {
- STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
- }
+ retval = save_to_buffer(Perl_form(aTHX_ "%d", ___lc_codepage_func()),
+ retbufp, retbuf_sizep);
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "locale='%s' cp=%s\n",
+ locale, retval));
+ break;
- if (PL_langinfo_bufsize < 10) {
- PL_langinfo_bufsize = 10;
- Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
- }
+# else
- needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
- "%.1f", 1.5);
- if (needed_size >= (int) PL_langinfo_bufsize) {
- PL_langinfo_bufsize = needed_size + 1;
- Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
- needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize,
- "%.1f", 1.5);
- assert(needed_size < (int) PL_langinfo_bufsize);
- }
+ /* The codeset is important, but khw did not figure out a way for it to
+ * be retrieved on non-Windows boxes without nl_langinfo(). But even
+ * if we can't get it directly, we can usually determine if it is a
+ * UTF-8 locale or not. If it is UTF-8, we (correctly) use that for
+ * the code set. */
+
+# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+
+ /* If libc mbtowc() evaluates the bytes that form the REPLACEMENT
+ * CHARACTER as that Unicode code point, this has to be a UTF-8 locale.
+ * */
+ wchar_t wc = 0;
+ (void) Perl_mbtowc_(aTHX_ NULL, NULL, 0);/* Reset shift state */
+ int mbtowc_ret = Perl_mbtowc_(aTHX_ &wc,
+ STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
+ if (mbtowc_ret >= 0 && wc == UNICODE_REPLACEMENT) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "mbtowc returned REPLACEMENT\n"));
+ retval = "UTF-8";
+ break;
+ }
- ptr = PL_langinfo_buf;
- e = PL_langinfo_buf + PL_langinfo_bufsize;
+ /* Here, it isn't a UTF-8 locale. */
- /* Find the '1' */
- while (ptr < e && *ptr != '1') {
- ptr++;
- }
- ptr++;
+# else /* mbtowc() is not available. */
- /* Find the '5' */
- item_start = ptr;
- while (ptr < e && *ptr != '5') {
- ptr++;
- }
+ /* Sling together several possibilities, depending on platform
+ * capabilities and what we found.
+ *
+ * For non-English locales or non-dollar currency locales, we likely
+ * will find out whether a locale is UTF-8 or not */
- /* Everything in between is the radix string */
- if (ptr >= e) {
- PL_langinfo_buf[0] = '?';
- PL_langinfo_buf[1] = '\0';
- }
- else {
- *ptr = '\0';
- Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char);
+ utf8ness_t is_utf8 = UTF8NESS_UNKNOWN;
+ const char * scratch_buf = NULL;
+
+# if defined(USE_LOCALE_MONETARY) && defined(HAS_LOCALECONV)
+
+ /* Can't use this method unless localeconv() is available, as that's
+ * the way we find out the currency symbol. */
+
+ /* First try looking at the currency symbol (via a recursive call) to
+ * see if it disambiguates things. Often that will be in the native
+ * script, and if the symbol isn't legal UTF-8, we know that the locale
+ * isn't either. */
+ (void) my_langinfo_c(CRNCYSTR, LC_MONETARY, locale, &scratch_buf, NULL,
+ &is_utf8);
+ Safefree(scratch_buf);
+
+# endif
+# ifdef USE_LOCALE_TIME
+
+ /* If we have ruled out being UTF-8, no point in checking further. */
+ if (is_utf8 != UTF8NESS_NO) {
+
+ /* But otherwise do check more. This is done even if the currency
+ * symbol looks to be UTF-8, just in case that's a false positive.
+ *
+ * Look at the LC_TIME entries, like the names of the months or
+ * weekdays. We quit at the first one that is illegal UTF-8 */
+
+ utf8ness_t this_is_utf8 = UTF8NESS_UNKNOWN;
+ const int times[] = {
+ DAY_1, DAY_2, DAY_3, DAY_4, DAY_5, DAY_6, DAY_7,
+ MON_1, MON_2, MON_3, MON_4, MON_5, MON_6, MON_7, MON_8,
+ MON_9, MON_10, MON_11, MON_12,
+ ALT_DIGITS, AM_STR, PM_STR,
+ ABDAY_1, ABDAY_2, ABDAY_3, ABDAY_4, ABDAY_5, ABDAY_6,
+ ABDAY_7,
+ ABMON_1, ABMON_2, ABMON_3, ABMON_4, ABMON_5, ABMON_6,
+ ABMON_7, ABMON_8, ABMON_9, ABMON_10, ABMON_11, ABMON_12
+ };
+
+ /* The code in the recursive call can handle switching the locales,
+ * but by doing it here, we avoid switching each iteration of the
+ * loop */
+ const char * orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+
+ for (PERL_UINT_FAST8_T i = 0; i < C_ARRAY_LENGTH(times); i++) {
+ scratch_buf = NULL;
+ (void) my_langinfo_c(times[i], LC_TIME, locale, &scratch_buf,
+ NULL, &this_is_utf8);
+ Safefree(scratch_buf);
+ if (this_is_utf8 == UTF8NESS_NO) {
+ is_utf8 = UTF8NESS_NO;
+ break;
}
- if (toggle) {
- RESTORE_LC_NUMERIC();
+ if (this_is_utf8 == UTF8NESS_YES) {
+ is_utf8 = UTF8NESS_YES;
}
+ }
- retval = PL_langinfo_buf;
- break;
+ /* Here we have gone through all the LC_TIME elements. is_utf8 has
+ * been set as follows:
+ * UTF8NESS_NO If at least one is't legal UTF-8
+ * UTF8NESS_IMMMATERIAL If all are ASCII
+ * UTF8NESS_YES If all are legal UTF-8 (including
+ * ASCIIi), and at least one isn't
+ * ASCII. */
-# else
+ restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
+ }
- case RADIXCHAR: /* No special handling needed */
+# endif /* LC_TIME */
-# endif
+ /* If nothing examined above rules out it being UTF-8, and at least one
+ * thing fits as UTF-8 (and not plain ASCII), assume the codeset is
+ * UTF-8. */
+ if (is_utf8 == UTF8NESS_YES) {
+ retval = "UTF-8";
+ break;
+ }
- case THOUSEP:
+ /* Here, nothing examined indicates that the codeset is UTF-8. But
+ * what is it? The other locale categories are not likely to be of
+ * further help:
+ *
+ * LC_NUMERIC Only a few locales in the world have a non-ASCII radix
+ * or group separator.
+ * LC_CTYPE This code wouldn't be compiled if mbtowc() existed and
+ * was reliable. This is unlikely in C99. There are
+ * other functions that could be used instead, but are
+ * they going to exist, and be able to distinguish between
+ * UTF-8 and 8859-1? Deal with this only if it becomes
+ * necessary.
+ * LC_MESSAGES The strings returned from strerror() would seem likely
+ * candidates, but experience has shown that many systems
+ * don't actually have translations installed for them.
+ * They are instead always in English, so everything in
+ * them is ASCII, which is of no help to us. A Configure
+ * probe could possibly be written to see if this platform
+ * has non-ASCII error messages. But again, wait until it
+ * turns out to be an actual problem. */
+
+# endif /* ! mbtowc() */
+
+ /* Rejoin the mbtowc available/not-available cases.
+ *
+ * We got here only because we haven't been able to find the codeset.
+ * The only other option khw could think of is to see if the codeset is
+ * part of the locale name. This is very less than ideal; often there
+ * is no code set in the name; and at other times they even lie.
+ *
+ * But there is an XPG standard syntax, which many locales follow:
+ *
+ * language[_territory[.codeset]][@modifier]
+ *
+ * So we take the part between the dot and any '@' */
+ retval = (const char *) strchr(locale, '.');
+ if (! retval) {
+ retval = ""; /* Alas, no dot */
+ break;
+ }
- if (toggle) {
- STORE_LC_NUMERIC_FORCE_TO_UNDERLYING();
- }
+ /* Don't include the dot */
+ retval++;
+
+ /* And stop before any '@' */
+ const char * modifier = strchr(retval, '@');
+ if (modifier) {
+ char * code_set_name;
+ const Size_t name_len = modifier - retval;
+ Newx(code_set_name, name_len + 1, char); /* +1 for NUL */
+ my_strlcpy(code_set_name, retval, name_len + 1);
+ SAVEFREEPV(code_set_name);
+ retval = code_set_name;
+ }
- LOCALECONV_LOCK; /* Prevent interference with other threads
- using localeconv() */
-
-# ifdef TS_W32_BROKEN_LOCALECONV
-
- /* This should only be for the thousands separator. A
- * different work around would be to use GetNumberFormat on a
- * known value and parse the result to find the separator */
- save_thread = savepv(my_setlocale(LC_ALL, NULL));
- _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
- save_global = savepv(my_setlocale(LC_ALL, NULL));
- my_setlocale(LC_ALL, save_thread);
-# if 0
- /* This is the start of code that for broken Windows replaces
- * the above and below code, and instead calls
- * GetNumberFormat() and then would parse that to find the
- * thousands separator. It needs to handle UTF-16 vs -8
- * issues. */
-
- needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize);
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "%s: %d: return from GetNumber, count=%d, val=%s\n",
- __FILE__, __LINE__, needed_size, PL_langinfo_buf));
+# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
+
+ /* When these functions, are available, they were tried earlier and
+ * indicated that the locale did not act like a proper UTF-8 one. So
+ * if it claims to be UTF-8, it is a lie */
+ if (is_codeset_name_UTF8(retval)) {
+ retval = "";
+ break;
+ }
# endif
-# endif
- lc = localeconv();
- if (! lc) {
- temp = "";
- }
- else {
- temp = (item == RADIXCHAR)
- ? lc->decimal_point
- : lc->thousands_sep;
- if (! temp) {
- temp = "";
- }
- }
+ /* Otherwise the code set name is considered to be everything between
+ * the dot and the '@' */
+ retval = save_to_buffer(retval, retbufp, retbuf_sizep);
+
+ break;
- retval = save_to_buffer(temp, &PL_langinfo_buf,
- &PL_langinfo_bufsize, 0);
+# endif
-# ifdef TS_W32_BROKEN_LOCALECONV
+ } /* Giant switch() of nl_langinfo() items */
- my_setlocale(LC_ALL, save_global);
- _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
- my_setlocale(LC_ALL, save_thread);
- Safefree(save_global);
- Safefree(save_thread);
+ restore_toggled_locale_i(cat_index, orig_switched_locale);
+# ifdef USE_LOCALE_CTYPE
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
# endif
- LOCALECONV_UNLOCK;
+ if (utf8ness) {
+ *utf8ness = is_utf8;
+ }
- if (toggle) {
- RESTORE_LC_NUMERIC();
- }
+ return retval;
- break;
+# endif /* All the implementations of my_langinfo() */
-# endif
-# ifdef HAS_STRFTIME
-
- /* These are defined by C89, so we assume that strftime supports
- * them, and so are returned unconditionally; they may not be what
- * the locale actually says, but should give good enough results
- * for someone using them as formats (as opposed to trying to parse
- * them to figure out what the locale says). The other format
- * items are actually tested to verify they work on the platform */
- case D_FMT: return "%x";
- case T_FMT: return "%X";
- case D_T_FMT: return "%c";
-
- /* These formats are only available in later strfmtime's */
- case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM:
-
- /* The rest can be gotten from most versions of strftime(). */
- case ABDAY_1: case ABDAY_2: case ABDAY_3:
- case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7:
- case ALT_DIGITS:
- case AM_STR: case PM_STR:
- case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4:
- case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8:
- case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12:
- case DAY_1: case DAY_2: case DAY_3: case DAY_4:
- case DAY_5: case DAY_6: case DAY_7:
- case MON_1: case MON_2: case MON_3: case MON_4:
- case MON_5: case MON_6: case MON_7: case MON_8:
- case MON_9: case MON_10: case MON_11: case MON_12:
-
- init_tm(&tm); /* Precaution against core dumps */
- tm.tm_sec = 30;
- tm.tm_min = 30;
- tm.tm_hour = 6;
- tm.tm_year = 2017 - 1900;
- tm.tm_wday = 0;
- tm.tm_mon = 0;
-
- GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough);
-
- switch (item) {
- default:
- Perl_croak(aTHX_
- "panic: %s: %d: switch case: %d problem",
- __FILE__, __LINE__, item);
- NOT_REACHED; /* NOTREACHED */
-
- case PM_STR: tm.tm_hour = 18;
- case AM_STR:
- format = "%p";
- break;
-
- case ABDAY_7: tm.tm_wday++;
- case ABDAY_6: tm.tm_wday++;
- case ABDAY_5: tm.tm_wday++;
- case ABDAY_4: tm.tm_wday++;
- case ABDAY_3: tm.tm_wday++;
- case ABDAY_2: tm.tm_wday++;
- case ABDAY_1:
- format = "%a";
- break;
-
- case DAY_7: tm.tm_wday++;
- case DAY_6: tm.tm_wday++;
- case DAY_5: tm.tm_wday++;
- case DAY_4: tm.tm_wday++;
- case DAY_3: tm.tm_wday++;
- case DAY_2: tm.tm_wday++;
- case DAY_1:
- format = "%A";
- break;
-
- case ABMON_12: tm.tm_mon++;
- case ABMON_11: tm.tm_mon++;
- case ABMON_10: tm.tm_mon++;
- case ABMON_9: tm.tm_mon++;
- case ABMON_8: tm.tm_mon++;
- case ABMON_7: tm.tm_mon++;
- case ABMON_6: tm.tm_mon++;
- case ABMON_5: tm.tm_mon++;
- case ABMON_4: tm.tm_mon++;
- case ABMON_3: tm.tm_mon++;
- case ABMON_2: tm.tm_mon++;
- case ABMON_1:
- format = "%b";
- break;
-
- case MON_12: tm.tm_mon++;
- case MON_11: tm.tm_mon++;
- case MON_10: tm.tm_mon++;
- case MON_9: tm.tm_mon++;
- case MON_8: tm.tm_mon++;
- case MON_7: tm.tm_mon++;
- case MON_6: tm.tm_mon++;
- case MON_5: tm.tm_mon++;
- case MON_4: tm.tm_mon++;
- case MON_3: tm.tm_mon++;
- case MON_2: tm.tm_mon++;
- case MON_1:
- format = "%B";
- break;
-
- case T_FMT_AMPM:
- format = "%r";
- return_format = TRUE;
- break;
-
- case ERA_D_FMT:
- format = "%Ex";
- return_format = TRUE;
- break;
-
- case ERA_T_FMT:
- format = "%EX";
- return_format = TRUE;
- break;
-
- case ERA_D_T_FMT:
- format = "%Ec";
- return_format = TRUE;
- break;
-
- case ALT_DIGITS:
- tm.tm_wday = 0;
- format = "%Ow"; /* Find the alternate digit for 0 */
- break;
- }
+/*--------------------------------------------------------------------------*/
- GCC_DIAG_RESTORE_STMT;
+} /* my_langinfo() */
- /* We can't use my_strftime() because it doesn't look at
- * tm_wday */
- while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize,
- format, &tm))
- {
- /* A zero return means one of:
- * a) there wasn't enough space in PL_langinfo_buf
- * b) the format, like a plain %p, returns empty
- * c) it was an illegal format, though some
- * implementations of strftime will just return the
- * illegal format as a plain character sequence.
- *
- * To quickly test for case 'b)', try again but precede
- * the format with a plain character. If that result is
- * still empty, the problem is either 'a)' or 'c)' */
-
- Size_t format_size = strlen(format) + 1;
- Size_t mod_size = format_size + 1;
- char * mod_format;
- char * temp_result;
-
- Newx(mod_format, mod_size, char);
- Newx(temp_result, PL_langinfo_bufsize, char);
- *mod_format = ' ';
- my_strlcpy(mod_format + 1, format, mod_size);
- len = strftime(temp_result,
- PL_langinfo_bufsize,
- mod_format, &tm);
- Safefree(mod_format);
- Safefree(temp_result);
-
- /* If 'len' is non-zero, it means that we had a case like
- * %p which means the current locale doesn't use a.m. or
- * p.m., and that is valid */
- if (len == 0) {
-
- /* Here, still didn't work. If we get well beyond a
- * reasonable size, bail out to prevent an infinite
- * loop. */
-
- if (PL_langinfo_bufsize > 100 * format_size) {
- *PL_langinfo_buf = '\0';
- }
- else {
- /* Double the buffer size to retry; Add 1 in case
- * original was 0, so we aren't stuck at 0. */
- PL_langinfo_bufsize *= 2;
- PL_langinfo_bufsize++;
- Renew(PL_langinfo_buf, PL_langinfo_bufsize, char);
- continue;
- }
- }
+#endif /* USE_LOCALE */
- break;
- }
+char *
+Perl_my_strftime(pTHX_ const char *fmt, int sec, int min, int hour, int mday, int mon, int year, int wday, int yday, int isdst)
+{
+#ifdef HAS_STRFTIME
- /* Here, we got a result.
- *
- * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the
- * alternate format for wday 0. If the value is the same as
- * the normal 0, there isn't an alternate, so clear the buffer.
- * */
- if ( item == ALT_DIGITS
- && strEQ(PL_langinfo_buf, "0"))
- {
- *PL_langinfo_buf = '\0';
- }
+/*
+=for apidoc_section $time
+=for apidoc my_strftime
- /* ALT_DIGITS is problematic. Experiments on it showed that
- * strftime() did not always work properly when going from
- * alt-9 to alt-10. Only a few locales have this item defined,
- * and in all of them on Linux that khw was able to find,
- * nl_langinfo() merely returned the alt-0 character, possibly
- * doubled. Most Unicode digits are in blocks of 10
- * consecutive code points, so that is sufficient information
- * for those scripts, as we can infer alt-1, alt-2, .... But
- * for a Japanese locale, a CJK ideographic 0 is returned, and
- * the CJK digits are not in code point order, so you can't
- * really infer anything. The localedef for this locale did
- * specify the succeeding digits, so that strftime() works
- * properly on them, without needing to infer anything. But
- * the nl_langinfo() return did not give sufficient information
- * for the caller to understand what's going on. So until
- * there is evidence that it should work differently, this
- * returns the alt-0 string for ALT_DIGITS.
- *
- * wday was chosen because its range is all a single digit.
- * Things like tm_sec have two digits as the minimum: '00' */
+strftime(), but with a different API so that the return value is a pointer
+to the formatted result (which MUST be arranged to be FREED BY THE
+CALLER). This allows this function to increase the buffer size as needed,
+so that the caller doesn't have to worry about that.
- retval = PL_langinfo_buf;
+On failure it returns NULL.
- /* If to return the format, not the value, overwrite the buffer
- * with it. But some strftime()s will keep the original format
- * if illegal, so change those to "" */
- if (return_format) {
- if (strEQ(PL_langinfo_buf, format)) {
- *PL_langinfo_buf = '\0';
- }
- else {
- retval = save_to_buffer(format, &PL_langinfo_buf,
- &PL_langinfo_bufsize, 0);
- }
- }
+Note that yday and wday effectively are ignored by this function, as
+mini_mktime() overwrites them.
- break;
+Also note that it is always executed in the underlying C<LC_TIME> locale of
+the program, giving results based on that locale.
+=cut
+ */
+ PERL_ARGS_ASSERT_MY_STRFTIME;
+
+ /* An empty format yields an empty result */
+ const int fmtlen = strlen(fmt);
+ if (fmtlen == 0) {
+ char *ret;
+ Newxz (ret, 1, char);
+ return ret;
+ }
+
+ /* Set mytm to now */
+ struct tm mytm;
+ init_tm(&mytm); /* XXX workaround - see Perl_init_tm() */
+
+ /* Override with the passed-in values */
+ mytm.tm_sec = sec;
+ mytm.tm_min = min;
+ mytm.tm_hour = hour;
+ mytm.tm_mday = mday;
+ mytm.tm_mon = mon;
+ mytm.tm_year = year;
+ mytm.tm_wday = wday;
+ mytm.tm_yday = yday;
+ mytm.tm_isdst = isdst;
+ mini_mktime(&mytm);
+
+ /* use libc to get the values for tm_gmtoff and tm_zone on platforms that
+ * have them [perl #18238] */
+#if defined(HAS_MKTIME) && (defined(HAS_TM_TM_GMTOFF) || defined(HAS_TM_TM_ZONE))
+ struct tm mytm2;
+ mytm2 = mytm;
+ MKTIME_LOCK;
+ mktime(&mytm2);
+ MKTIME_UNLOCK;
+# ifdef HAS_TM_TM_GMTOFF
+ mytm.tm_gmtoff = mytm2.tm_gmtoff;
# endif
+# ifdef HAS_TM_TM_ZONE
+ mytm.tm_zone = mytm2.tm_zone;
+# endif
+#endif
+#if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
+
+ const char * orig_CTYPE_LOCALE = toggle_locale_c(LC_CTYPE,
+ querylocale_c(LC_TIME));
+#endif
+ /* Guess an initial size for the returned string based on an expansion
+ * factor of the input format, but with a minimum that should handle most
+ * common cases. If this guess is too small, we will try again with a
+ * larger one */
+ int bufsize = MAX(fmtlen * 2, 64);
+
+ char *buf = NULL; /* Makes Renew() act as Newx() on the first iteration */
+ do {
+ Renew(buf, bufsize, char);
+
+ GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
+
+ STRFTIME_LOCK;
+ int len = strftime(buf, bufsize, fmt, &mytm);
+ STRFTIME_UNLOCK;
+
+ GCC_DIAG_RESTORE_STMT;
+
+ /* A non-zero return indicates success. But to make sure we're not
+ * dealing with some rogue strftime that returns how much space it
+ * needs instead of 0 when there isn't enough, check that the return
+ * indicates we have at least one byte of spare space (which will be
+ * used for the terminating NUL). */
+ if (inRANGE(len, 1, bufsize - 1)) {
+ goto strftime_success;
}
+
+ /* There are several possible reasons for a 0 return code for a
+ * non-empty format, and they are not trivial to tease apart. This
+ * issue is a known bug in the strftime() API. What we do to cope is
+ * to assume that the reason is not enough space in the buffer, so
+ * increase it and try again. */
+ bufsize *= 2;
+
+ /* But don't just keep increasing the size indefinitely. Stop when it
+ * becomes obvious that the reason for failure is something besides not
+ * enough space. The most likely largest expanding format is %c. On
+ * khw's Linux box, the maximum result of this is 67 characters, in the
+ * km_KH locale. If a new script comes along that uses 4 UTF-8 bytes
+ * per character, and with a similar expansion factor, that would be a
+ * 268:2 byte ratio, or a bit more than 128:1 = 2**7:1. Some strftime
+ * implementations allow you to say %1000c to pad to 1000 bytes. This
+ * shows that it is impossible to implement this without a heuristic
+ * (that can fail). But it indicates we need to be generous in the
+ * upper limit before failing. The previous heuristic used was too
+ * stingy. Since the size doubles per iteration, it doesn't take many
+ * to reach the limit */
+ } while (bufsize < ((1 << 11) + 1) * fmtlen);
+
+ /* Here, strftime() returned 0, and it likely wasn't for lack of space.
+ * There are two possible reasons:
+ *
+ * First is that the result is legitimately 0 length. This can happen
+ * when the format is precisely "%p". That is the only documented format
+ * that can have an empty result. */
+ if (strEQ(fmt, "%p")) {
+ Renew(buf, 1, char);
+ *buf = '\0';
+ goto strftime_success;
}
- return retval;
+ /* The other reason is that the format string is malformed. Probably it is
+ * an illegal conversion specifier.) */
+ Safefree(buf);
+ return NULL;
+ strftime_success:
+
+#if defined(USE_LOCALE_CTYPE) && defined(USE_LOCALE_TIME)
+
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_LOCALE);
+
+#endif
+ return buf;
+
+#else
+ Perl_croak(aTHX_ "panic: no strftime");
+ return NULL;
#endif
}
+char *
+Perl_my_strftime8_temp(pTHX_ const char *fmt, int sec, int min, int hour, int mday,
+ int mon, int year, int wday, int yday, int isdst,
+ utf8ness_t * utf8ness)
+{ /* Documented above */
+ char * retval = my_strftime(fmt, sec, min, hour, mday, mon, year, wday,
+ yday, isdst);
+
+ PERL_ARGS_ASSERT_MY_STRFTIME8_TEMP;
+
+ if (utf8ness) {
+
+#ifdef USE_LOCALE_TIME
+ *utf8ness = get_locale_string_utf8ness_i(retval,
+ LOCALE_UTF8NESS_UNKNOWN,
+ NULL, LC_TIME_INDEX_);
+#else
+ *utf8ness = UTF8NESS_IMMATERIAL;
+#endif
+
+ }
+
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "fmt=%s, retval=%s", fmt,
+ ((is_utf8_string((U8 *) retval, 0))
+ ? retval
+ :_byte_dump_string((U8 *) retval, strlen(retval), 0)));
+ if (utf8ness) PerlIO_printf(Perl_debug_log, "; utf8ness=%d",
+ (int) *utf8ness);
+ PerlIO_printf(Perl_debug_log, "\n");
+ );
+
+ return retval;
+}
+
/*
* Initialize locale awareness.
*/
@@ -3150,53 +5011,79 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
* error handling.
*
* Besides some asserts, data structure initialization, and specific
- * platform complications, this routine is effectively just two things.
+ * platform complications, this routine is effectively represented by this
+ * pseudo-code:
*
- * a) setlocale(LC_ALL, "");
+ * setlocale(LC_ALL, ""); x
+ * foreach (subcategory) { x
+ * curlocales[f(subcategory)] = setlocale(subcategory, NULL); x
+ * } x
+ * if (platform_so_requires) {
+ * foreach (subcategory) {
+ * PL_curlocales[f(subcategory)] = curlocales[f(subcategory)]
+ * }
+ * }
+ * foreach (subcategory) {
+ * if (needs_special_handling[f(subcategory)] &this_subcat_handler
+ * }
*
- * which sets LC_ALL to the values in the current environment.
+ * This sets all the categories to the values in the current environment,
+ * saves them temporarily in curlocales[] until they can be handled and/or
+ * on some platforms saved in a per-thread array PL_curlocales[].
*
- * And for each individual category 'foo' whose value we care about:
+ * f(foo) is a mapping from the opaque system category numbers to small
+ * non-negative integers used most everywhere in this file as indices into
+ * arrays (such as curlocales[]) so the program doesn't have to otherwise
+ * deal with the opaqueness.
*
- * b) save_foo = setlocale(LC_foo, NULL); handle_foo(save_foo);
+ * If the platform doesn't have LC_ALL, the lines marked 'x' above are
+ * effectively replaced by:
+ * foreach (subcategory) { y
+ * curlocales[f(subcategory)] = setlocale(subcategory, ""); y
+ * } y
*
- * (We don't tend to care about categories like LC_PAPER, for example.)
+ * The only differences being the lack of an LC_ALL call, and using ""
+ * instead of NULL in the setlocale calls.
*
- * But there are complications. On systems without LC_ALL, it emulates
- * step a) by looping through all the categories, and doing
+ * But there are, of course, complications.
*
- * setlocale(LC_foo, "");
+ * it has to deal with if this is an embedded perl, whose locale doesn't
+ * come from the environment, but has been set up by the caller. This is
+ * pretty simply handled: the "" in the setlocale calls is not a string
+ * constant, but a variable which is set to NULL in the embedded case.
*
- * on each.
+ * But the major complication is handling failure and doing fallback. All
+ * the code marked 'x' or 'y' above is actually enclosed in an outer loop,
+ * using the array trial_locales[]. On entry, trial_locales[] is
+ * initialized to just one entry, containing the NULL or "" locale argument
+ * shown above. If, as is almost always the case, everything works, it
+ * exits after just the one iteration, going on to the next step.
*
- * And it has to deal with if this is an embedded perl, whose locale
- * doesn't come from the environment, but has been set up by the caller.
- * This is pretty simply handled: the "" in the setlocale calls is not a
- * string constant, but a variable which is set to NULL in the embedded
- * case.
- *
- * But the major complication is handling failure and doing fallback.
- * There is an array, trial_locales, the elements of which are looped over
- * until the locale is successfully set. The array is initialized with
- * just one element, for
- * setlocale(LC_ALL, $NULL_or_empty)
- * If that works, as it almost always does, there's no more elements and
- * the loop iterates just the once. Otherwise elements are added for each
- * of the environment variables that POSIX dictates should control the
- * program, in priority order, with a final one being "C". The loop is
- * repeated until the first one succeeds. If all fail, we limp along with
- * whatever state we got to. If there is no LC_ALL, an inner loop is run
- * through all categories (making things look complex).
+ * But if there is a failure, the code tries its best to honor the
+ * environment as much as possible. It self-modifies trial_locales[] to
+ * have more elements, one for each of the POSIX-specified settings from
+ * the environment, such as LANG, ending in the ultimate fallback, the C
+ * locale. Thus if there is something bogus with a higher priority
+ * environment variable, it will try with the next highest, until something
+ * works. If everything fails, it limps along with whatever state it got
+ * to.
*
* A further complication is that Windows has an additional fallback, the
* user-default ANSI code page obtained from the operating system. This is
* added as yet another loop iteration, just before the final "C"
*
- * On Ultrix, the locale MUST come from the environment, so there is
- * preliminary code to set it. I (khw) am not sure that it is necessary,
- * and that this couldn't be folded into the loop, but barring any real
- * platforms to test on, it's staying as-is
- */
+ * A slight complication is that in embedded Perls, the locale may already
+ * be set-up, and we don't want to get it from the normal environment
+ * variables. This is handled by having a special environment variable
+ * indicate we're in this situation. We simply set setlocale's 2nd
+ * parameter to be a NULL instead of "". That indicates to setlocale that
+ * it is not to change anything, but to return the current value,
+ * effectively initializing perl's db to what the locale already is.
+ *
+ * We play the same trick with NULL if a LC_ALL succeeds. We call
+ * setlocale() on the individual categories with NULL to get their existing
+ * values for our db, instead of trying to change them.
+ * */
int ok = 1;
@@ -3215,7 +5102,13 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT"))
? NULL
: "";
- const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */
+ typedef struct trial_locales_struct_s {
+ const char* trial_locale;
+ const char* fallback_desc;
+ const char* fallback_name;
+ } trial_locales_struct;
+ /* 5 = 1 each for "", LC_ALL, LANG, (Win32) system default locale, C */
+ trial_locales_struct trial_locales[5];
unsigned int trial_locales_count;
const char * const lc_all = PerlEnv_getenv("LC_ALL");
const char * const lang = PerlEnv_getenv("LANG");
@@ -3233,165 +5126,140 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
*bad_lang_use_once
&& strNE("0", bad_lang_use_once)))));
- /* setlocale() return vals; not copied so must be looked at immediately */
- const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
-
/* current locale for given category; should have been copied so aren't
* volatile */
const char * curlocales[NOMINAL_LC_ALL_INDEX + 1];
-# ifdef WIN32
-
- /* In some systems you can find out the system default locale
- * and use that as the fallback locale. */
-# define SYSTEM_DEFAULT_LOCALE
-# endif
-# ifdef SYSTEM_DEFAULT_LOCALE
-
- const char *system_default_locale = NULL;
-
-# endif
-
# ifndef DEBUGGING
# define DEBUG_LOCALE_INIT(a,b,c)
# else
DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT")));
-# define DEBUG_LOCALE_INIT(category, locale, result) \
- STMT_START { \
- if (debug_initialization) { \
- PerlIO_printf(Perl_debug_log, \
- "%s:%d: %s\n", \
- __FILE__, __LINE__, \
- setlocale_debug_string(category, \
- locale, \
- result)); \
- } \
- } STMT_END
+# define DEBUG_LOCALE_INIT(cat_index, locale, result) \
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "%s\n", \
+ setlocale_debug_string_i(cat_index, locale, result)));
/* Make sure the parallel arrays are properly set up */
# ifdef USE_LOCALE_NUMERIC
- assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC);
- assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC"));
+ assert(categories[LC_NUMERIC_INDEX_] == LC_NUMERIC);
+ assert(strEQ(category_names[LC_NUMERIC_INDEX_], "LC_NUMERIC"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK);
+ assert(category_masks[LC_NUMERIC_INDEX_] == LC_NUMERIC_MASK);
# endif
# endif
# ifdef USE_LOCALE_CTYPE
- assert(categories[LC_CTYPE_INDEX] == LC_CTYPE);
- assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE"));
+ assert(categories[LC_CTYPE_INDEX_] == LC_CTYPE);
+ assert(strEQ(category_names[LC_CTYPE_INDEX_], "LC_CTYPE"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK);
+ assert(category_masks[LC_CTYPE_INDEX_] == LC_CTYPE_MASK);
# endif
# endif
# ifdef USE_LOCALE_COLLATE
- assert(categories[LC_COLLATE_INDEX] == LC_COLLATE);
- assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE"));
+ assert(categories[LC_COLLATE_INDEX_] == LC_COLLATE);
+ assert(strEQ(category_names[LC_COLLATE_INDEX_], "LC_COLLATE"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK);
+ assert(category_masks[LC_COLLATE_INDEX_] == LC_COLLATE_MASK);
# endif
# endif
# ifdef USE_LOCALE_TIME
- assert(categories[LC_TIME_INDEX] == LC_TIME);
- assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME"));
+ assert(categories[LC_TIME_INDEX_] == LC_TIME);
+ assert(strEQ(category_names[LC_TIME_INDEX_], "LC_TIME"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK);
+ assert(category_masks[LC_TIME_INDEX_] == LC_TIME_MASK);
# endif
# endif
# ifdef USE_LOCALE_MESSAGES
- assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES);
- assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES"));
+ assert(categories[LC_MESSAGES_INDEX_] == LC_MESSAGES);
+ assert(strEQ(category_names[LC_MESSAGES_INDEX_], "LC_MESSAGES"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK);
+ assert(category_masks[LC_MESSAGES_INDEX_] == LC_MESSAGES_MASK);
# endif
# endif
# ifdef USE_LOCALE_MONETARY
- assert(categories[LC_MONETARY_INDEX] == LC_MONETARY);
- assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY"));
+ assert(categories[LC_MONETARY_INDEX_] == LC_MONETARY);
+ assert(strEQ(category_names[LC_MONETARY_INDEX_], "LC_MONETARY"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK);
+ assert(category_masks[LC_MONETARY_INDEX_] == LC_MONETARY_MASK);
# endif
# endif
# ifdef USE_LOCALE_ADDRESS
- assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS);
- assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS"));
+ assert(categories[LC_ADDRESS_INDEX_] == LC_ADDRESS);
+ assert(strEQ(category_names[LC_ADDRESS_INDEX_], "LC_ADDRESS"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK);
+ assert(category_masks[LC_ADDRESS_INDEX_] == LC_ADDRESS_MASK);
# endif
# endif
# ifdef USE_LOCALE_IDENTIFICATION
- assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION);
- assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION"));
+ assert(categories[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION);
+ assert(strEQ(category_names[LC_IDENTIFICATION_INDEX_], "LC_IDENTIFICATION"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK);
+ assert(category_masks[LC_IDENTIFICATION_INDEX_] == LC_IDENTIFICATION_MASK);
# endif
# endif
# ifdef USE_LOCALE_MEASUREMENT
- assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT);
- assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT"));
+ assert(categories[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT);
+ assert(strEQ(category_names[LC_MEASUREMENT_INDEX_], "LC_MEASUREMENT"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK);
+ assert(category_masks[LC_MEASUREMENT_INDEX_] == LC_MEASUREMENT_MASK);
# endif
# endif
# ifdef USE_LOCALE_PAPER
- assert(categories[LC_PAPER_INDEX] == LC_PAPER);
- assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER"));
+ assert(categories[LC_PAPER_INDEX_] == LC_PAPER);
+ assert(strEQ(category_names[LC_PAPER_INDEX_], "LC_PAPER"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK);
+ assert(category_masks[LC_PAPER_INDEX_] == LC_PAPER_MASK);
# endif
# endif
# ifdef USE_LOCALE_TELEPHONE
- assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE);
- assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE"));
+ assert(categories[LC_TELEPHONE_INDEX_] == LC_TELEPHONE);
+ assert(strEQ(category_names[LC_TELEPHONE_INDEX_], "LC_TELEPHONE"));
+# ifdef USE_POSIX_2008_LOCALE
+ assert(category_masks[LC_TELEPHONE_INDEX_] == LC_TELEPHONE_MASK);
+# endif
+# endif
+# ifdef USE_LOCALE_NAME
+ assert(categories[LC_NAME_INDEX_] == LC_NAME);
+ assert(strEQ(category_names[LC_NAME_INDEX_], "LC_NAME"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK);
+ assert(category_masks[LC_NAME_INDEX_] == LC_NAME_MASK);
# endif
# endif
# ifdef USE_LOCALE_SYNTAX
- assert(categories[LC_SYNTAX_INDEX] == LC_SYNTAX);
- assert(strEQ(category_names[LC_SYNTAX_INDEX], "LC_SYNTAX"));
+ assert(categories[LC_SYNTAX_INDEX_] == LC_SYNTAX);
+ assert(strEQ(category_names[LC_SYNTAX_INDEX_], "LC_SYNTAX"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_SYNTAX_INDEX] == LC_SYNTAX_MASK);
+ assert(category_masks[LC_SYNTAX_INDEX_] == LC_SYNTAX_MASK);
# endif
# endif
# ifdef USE_LOCALE_TOD
- assert(categories[LC_TOD_INDEX] == LC_TOD);
- assert(strEQ(category_names[LC_TOD_INDEX], "LC_TOD"));
+ assert(categories[LC_TOD_INDEX_] == LC_TOD);
+ assert(strEQ(category_names[LC_TOD_INDEX_], "LC_TOD"));
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_TOD_INDEX] == LC_TOD_MASK);
+ assert(category_masks[LC_TOD_INDEX_] == LC_TOD_MASK);
# endif
# endif
# ifdef LC_ALL
- assert(categories[LC_ALL_INDEX] == LC_ALL);
- assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL"));
- assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX);
+ assert(categories[LC_ALL_INDEX_] == LC_ALL);
+ assert(strEQ(category_names[LC_ALL_INDEX_], "LC_ALL"));
+ STATIC_ASSERT_STMT(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX_);
# ifdef USE_POSIX_2008_LOCALE
- assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK);
+ assert(category_masks[LC_ALL_INDEX_] == LC_ALL_MASK);
# endif
# endif
# endif /* DEBUGGING */
/* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for
* why these particular incantations are used. */
-#ifdef HAS_MBRLEN
+# ifdef HAS_MBRLEN
memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps));
-#endif
-#ifdef HAS_MBRTOWC
+# endif
+# ifdef HAS_MBRTOWC
memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps));
-#endif
-#ifdef HAS_WCTOMBR
+# endif
+# ifdef HAS_WCTOMBR
wcrtomb(NULL, L'\0', &PL_wcrtomb_ps);
-#endif
-
- /* Initialize the cache of the program's UTF-8ness for the always known
- * locales C and POSIX */
- my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness,
- sizeof(PL_locale_utf8ness));
-
- /* See https://github.com/Perl/perl5/issues/17824 */
- Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *);
-
+# endif
# ifdef USE_THREAD_SAFE_LOCALE
# ifdef WIN32
@@ -3401,123 +5269,87 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
# endif
# ifdef USE_POSIX_2008_LOCALE
- PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
if (! PL_C_locale_obj) {
- Perl_croak_nocontext(
- "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno);
+ PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0);
}
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj);
+ if (! PL_C_locale_obj) {
+ locale_panic_(Perl_form(aTHX_
+ "Cannot create POSIX 2008 C locale object"));
}
-# endif
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "created C object %p\n",
+ PL_C_locale_obj));
-# ifdef USE_LOCALE_NUMERIC
+ /* Switch to using the POSIX 2008 interface now. This would happen below
+ * anyway, but deferring it can lead to leaks of memory that would also get
+ * malloc'd in the interim */
+ uselocale(PL_C_locale_obj);
- PL_numeric_radix_sv = newSVpvs(".");
+# ifdef USE_LOCALE_NUMERIC
+ PL_underlying_numeric_obj = duplocale(PL_C_locale_obj);
+
+# endif
# endif
+# ifdef USE_LOCALE_NUMERIC
+
+ PL_numeric_radix_sv = newSV(1);
+ PL_underlying_radix_sv = newSV(1);
+ Newxz(PL_numeric_name, 1, char); /* Single NUL character */
+ new_numeric("C", false);
-# if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE)
+# endif
+# ifdef USE_LOCALE_COLLATE
- /* Initialize our records. If we have POSIX 2008, we have LC_ALL */
- do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL));
+ Newxz(PL_collation_name, 1, char);
+ new_collate("C", false);
# endif
-# ifdef LOCALE_ENVIRON_REQUIRED
+# ifdef USE_LOCALE_CTYPE
- /*
- * Ultrix setlocale(..., "") fails if there are no environment
- * variables from which to get a locale name.
- */
+ Newxz(PL_ctype_name, 1, char);
+ new_ctype("C", false);
-# ifndef LC_ALL
-# error Ultrix without LC_ALL not implemented
-# else
+# endif
+# ifdef USE_PL_CURLOCALES
- {
- bool done = FALSE;
- if (lang) {
- sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init);
- DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]);
- if (sl_result[LC_ALL_INDEX])
- done = TRUE;
- else
- setlocale_failure = TRUE;
- }
- if (! setlocale_failure) {
- const char * locale_param;
- for (i = 0; i < LC_ALL_INDEX; i++) {
- locale_param = (! done && (lang || PerlEnv_getenv(category_names[i])))
- ? setlocale_init
- : NULL;
- sl_result[i] = do_setlocale_r(categories[i], locale_param);
- if (! sl_result[i]) {
- setlocale_failure = TRUE;
- }
- DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]);
- }
- }
+ /* Initialize our records. */
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ (void) emulate_setlocale_i(i, posix_setlocale(categories[i], NULL),
+ RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
+ __LINE__);
}
-# endif /* LC_ALL */
-# endif /* LOCALE_ENVIRON_REQUIRED */
+# endif
/* We try each locale in the list until we get one that works, or exhaust
* the list. Normally the loop is executed just once. But if setting the
* locale fails, inside the loop we add fallback trials to the array and so
* will execute the loop multiple times */
- trial_locales[0] = setlocale_init;
+ trial_locales_struct ts = {
+ .trial_locale = setlocale_init,
+ .fallback_desc = NULL,
+ .fallback_name = NULL,
+ };
+ trial_locales[0] = ts;
trial_locales_count = 1;
- for (i= 0; i < trial_locales_count; i++) {
- const char * trial_locale = trial_locales[i];
-
- if (i > 0) {
-
- /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED
- * when i==0, but I (khw) don't think that behavior makes much
- * sense */
- setlocale_failure = FALSE;
-
-# ifdef SYSTEM_DEFAULT_LOCALE
-# ifdef WIN32 /* Note that assumes Win32 has LC_ALL */
-
- /* On Windows machines, an entry of "" after the 0th means to use
- * the system default locale, which we now proceed to get. */
- if (strEQ(trial_locale, "")) {
- unsigned int j;
-
- /* Note that this may change the locale, but we are going to do
- * that anyway just below */
- system_default_locale = do_setlocale_c(LC_ALL, "");
- DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale);
-
- /* Skip if invalid or if it's already on the list of locales to
- * try */
- if (! system_default_locale) {
- goto next_iteration;
- }
- for (j = 0; j < trial_locales_count; j++) {
- if (strEQ(system_default_locale, trial_locales[j])) {
- goto next_iteration;
- }
- }
-
- trial_locale = system_default_locale;
- }
-# else
-# error SYSTEM_DEFAULT_LOCALE only implemented for Win32
-# endif
-# endif /* SYSTEM_DEFAULT_LOCALE */
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ curlocales[i] = NULL;
+ }
- } /* For i > 0 */
+ for (i= 0; i < trial_locales_count; i++) {
+ const char * trial_locale = trial_locales[i].trial_locale;
+ setlocale_failure = FALSE;
# ifdef LC_ALL
- sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale);
- DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]);
- if (! sl_result[LC_ALL_INDEX]) {
+ /* setlocale() return vals; not copied so must be looked at
+ * immediately. */
+ const char * sl_result[NOMINAL_LC_ALL_INDEX + 1];
+ sl_result[LC_ALL_INDEX_] = stdized_setlocale(LC_ALL, trial_locale);
+ DEBUG_LOCALE_INIT(LC_ALL_INDEX_, trial_locale, sl_result[LC_ALL_INDEX_]);
+ if (! sl_result[LC_ALL_INDEX_]) {
setlocale_failure = TRUE;
}
else {
@@ -3536,12 +5368,12 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
if (! setlocale_failure) {
unsigned int j;
for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
- curlocales[j]
- = savepv(do_setlocale_r(categories[j], trial_locale));
+ curlocales[j] = stdized_setlocale(categories[j], trial_locale);
if (! curlocales[j]) {
setlocale_failure = TRUE;
}
- DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]);
+ curlocales[j] = savepv(curlocales[j]);
+ DEBUG_LOCALE_INIT(j, trial_locale, curlocales[j]);
}
if (LIKELY(! setlocale_failure)) { /* All succeeded */
@@ -3565,14 +5397,11 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
# else /* !LC_ALL */
PerlIO_printf(Perl_error_log,
- "perl: warning: Setting locale failed for the categories:\n\t");
+ "perl: warning: Setting locale failed for the categories:\n");
for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
if (! curlocales[j]) {
- PerlIO_printf(Perl_error_log, category_names[j]);
- }
- else {
- Safefree(curlocales[j]);
+ PerlIO_printf(Perl_error_log, "\t%s\n", category_names[j]);
}
}
@@ -3656,21 +5485,35 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
* to change the behavior. */
if (lc_all) {
for (j = 0; j < trial_locales_count; j++) {
- if (strEQ(lc_all, trial_locales[j])) {
+ if (strEQ(lc_all, trial_locales[j].trial_locale)) {
goto done_lc_all;
}
}
- trial_locales[trial_locales_count++] = lc_all;
+ trial_locales_struct ts = {
+ .trial_locale = lc_all,
+ .fallback_desc = (strEQ(lc_all, "C")
+ ? "the standard locale"
+ : "a fallback locale"),
+ .fallback_name = lc_all,
+ };
+ trial_locales[trial_locales_count++] = ts;
}
done_lc_all:
if (lang) {
for (j = 0; j < trial_locales_count; j++) {
- if (strEQ(lang, trial_locales[j])) {
+ if (strEQ(lang, trial_locales[j].trial_locale)) {
goto done_lang;
}
}
- trial_locales[trial_locales_count++] = lang;
+ trial_locales_struct ts = {
+ .trial_locale = lang,
+ .fallback_desc = (strEQ(lang, "C")
+ ? "the standard locale"
+ : "a fallback locale"),
+ .fallback_name = lang,
+ };
+ trial_locales[trial_locales_count++] = ts;
}
done_lang:
@@ -3679,29 +5522,65 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
/* For Windows, we also try the system default locale before "C".
* (If there exists a Windows without LC_ALL we skip this because
* it gets too complicated. For those, the "C" is the next
- * fallback possibility). The "" is the same as the 0th element of
- * the array, but the code at the loop above knows to treat it
- * differently when not the 0th */
- trial_locales[trial_locales_count++] = "";
+ * fallback possibility). */
+ {
+ /* Note that this may change the locale, but we are going to do
+ * that anyway.
+ *
+ * Our normal Windows setlocale() implementation ignores the
+ * system default locale to make things work like POSIX. This
+ * is the only place where we want to consider it, so have to
+ * use wrap_wsetlocale(). */
+ const char *system_default_locale =
+ stdize_locale(LC_ALL,
+ wrap_wsetlocale(LC_ALL, ""),
+ &PL_stdize_locale_buf,
+ &PL_stdize_locale_bufsize,
+ __LINE__);
+ DEBUG_LOCALE_INIT(LC_ALL_INDEX_, "", system_default_locale);
+
+ /* Skip if invalid or if it's already on the list of locales to
+ * try */
+ if (! system_default_locale) {
+ goto done_system_default;
+ }
+ for (j = 0; j < trial_locales_count; j++) {
+ if (strEQ(system_default_locale, trial_locales[j].trial_locale)) {
+ goto done_system_default;
+ }
+ }
+
+ trial_locales_struct ts = {
+ .trial_locale = system_default_locale,
+ .fallback_desc = (strEQ(system_default_locale, "C")
+ ? "the standard locale"
+ : "the system default locale"),
+ .fallback_name = system_default_locale,
+ };
+ trial_locales[trial_locales_count++] = ts;
+ }
+ done_system_default:
# endif
for (j = 0; j < trial_locales_count; j++) {
- if (strEQ("C", trial_locales[j])) {
+ if (strEQ("C", trial_locales[j].trial_locale)) {
goto done_C;
}
}
- trial_locales[trial_locales_count++] = "C";
-
+ {
+ /* new scope to avoid C++ complaining about
+ initialization being bypassed by goto.
+ */
+ trial_locales_struct ts = {
+ .trial_locale = "C",
+ .fallback_desc = "the standard locale",
+ .fallback_name = "C",
+ };
+ trial_locales[trial_locales_count++] = ts;
+ }
done_C: ;
} /* end of first time through the loop */
-
-# ifdef WIN32
-
- next_iteration: ;
-
-# endif
-
} /* end of looping through the trial locales */
if (ok < 1) { /* If we tried to fallback */
@@ -3723,34 +5602,15 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) {
Safefree(curlocales[j]);
- curlocales[j] = savepv(do_setlocale_r(categories[j], NULL));
- DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]);
+ curlocales[j] = savepv(stdized_setlocale(categories[j], NULL));
+ DEBUG_LOCALE_INIT(j, NULL, curlocales[j]);
}
}
if (locwarn) {
- const char * description;
- const char * name = "";
- if (strEQ(trial_locales[i], "C")) {
- description = "the standard locale";
- name = "C";
- }
-
-# ifdef SYSTEM_DEFAULT_LOCALE
-
- else if (strEQ(trial_locales[i], "")) {
- description = "the system default locale";
- if (system_default_locale) {
- name = system_default_locale;
- }
- }
-
-# endif /* SYSTEM_DEFAULT_LOCALE */
+ const char * description = trial_locales[i].fallback_desc;
+ const char * name = trial_locales[i].fallback_name;
- else {
- description = "a fallback locale";
- name = trial_locales[i];
- }
if (name && strNE(name, "")) {
PerlIO_printf(Perl_error_log,
"perl: warning: %s %s (\"%s\").\n", msg, description, name);
@@ -3762,43 +5622,23 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
}
} /* End of tried to fallback */
- /* Done with finding the locales; update our records */
-
-# ifdef USE_LOCALE_CTYPE
+# ifdef USE_POSIX_2008_LOCALE
- new_ctype(curlocales[LC_CTYPE_INDEX]);
+ /* The stdized setlocales haven't affected the P2008 locales. Initialize
+ * them now, calculating LC_ALL only on the final go round, when all have
+ * been set. */
+ for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ (void) emulate_setlocale_i(i, curlocales[i],
+ RECALCULATE_LC_ALL_ON_FINAL_INTERATION,
+ __LINE__);
+ }
# endif
-# ifdef USE_LOCALE_COLLATE
-
- new_collate(curlocales[LC_COLLATE_INDEX]);
-# endif
-# ifdef USE_LOCALE_NUMERIC
-
- new_numeric(curlocales[LC_NUMERIC_INDEX]);
-
-# endif
+ /* Done with finding the locales; update the auxiliary records */
+ new_LC_ALL(NULL, false);
for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
-
-# if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE)
-
- /* This caches whether each category's locale is UTF-8 or not. This
- * may involve changing the locale. It is ok to do this at
- * initialization time before any threads have started, but not later
- * unless thread-safe operations are used.
- * Caching means that if the program heeds our dictate not to change
- * locales in threaded applications, this data will remain valid, and
- * it may get queried without having to change locales. If the
- * environment is such that all categories have the same locale, this
- * isn't needed, as the code will not change the locale; but this
- * handles the uncommon case where the environment has disparate
- * locales for the categories */
- (void) _is_cur_LC_category_utf8(categories[i]);
-
-# endif
-
Safefree(curlocales[i]);
}
@@ -3824,20 +5664,174 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
# endif
#endif /* USE_LOCALE */
-#ifdef DEBUGGING
/* So won't continue to output stuff */
DEBUG_INITIALIZATION_set(FALSE);
-#endif
-
return ok;
}
#ifdef USE_LOCALE_COLLATE
+STATIC void
+S_compute_collxfrm_coefficients(pTHX)
+{
+
+ /* A locale collation definition includes primary, secondary, tertiary,
+ * etc. weights for each character. To sort, the primary weights are used,
+ * and only if they compare equal, then the secondary weights are used, and
+ * only if they compare equal, then the tertiary, etc.
+ *
+ * strxfrm() works by taking the input string, say ABC, and creating an
+ * output transformed string consisting of first the primary weights,
+ * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the tertiary,
+ * etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters may not have
+ * weights at every level. In our example, let's say B doesn't have a
+ * tertiary weight, and A doesn't have a secondary weight. The constructed
+ * string is then going to be
+ * A¹B¹C¹ B²C² A³C³ ....
+ * This has the desired effect that strcmp() will look at the secondary or
+ * tertiary weights only if the strings compare equal at all higher
+ * priority weights. The spaces shown here, like in
+ * "A¹B¹C¹ A²B²C² "
+ * are not just for readability. In the general case, these must actually
+ * be bytes, which we will call here 'separator weights'; and they must be
+ * smaller than any other weight value, but since these are C strings, only
+ * the terminating one can be a NUL (some implementations may include a
+ * non-NUL separator weight just before the NUL). Implementations tend to
+ * reserve 01 for the separator weights. They are needed so that a shorter
+ * string's secondary weights won't be misconstrued as primary weights of a
+ * longer string, etc. By making them smaller than any other weight, the
+ * shorter string will sort first. (Actually, if all secondary weights are
+ * smaller than all primary ones, there is no need for a separator weight
+ * between those two levels, etc.)
+ *
+ * The length of the transformed string is roughly a linear function of the
+ * input string. It's not exactly linear because some characters don't
+ * have weights at all levels. When we call strxfrm() we have to allocate
+ * some memory to hold the transformed string. The calculations below try
+ * to find coefficients 'm' and 'b' for this locale so that m*x + b equals
+ * how much space we need, given the size of the input string in 'x'. If
+ * we calculate too small, we increase the size as needed, and call
+ * strxfrm() again, but it is better to get it right the first time to
+ * avoid wasted expensive string transformations.
+ *
+ * We use the string below to find how long the transformation of it is.
+ * Almost all locales are supersets of ASCII, or at least the ASCII
+ * letters. We use all of them, half upper half lower, because if we used
+ * fewer, we might hit just the ones that are outliers in a particular
+ * locale. Most of the strings being collated will contain a preponderance
+ * of letters, and even if they are above-ASCII, they are likely to have
+ * the same number of weight levels as the ASCII ones. It turns out that
+ * digits tend to have fewer levels, and some punctuation has more, but
+ * those are relatively sparse in text, and khw believes this gives a
+ * reasonable result, but it could be changed if experience so dictates. */
+ const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz";
+ char * x_longer; /* Transformed 'longer' */
+ Size_t x_len_longer; /* Length of 'x_longer' */
+
+ char * x_shorter; /* We also transform a substring of 'longer' */
+ Size_t x_len_shorter;
+
+ PL_in_utf8_COLLATE_locale = (PL_collation_standard)
+ ? 0
+ : is_locale_utf8(PL_collation_name);
+ PL_strxfrm_NUL_replacement = '\0';
+ PL_strxfrm_max_cp = 0;
+
+ /* mem_collxfrm_() is used get the transformation (though here we are
+ * interested only in its length). It is used because it has the
+ * intelligence to handle all cases, but to work, it needs some values of
+ * 'm' and 'b' to get it started. For the purposes of this calculation we
+ * use a very conservative estimate of 'm' and 'b'. This assumes a weight
+ * can be multiple bytes, enough to hold any UV on the platform, and there
+ * are 5 levels, 4 weight bytes, and a trailing NUL. */
+ PL_collxfrm_base = 5;
+ PL_collxfrm_mult = 5 * sizeof(UV);
+
+ /* Find out how long the transformation really is */
+ x_longer = mem_collxfrm_(longer,
+ sizeof(longer) - 1,
+ &x_len_longer,
+
+ /* We avoid converting to UTF-8 in the called
+ * function by telling it the string is in UTF-8
+ * if the locale is a UTF-8 one. Since the string
+ * passed here is invariant under UTF-8, we can
+ * claim it's UTF-8 even though it isn't. */
+ PL_in_utf8_COLLATE_locale);
+ Safefree(x_longer);
+
+ /* Find out how long the transformation of a substring of 'longer' is.
+ * Together the lengths of these transformations are sufficient to
+ * calculate 'm' and 'b'. The substring is all of 'longer' except the
+ * first character. This minimizes the chances of being swayed by outliers
+ * */
+ x_shorter = mem_collxfrm_(longer + 1,
+ sizeof(longer) - 2,
+ &x_len_shorter,
+ PL_in_utf8_COLLATE_locale);
+ Safefree(x_shorter);
+
+ /* If the results are nonsensical for this simple test, the whole locale
+ * definition is suspect. Mark it so that locale collation is not active
+ * at all for it. XXX Should we warn? */
+ if ( x_len_shorter == 0
+ || x_len_longer == 0
+ || x_len_shorter >= x_len_longer)
+ {
+ PL_collxfrm_mult = 0;
+ PL_collxfrm_base = 1;
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "Disabling locale collation for LC_COLLATE='%s';"
+ " length for shorter sample=%zu; longer=%zu\n",
+ PL_collation_name, x_len_shorter, x_len_longer));
+ }
+ else {
+ SSize_t base; /* Temporary */
+
+ /* We have both: m * strlen(longer) + b = x_len_longer
+ * m * strlen(shorter) + b = x_len_shorter;
+ * subtracting yields:
+ * m * (strlen(longer) - strlen(shorter))
+ * = x_len_longer - x_len_shorter
+ * But we have set things up so that 'shorter' is 1 byte smaller than
+ * 'longer'. Hence:
+ * m = x_len_longer - x_len_shorter
+ *
+ * But if something went wrong, make sure the multiplier is at least 1.
+ */
+ if (x_len_longer > x_len_shorter) {
+ PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
+ }
+ else {
+ PL_collxfrm_mult = 1;
+ }
+
+ /* mx + b = len
+ * so: b = len - mx
+ * but in case something has gone wrong, make sure it is non-negative
+ * */
+ base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
+ if (base < 0) {
+ base = 0;
+ }
+
+ /* Add 1 for the trailing NUL */
+ PL_collxfrm_base = base + 1;
+ }
+
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "?UTF-8 locale=%d; x_len_shorter=%zu, "
+ "x_len_longer=%zu,"
+ " collate multipler=%zu, collate base=%zu\n",
+ PL_in_utf8_COLLATE_locale,
+ x_len_shorter, x_len_longer,
+ PL_collxfrm_mult, PL_collxfrm_base));
+}
+
char *
-Perl__mem_collxfrm(pTHX_ const char *input_string,
+Perl_mem_collxfrm_(pTHX_ const char *input_string,
STRLEN len, /* Length of 'input_string' */
STRLEN *xlen, /* Set to length of returned string
(not including the collation index
@@ -3845,15 +5839,18 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
bool utf8 /* Is the input in UTF-8? */
)
{
-
- /* _mem_collxfrm() is a bit like strxfrm() but with two important
- * differences. First, it handles embedded NULs. Second, it allocates a bit
- * more memory than needed for the transformed data itself. The real
- * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to
- * the length of that, and doesn't include the collation index size.
+ /* mem_collxfrm_() is like strxfrm() but with two important differences.
+ * First, it handles embedded NULs. Second, it allocates a bit more memory
+ * than needed for the transformed data itself. The real transformed data
+ * begins at offset COLLXFRM_HDR_LEN. *xlen is set to the length of that,
+ * and doesn't include the collation index size.
+ *
+ * It is the caller's responsibility to eventually free the memory returned
+ * by this function.
+ *
* Please see sv_collxfrm() to see how this is used. */
-#define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
+# define COLLXFRM_HDR_LEN sizeof(PL_collation_ix)
char * s = (char *) input_string;
STRLEN s_strlen = strlen(input_string);
@@ -3862,16 +5859,29 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
STRLEN length_in_chars;
bool first_time = TRUE; /* Cleared after first loop iteration */
- PERL_ARGS_ASSERT__MEM_COLLXFRM;
+# ifdef USE_LOCALE_CTYPE
+ const char * orig_CTYPE_locale = NULL;
+# endif
+
+# if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
+ locale_t constructed_locale = (locale_t) 0;
+# endif
+
+ PERL_ARGS_ASSERT_MEM_COLLXFRM_;
/* Must be NUL-terminated */
assert(*(input_string + len) == '\0');
- /* If this locale has defective collation, skip */
- if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: locale's collation is defective\n"));
- goto bad;
+ if (PL_collxfrm_mult == 0) { /* unknown or bad */
+ if (PL_collxfrm_base != 0) { /* bad collation => skip */
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "mem_collxfrm_: locale's collation is defective\n"));
+ goto bad;
+ }
+
+ /* (mult, base) == (0,0) means we need to calculate mult and base
+ * before proceeding */
+ S_compute_collxfrm_coefficients(aTHX);
}
/* Replace any embedded NULs with the control that sorts before any others.
@@ -3912,6 +5922,16 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
try_non_controls < 2;
try_non_controls++)
{
+
+# ifdef USE_LOCALE_CTYPE
+
+ /* In this case we use isCNTRL_LC() below, which relies on
+ * LC_CTYPE, so that must be switched to correspond with the
+ * LC_COLLATE locale */
+ if (! try_non_controls && ! PL_in_utf8_COLLATE_locale) {
+ orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
+ }
+# endif
/* Look through all legal code points (NUL isn't) */
for (j = 1; j < 256; j++) {
char * x; /* j's xfrm plus collation index */
@@ -3932,7 +5952,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
cur_source[0] = (char) j;
/* Then transform it */
- x = _mem_collxfrm(cur_source, trial_len, &x_len,
+ x = mem_collxfrm_(cur_source, trial_len, &x_len,
0 /* The string is not in UTF-8 */);
/* Ignore any character that didn't successfully transform.
@@ -3956,6 +5976,10 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
}
} /* end of loop through all 255 characters */
+# ifdef USE_LOCALE_CTYPE
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
+# endif
+
/* Stop looking if found */
if (cur_min_x) {
break;
@@ -3965,18 +5989,18 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
* work in the locale, repeat the loop, looking for any
* character that works */
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: No control worked. Trying non-controls\n"));
+ "mem_collxfrm_: No control worked. Trying non-controls\n"));
} /* End of loop to try first the controls, then any char */
if (! cur_min_x) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Couldn't find any character to replace"
+ "mem_collxfrm_: Couldn't find any character to replace"
" embedded NULs in locale %s with", PL_collation_name));
goto bad;
}
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Replacing embedded NULs in locale %s with "
+ "mem_collxfrm_: Replacing embedded NULs in locale %s with "
"0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement));
Safefree(cur_min_x);
@@ -4089,7 +6113,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
cur_source[0] = (char) j;
/* Then transform it */
- x = _mem_collxfrm(cur_source, 1, &x_len, FALSE);
+ x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
/* If something went wrong (which it shouldn't), just
* ignore this code point */
@@ -4114,14 +6138,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
if (! cur_max_x) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Couldn't find any character to"
+ "mem_collxfrm_: Couldn't find any character to"
" replace above-Latin1 chars in locale %s with",
PL_collation_name));
goto bad;
}
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: highest 1-byte collating character"
+ "mem_collxfrm_: highest 1-byte collating character"
" in locale %s is 0x%02X\n",
PL_collation_name,
PL_strxfrm_max_cp));
@@ -4181,27 +6205,66 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
Newx(xbuf, xAlloc, char);
if (UNLIKELY(! xbuf)) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc));
+ "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
goto bad;
}
/* Store the collation id */
*(U32*)xbuf = PL_collation_ix;
+# if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L
+# ifdef USE_LOCALE_CTYPE
+
+ constructed_locale = newlocale(LC_CTYPE_MASK, PL_collation_name,
+ duplocale(use_curlocale_scratch()));
+# else
+
+ constructed_locale = duplocale(use_curlocale_scratch());
+
+# endif
+# define my_strxfrm(dest, src, n) strxfrm_l(dest, src, n, \
+ constructed_locale)
+# define CLEANUP_STRXFRM \
+ STMT_START { \
+ if (constructed_locale != (locale_t) 0) \
+ freelocale(constructed_locale); \
+ } STMT_END
+# else
+# define my_strxfrm(dest, src, n) strxfrm(dest, src, n)
+# ifdef USE_LOCALE_CTYPE
+
+ orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
+
+# define CLEANUP_STRXFRM \
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale)
+# else
+# define CLEANUP_STRXFRM NOOP
+# endif
+# endif
+
/* Then the transformation of the input. We loop until successful, or we
* give up */
for (;;) {
- *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
+ errno = 0;
+ *xlen = my_strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN);
/* If the transformed string occupies less space than we told strxfrm()
- * was available, it means it successfully transformed the whole
- * string. */
+ * was available, it means it transformed the whole string. */
if (*xlen < xAlloc - COLLXFRM_HDR_LEN) {
- /* Some systems include a trailing NUL in the returned length.
- * Ignore it, using a loop in case multiple trailing NULs are
- * returned. */
+ /* But there still could have been a problem */
+ if (errno != 0) {
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "strxfrm failed for LC_COLLATE=%s; errno=%d, input=%s\n",
+ PL_collation_name, errno,
+ _byte_dump_string((U8 *) s, len, 0)));
+ goto bad;
+ }
+
+ /* Here, the transformation was successful. Some systems include a
+ * trailing NUL in the returned length. Ignore it, using a loop in
+ * case multiple trailing NULs are returned. */
while ( (*xlen) > 0
&& *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0')
{
@@ -4223,9 +6286,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
: PL_collxfrm_mult;
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s: %d: initial size of %zu bytes for a length "
+ "initial size of %zu bytes for a length "
"%zu string was insufficient, %zu needed\n",
- __FILE__, __LINE__,
computed_guess, length_in_chars, needed));
/* If slope increased, use it, but discard this result for
@@ -4249,9 +6311,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
}
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s: %d: slope is now %zu; was %zu, base "
+ "slope is now %zu; was %zu, base "
"is now %zu; was %zu\n",
- __FILE__, __LINE__,
PL_collxfrm_mult, old_m,
PL_collxfrm_base, old_b));
}
@@ -4260,9 +6321,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
- computed_guess
+ PL_collxfrm_base;
DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s: %d: base is now %zu; was %zu\n",
- __FILE__, __LINE__,
- new_b, PL_collxfrm_base));
+ "base is now %zu; was %zu\n", new_b, PL_collxfrm_base));
PL_collxfrm_base = new_b;
}
}
@@ -4272,7 +6331,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
if (UNLIKELY(*xlen >= PERL_INT_MAX)) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n",
+ "mem_collxfrm_: Needed %zu bytes, max permissible is %u\n",
*xlen, PERL_INT_MAX));
goto bad;
}
@@ -4298,45 +6357,29 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
xAlloc += (xAlloc / 4) + 1;
PL_strxfrm_is_behaved = FALSE;
-# ifdef DEBUGGING
-
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm required more space than previously calculated"
- " for locale %s, trying again with new guess=%zu+%zu\n",
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "mem_collxfrm_ required more space than previously"
+ " calculated for locale %s, trying again with new"
+ " guess=%zu+%zu\n",
PL_collation_name, COLLXFRM_HDR_LEN,
- xAlloc - COLLXFRM_HDR_LEN);
- }
-
-# endif
-
+ xAlloc - COLLXFRM_HDR_LEN));
}
Renew(xbuf, xAlloc, char);
if (UNLIKELY(! xbuf)) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
- "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc));
+ "mem_collxfrm_: Couldn't realloc %zu bytes\n", xAlloc));
goto bad;
}
first_time = FALSE;
}
+ CLEANUP_STRXFRM;
-# ifdef DEBUGGING
-
- if (DEBUG_Lv_TEST || debug_initialization) {
+ DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
- print_collxfrm_input_and_return(s, s + len, xlen, utf8);
- PerlIO_printf(Perl_debug_log, "Its xfrm is:");
- PerlIO_printf(Perl_debug_log, "%s\n",
- _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN,
- *xlen, 1));
- }
-
-# endif
-
- /* Free up unneeded space; retain ehough for trailing NUL */
+ /* Free up unneeded space; retain enough for trailing NUL */
Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
if (s != input_string) {
@@ -4347,13 +6390,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
bad:
-# ifdef DEBUGGING
-
- if (DEBUG_Lv_TEST || debug_initialization) {
- print_collxfrm_input_and_return(s, s + len, NULL, utf8);
- }
-
-# endif
+ CLEANUP_STRXFRM;
+ DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
Safefree(xbuf);
if (s != input_string) {
@@ -4368,45 +6406,96 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
STATIC void
S_print_collxfrm_input_and_return(pTHX_
- const char * const s,
- const char * const e,
- const STRLEN * const xlen,
+ const char * s,
+ const char * e,
+ const char * xbuf,
+ const STRLEN xlen,
const bool is_utf8)
{
PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN;
- PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ",
- (UV)PL_collation_ix);
- if (xlen) {
- PerlIO_printf(Perl_debug_log, "%zu", *xlen);
- }
- else {
- PerlIO_printf(Perl_debug_log, "NULL");
+ PerlIO_printf(Perl_debug_log,
+ "mem_collxfrm_[ix %" UVuf "] for locale '%s':\n"
+ " input=%s\n return=%s\n return len=%zu\n",
+ (UV) PL_collation_ix, PL_collation_name,
+ get_displayable_string(s, e, is_utf8),
+ ((xbuf == NULL)
+ ? "(null)"
+ : _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, xlen, 0)),
+ xlen);
+}
+
+# endif /* DEBUGGING */
+
+SV *
+Perl_strxfrm(pTHX_ SV * src)
+{
+ PERL_ARGS_ASSERT_STRXFRM;
+
+ /* For use by POSIX::strxfrm(). If they differ, toggle LC_CTYPE to
+ * LC_COLLATE to avoid potential mojibake.
+ *
+ * If we can't calculate a collation, 'src' is instead returned, so that
+ * future comparisons will be by code point order */
+
+# ifdef USE_LOCALE_CTYPE
+
+ const char * orig_ctype = toggle_locale_c(LC_CTYPE,
+ querylocale_c(LC_COLLATE));
+# endif
+
+ SV * dst = src;
+ STRLEN dstlen;
+ STRLEN srclen;
+ const char *p = SvPV_const(src,srclen);
+ const U32 utf8_flag = SvUTF8(src);
+ char *d = mem_collxfrm_(p, srclen, &dstlen, cBOOL(utf8_flag));
+
+ assert(utf8_flag == 0 || utf8_flag == SVf_UTF8);
+
+ if (d != NULL) {
+ assert(dstlen > 0);
+ dst =newSVpvn_flags(d + COLLXFRM_HDR_LEN,
+ dstlen, SVs_TEMP|utf8_flag);
+ Safefree(d);
}
- PerlIO_printf(Perl_debug_log, " for locale '%s', string='",
- PL_collation_name);
- print_bytes_for_locale(s, e, is_utf8);
- PerlIO_printf(Perl_debug_log, "'\n");
+# ifdef USE_LOCALE_CTYPE
+
+ restore_toggled_locale_c(LC_CTYPE, orig_ctype);
+
+# endif
+
+ return dst;
}
-# endif /* DEBUGGING */
#endif /* USE_LOCALE_COLLATE */
-#ifdef USE_LOCALE
-# ifdef DEBUGGING
+#if defined(DEBUGGING) || defined(USE_POSIX_2008_LOCALE)
-STATIC void
-S_print_bytes_for_locale(pTHX_
- const char * const s,
- const char * const e,
- const bool is_utf8)
+STATIC const char *
+S_get_displayable_string(pTHX_
+ const char * const s,
+ const char * const e,
+ const bool is_utf8)
{
+ PERL_ARGS_ASSERT_GET_DISPLAYABLE_STRING;
+
const char * t = s;
bool prev_was_printable = TRUE;
bool first_time = TRUE;
+ char * ret;
+
+ if (e <= s) {
+ return "";
+ }
- PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE;
+ /* Worst case scenario: All are non-printable so have a blank between each.
+ * If UTF-8, all are the largest possible code point; otherwise all are a
+ * single byte. '(2 + 1)' is from each byte takes 2 characters to
+ * display, and a blank (or NUL for the final one) after it */
+ Newxz(ret, (e - s) * (2 + 1) * ((is_utf8) ? UVSIZE : 1), char);
+ SAVEFREEPV(ret);
while (t < e) {
UV cp = (is_utf8)
@@ -4414,932 +6503,453 @@ S_print_bytes_for_locale(pTHX_
: * (U8 *) t;
if (isPRINT(cp)) {
if (! prev_was_printable) {
- PerlIO_printf(Perl_debug_log, " ");
+ my_strlcat(ret, " ", sizeof(ret));
}
- PerlIO_printf(Perl_debug_log, "%c", (U8) cp);
+
+ /* Escape these to avoid any ambiguity */
+ if (cp == ' ' || cp == '\\') {
+ my_strlcat(ret, "\\", sizeof(ret));
+ }
+ my_strlcat(ret, Perl_form(aTHX_ "%c", (U8) cp), sizeof(ret));
prev_was_printable = TRUE;
}
else {
if (! first_time) {
- PerlIO_printf(Perl_debug_log, " ");
+ my_strlcat(ret, " ", sizeof(ret));
}
- PerlIO_printf(Perl_debug_log, "%02" UVXf, cp);
+ my_strlcat(ret, Perl_form(aTHX_ "%02" UVXf, cp), sizeof(ret));
prev_was_printable = FALSE;
}
t += (is_utf8) ? UTF8SKIP(t) : 1;
first_time = FALSE;
}
+
+ return ret;
}
-# endif /* #ifdef DEBUGGING */
+#endif
+#ifdef USE_LOCALE
STATIC const char *
-S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale)
+S_toggle_locale_i(pTHX_ const unsigned cat_index,
+ const char * new_locale,
+ const line_t caller_line)
{
- /* Changes the locale for LC_'switch_category" to that of
- * LC_'template_category', if they aren't already the same. If not NULL,
- * 'template_locale' is the locale that 'template_category' is in.
+ /* Changes the locale for the category specified by 'index' to 'new_locale,
+ * if they aren't already the same.
*
- * Returns a copy of the name of the original locale for 'switch_category'
+ * Returns a copy of the name of the original locale for 'cat_index'
* so can be switched back to with the companion function
- * restore_switched_locale(), (NULL if no restoral is necessary.) */
+ * restore_toggled_locale_i(), (NULL if no restoral is necessary.) */
- char * restore_to_locale = NULL;
+ const char * locale_to_restore_to = NULL;
- if (switch_category == template_category) { /* No changes needed */
- return NULL;
- }
+ PERL_ARGS_ASSERT_TOGGLE_LOCALE_I;
+ assert(cat_index <= NOMINAL_LC_ALL_INDEX);
/* Find the original locale of the category we may need to change, so that
* it can be restored to later */
- restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category,
- NULL)));
- if (! restore_to_locale) {
- Perl_croak(aTHX_
- "panic: %s: %d: Could not find current %s locale, errno=%d\n",
- __FILE__, __LINE__, category_name(switch_category), errno);
- }
-
- /* If the locale of the template category wasn't passed in, find it now */
- if (template_locale == NULL) {
- template_locale = do_setlocale_r(template_category, NULL);
- if (! template_locale) {
- Perl_croak(aTHX_
- "panic: %s: %d: Could not find current %s locale, errno=%d\n",
- __FILE__, __LINE__, category_name(template_category), errno);
- }
- }
-
- /* It the locales are the same, there's nothing to do */
- if (strEQ(restore_to_locale, template_locale)) {
- Safefree(restore_to_locale);
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n",
- category_name(switch_category), template_locale));
+ locale_to_restore_to = querylocale_i(cat_index);
- return NULL;
- }
-
- /* Finally, change the locale to the template one */
- if (! do_setlocale_r(switch_category, template_locale)) {
- Perl_croak(aTHX_
- "panic: %s: %d: Could not change %s locale to %s, errno=%d\n",
- __FILE__, __LINE__, category_name(switch_category),
- template_locale, errno);
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): toggle_locale_i: index=%d(%s), wanted=%s,"
+ " actual=%s\n",
+ caller_line, cat_index, category_names[cat_index],
+ new_locale, locale_to_restore_to));
+
+ if (! locale_to_restore_to) {
+ locale_panic_(Perl_form(aTHX_
+ "Could not find current %s locale, errno=%d",
+ category_names[cat_index], errno));
}
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n",
- category_name(switch_category), template_locale));
-
- return restore_to_locale;
-}
-
-STATIC void
-S_restore_switched_locale(pTHX_ const int category, const char * const original_locale)
-{
- /* Restores the locale for LC_'category' to 'original_locale' (which is a
- * copy that will be freed by this function), or do nothing if the latter
- * parameter is NULL */
-
- if (original_locale == NULL) {
- return;
- }
+ /* If the locales are the same, there's nothing to do */
+ if (strEQ(locale_to_restore_to, new_locale)) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): %s locale unchanged as %s\n",
+ caller_line, category_names[cat_index],
+ new_locale));
- if (! do_setlocale_r(category, original_locale)) {
- Perl_croak(aTHX_
- "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n",
- __FILE__, __LINE__,
- category_name(category), original_locale, errno);
+ return NULL;
}
- Safefree(original_locale);
-}
-
-/* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */
-#define CUR_LC_BUFFER_SIZE 64
-
-bool
-Perl__is_cur_LC_category_utf8(pTHX_ int category)
-{
- /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE
- * otherwise. 'category' may not be LC_ALL. If the platform doesn't have
- * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence
- * could give the wrong result. The result will very likely be correct for
- * languages that have commonly used non-ASCII characters, but for notably
- * English, it comes down to if the locale's name ends in something like
- * "UTF-8". It errs on the side of not being a UTF-8 locale.
- *
- * If the platform is early C89, not containing mbtowc(), or we are
- * compiled to not pay attention to LC_CTYPE, this employs heuristics.
- * These work very well for non-Latin locales or those whose currency
- * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at
- * least MB_CUR_MAX, English locales with an ASCII currency symbol depend
- * on the name containing UTF-8 or not. */
-
- /* Name of current locale corresponding to the input category */
- const char *save_input_locale = NULL;
-
- bool is_utf8 = FALSE; /* The return value */
-
- /* The variables below are for the cache of previous lookups using this
- * function. The cache is a C string, described at the definition for
- * 'C_and_POSIX_utf8ness'.
- *
- * The first part of the cache is fixed, for the C and POSIX locales. The
- * varying part starts just after them. */
- char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness);
-
- Size_t utf8ness_cache_size; /* Size of the varying portion */
- Size_t input_name_len; /* Length in bytes of save_input_locale */
- Size_t input_name_len_with_overhead; /* plus extra chars used to store
- the name in the cache */
- char * delimited; /* The name plus the delimiters used to store
- it in the cache */
- char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */
- char * name_pos; /* position of 'delimited' in the cache, or 0
- if not there */
+ /* Finally, change the locale to the new one */
+ void_setlocale_i(cat_index, new_locale);
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): %s locale switched to %s\n",
+ caller_line, category_names[cat_index], new_locale));
-# ifdef LC_ALL
-
- assert(category != LC_ALL);
+ return locale_to_restore_to;
+# ifndef DEBUGGING
+ PERL_UNUSED_ARG(caller_line);
# endif
- /* Get the desired category's locale */
- save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL)));
- if (! save_input_locale) {
- Perl_croak(aTHX_
- "panic: %s: %d: Could not find current %s locale, errno=%d\n",
- __FILE__, __LINE__, category_name(category), errno);
- }
-
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Current locale for %s is %s\n",
- category_name(category), save_input_locale));
+}
- input_name_len = strlen(save_input_locale);
+STATIC void
+S_restore_toggled_locale_i(pTHX_ const unsigned int cat_index,
+ const char * restore_locale,
+ const line_t caller_line)
+{
+ /* Restores the locale for LC_category corresponding to cat_indes to
+ * 'restore_locale' (which is a copy that will be freed by this function),
+ * or do nothing if the latter parameter is NULL */
- /* In our cache, each name is accompanied by two delimiters and a single
- * utf8ness digit */
- input_name_len_with_overhead = input_name_len + 3;
+ PERL_ARGS_ASSERT_RESTORE_TOGGLED_LOCALE_I;
+ assert(cat_index <= NOMINAL_LC_ALL_INDEX);
- if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) {
- /* we can use the buffer, avoid a malloc */
- delimited = buffer;
- } else { /* need a malloc */
- /* Allocate and populate space for a copy of the name surrounded by the
- * delimiters */
- Newx(delimited, input_name_len_with_overhead, char);
+ if (restore_locale == NULL) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): No need to restore %s\n",
+ caller_line, category_names[cat_index]));
+ return;
}
- delimited[0] = UTF8NESS_SEP[0];
- Copy(save_input_locale, delimited + 1, input_name_len, char);
- delimited[input_name_len+1] = UTF8NESS_PREFIX[0];
- delimited[input_name_len+2] = '\0';
-
- /* And see if that is in the cache */
- name_pos = instr(PL_locale_utf8ness, delimited);
- if (name_pos) {
- is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0';
-
-# ifdef DEBUGGING
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "(%" LINE_Tf "): %s restoring locale to %s\n",
+ caller_line, category_names[cat_index],
+ restore_locale));
- if (DEBUG_Lv_TEST || debug_initialization) {
- PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n",
- save_input_locale, is_utf8);
- }
+ void_setlocale_i(cat_index, restore_locale);
+# ifndef DEBUGGING
+ PERL_UNUSED_ARG(caller_line);
# endif
- /* And, if not already in that position, move it to the beginning of
- * the non-constant portion of the list, since it is the most recently
- * used. (We don't have to worry about overflow, since just moving
- * existing names around) */
- if (name_pos > utf8ness_cache) {
- Move(utf8ness_cache,
- utf8ness_cache + input_name_len_with_overhead,
- name_pos - utf8ness_cache, char);
- Copy(delimited,
- utf8ness_cache,
- input_name_len_with_overhead - 1, char);
- utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
- }
-
- /* free only when not using the buffer */
- if ( delimited != buffer ) Safefree(delimited);
- Safefree(save_input_locale);
- return is_utf8;
- }
-
- /* Here we don't have stored the utf8ness for the input locale. We have to
- * calculate it */
-
-# if defined(USE_LOCALE_CTYPE) \
- && ( defined(HAS_NL_LANGINFO) \
- || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)))
-
- {
- const char *original_ctype_locale
- = switch_category_locale_to_template(LC_CTYPE,
- category,
- save_input_locale);
-
- /* Here the current LC_CTYPE is set to the locale of the category whose
- * information is desired. This means that nl_langinfo() and mbtowc()
- * should give the correct results */
-
-# ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding
- calling the functions if we have this */
-
- /* Standard UTF-8 needs at least 4 bytes to represent the maximum
- * Unicode code point. */
-
- DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n",
- __FILE__, __LINE__, (int) MB_CUR_MAX));
- if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) {
- is_utf8 = FALSE;
- restore_switched_locale(LC_CTYPE, original_ctype_locale);
- goto finish_and_return;
- }
-
-# endif
-# if defined(HAS_NL_LANGINFO)
-
- { /* The task is easiest if the platform has this POSIX 2001 function.
- Except on some platforms it can wrongly return "", so have to have
- a fallback. And it can return that it's UTF-8, even if there are
- variances from that. For example, Turkish locales may use the
- alternate dotted I rules, and sometimes it appears to be a
- defective locale definition. XXX We should probably check for
- these in the Latin1 range and warn (but on glibc, requires
- iswalnum() etc. due to their not handling 80-FF correctly */
- const char *codeset = my_nl_langinfo(CODESET, FALSE);
- /* FALSE => already in dest locale */
-
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "\tnllanginfo returned CODESET '%s'\n", codeset));
-
- if (codeset && strNE(codeset, "")) {
-
- /* If the implementation of foldEQ() somehow were
- * to change to not go byte-by-byte, this could
- * read past end of string, as only one length is
- * checked. But currently, a premature NUL will
- * compare false, and it will stop there */
- is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8"))
- || foldEQ(codeset, STR_WITH_LEN("UTF8")));
-
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n",
- codeset, is_utf8));
- restore_switched_locale(LC_CTYPE, original_ctype_locale);
- goto finish_and_return;
- }
- }
-
-# endif
-# if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC)
- /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a
- * late adder to C89, so very likely to have it. However, testing has
- * shown that, like nl_langinfo() above, there are locales that are not
- * strictly UTF-8 that this will return that they are */
-
- {
- wchar_t wc;
- int len;
- dSAVEDERRNO;
-
-# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
-
- mbstate_t ps;
-
-# endif
-
- /* mbrtowc() and mbtowc() convert a byte string to a wide
- * character. Feed a byte string to one of them and check that the
- * result is the expected Unicode code point */
-
-# if defined(HAS_MBRTOWC) && defined(USE_ITHREADS)
- /* Prefer this function if available, as it's reentrant */
-
- memzero(&ps, sizeof(ps));;
- PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift
- state */
- SETERRNO(0, 0);
- len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps);
- SAVE_ERRNO;
-
-# else
-
- MBTOWC_LOCK;
- PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */
- SETERRNO(0, 0);
- len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8));
- SAVE_ERRNO;
- MBTOWC_UNLOCK;
+}
-# endif
+# ifdef USE_LOCALE_CTYPE
- RESTORE_ERRNO;
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n",
- len, (unsigned int) wc, GET_ERRNO));
+STATIC bool
+S_is_codeset_name_UTF8(const char * name)
+{
+ /* Return a boolean as to if the passed-in name indicates it is a UTF-8
+ * code set. Several variants are possible */
+ const Size_t len = strlen(name);
- is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8)
- && wc == (wchar_t) UNICODE_REPLACEMENT);
- }
+ PERL_ARGS_ASSERT_IS_CODESET_NAME_UTF8;
-# endif
+# ifdef WIN32
- restore_switched_locale(LC_CTYPE, original_ctype_locale);
- goto finish_and_return;
+ /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
+ if (memENDs(name, len, "65001")) {
+ return TRUE;
}
-# else
-
- /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next
- * try looking at the currency symbol to see if it disambiguates
- * things. Often that will be in the native script, and if the symbol
- * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII
- * UTF-8, we infer that the locale is too, as the odds of a non-UTF8
- * string being valid UTF-8 are quite small */
-
-# ifdef USE_LOCALE_MONETARY
-
- /* If have LC_MONETARY, we can look at the currency symbol. Often that
- * will be in the native script. We do this one first because there is
- * just one string to examine, so potentially avoids work */
-
- {
- const char *original_monetary_locale
- = switch_category_locale_to_template(LC_MONETARY,
- category,
- save_input_locale);
- bool only_ascii = FALSE;
- const U8 * currency_string
- = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE);
- /* 2nd param not relevant for this item */
- const U8 * first_variant;
-
- assert( *currency_string == '-'
- || *currency_string == '+'
- || *currency_string == '.');
-
- currency_string++;
-
- if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant))
- {
- DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
- only_ascii = TRUE;
- }
- else {
- is_utf8 = is_strict_utf8_string(first_variant, 0);
- }
-
- restore_switched_locale(LC_MONETARY, original_monetary_locale);
-
- if (! only_ascii) {
-
- /* It isn't a UTF-8 locale if the symbol is not legal UTF-8;
- * otherwise assume the locale is UTF-8 if and only if the symbol
- * is non-ascii UTF-8. */
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n",
- save_input_locale, is_utf8));
- goto finish_and_return;
- }
- }
-
-# endif /* USE_LOCALE_MONETARY */
-# if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME)
-
- /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try
- * the names of the months and weekdays, timezone, and am/pm indicator */
- {
- const char *original_time_locale
- = switch_category_locale_to_template(LC_TIME,
- category,
- save_input_locale);
- int hour = 10;
- bool is_dst = FALSE;
- int dom = 1;
- int month = 0;
- int i;
- char * formatted_time;
-
- /* Here the current LC_TIME is set to the locale of the category
- * whose information is desired. Look at all the days of the week and
- * month names, and the timezone and am/pm indicator for UTF-8 variant
- * characters. The first such a one found will tell us if the locale
- * is UTF-8 or not */
-
- for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */
- formatted_time = my_strftime("%A %B %Z %p",
- 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst);
- if ( ! formatted_time
- || is_utf8_invariant_string((U8 *) formatted_time, 0))
- {
-
- /* Here, we didn't find a non-ASCII. Try the next time through
- * with the complemented dst and am/pm, and try with the next
- * weekday. After we have gotten all weekdays, try the next
- * month */
- is_dst = ! is_dst;
- hour = (hour + 12) % 24;
- dom++;
- if (i > 6) {
- month++;
- }
- continue;
- }
-
- /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8;
- * false otherwise. But first, restore LC_TIME to its original
- * locale if we changed it */
- restore_switched_locale(LC_TIME, original_time_locale);
-
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n",
- save_input_locale,
- is_utf8_string((U8 *) formatted_time, 0)));
- is_utf8 = is_utf8_string((U8 *) formatted_time, 0);
- goto finish_and_return;
- }
-
- /* Falling off the end of the loop indicates all the names were just
- * ASCII. Go on to the next test. If we changed it, restore LC_TIME
- * to its original locale */
- restore_switched_locale(LC_TIME, original_time_locale);
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
- }
-
# endif
+ /* 'UTF8' or 'UTF-8' */
+ return ( inRANGE(len, 4, 5)
+ && name[len-1] == '8'
+ && ( memBEGINs(name, len, "UTF")
+ || memBEGINs(name, len, "utf"))
+ && (len == 4 || name[3] == '-'));
+}
-# if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST)
-
- /* This code is ifdefd out because it was found to not be necessary in testing
- * on our dromedary test machine, which has over 700 locales. There, this
- * added no value to looking at the currency symbol and the time strings. I
- * left it in so as to avoid rewriting it if real-world experience indicates
- * that dromedary is an outlier. Essentially, instead of returning abpve if we
- * haven't found illegal utf8, we continue on and examine all the strerror()
- * messages on the platform for utf8ness. If all are ASCII, we still don't
- * know the answer; but otherwise we have a pretty good indication of the
- * utf8ness. The reason this doesn't help much is that the messages may not
- * have been translated into the locale. The currency symbol and time strings
- * are much more likely to have been translated. */
- {
- int e;
- bool non_ascii = FALSE;
- const char *original_messages_locale
- = switch_category_locale_to_template(LC_MESSAGES,
- category,
- save_input_locale);
- const char * errmsg = NULL;
-
- /* Here the current LC_MESSAGES is set to the locale of the category
- * whose information is desired. Look through all the messages. We
- * can't use Strerror() here because it may expand to code that
- * segfaults in miniperl */
-
- for (e = 0; e <= sys_nerr; e++) {
- errno = 0;
- errmsg = sys_errlist[e];
- if (errno || !errmsg) {
- break;
- }
- errmsg = savepv(errmsg);
- if (! is_utf8_invariant_string((U8 *) errmsg, 0)) {
- non_ascii = TRUE;
- is_utf8 = is_utf8_string((U8 *) errmsg, 0);
- break;
- }
- }
- Safefree(errmsg);
+# endif
+#endif /* USE_LOCALE */
- restore_switched_locale(LC_MESSAGES, original_messages_locale);
+bool
+Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
+{
+ /* Internal function which returns if we are in the scope of a pragma that
+ * enables the locale category 'category'. 'compiling' should indicate if
+ * this is during the compilation phase (TRUE) or not (FALSE). */
- if (non_ascii) {
+ const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
- /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid,
- * any non-ascii means it is one; otherwise we assume it isn't */
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n",
- save_input_locale,
- is_utf8));
- goto finish_and_return;
- }
+ SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
+ if (! these_categories || these_categories == &PL_sv_placeholder) {
+ return FALSE;
+ }
- DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale));
- }
+ /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
+ * a valid unsigned */
+ assert(category >= -1);
+ return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
+}
-# endif
-# ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a
- UTF-8 locale */
+/* my_strerror() returns a mortalized copy of the text of the error message
+ * associated with 'errnum'.
+ *
+ * If not called from within the scope of 'use locale', it uses the text from
+ * the C locale. If Perl is compiled to not pay attention to LC_CTYPE nor
+ * LC_MESSAGES, it uses whatever strerror() returns. Otherwise the text is
+ * derived from the locale, LC_MESSAGES if we have that; LC_CTYPE if not.
+ *
+ * It returns in *utf8ness the result's UTF-8ness
+ *
+ * The function just calls strerror(), but temporarily switches locales, if
+ * needed. Many platforms require LC_CTYPE and LC_MESSAGES to be in the same
+ * CODESET in order for the return from strerror() to not contain '?' symbols,
+ * or worse, mojibaked. It's cheaper to just use the stricter criteria of
+ * being in the same locale. So the code below uses a common locale for both
+ * categories. Again, that is C if not within 'use locale' scope; or the
+ * LC_MESSAGES locale if in scope and we have that category; and LC_CTYPE if we
+ * don't have LC_MESSAGES; and whatever strerror returns if we don't have
+ * either category.
+ *
+ * There are two sets of implementations. The first below is if we have
+ * strerror_l(). This is the simpler. We just use the already-built C locale
+ * object if not in locale scope, or build up a custom one otherwise.
+ *
+ * When strerror_l() is not available, we may have to swap locales temporarily
+ * to bring the two categories into sync with each other, and possibly to the C
+ * locale.
+ *
+ * Because the prepropessing directives to conditionally compile this function
+ * would greatly obscure the logic of the various implementations, the whole
+ * function is repeated for each configuration, with some common macros. */
+
+/* Used to shorten the definitions of the following implementations of
+ * my_strerror() */
+#define DEBUG_STRERROR_ENTER(errnum, in_locale) \
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
+ "my_strerror called with errnum %d;" \
+ " Within locale scope=%d\n", \
+ errnum, in_locale))
+#define DEBUG_STRERROR_RETURN(errstr, utf8ness) \
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, \
+ "Strerror returned; saving a copy: '%s';" \
+ " utf8ness=%d\n", \
+ get_displayable_string(errstr, \
+ errstr + strlen(errstr), \
+ *utf8ness), \
+ (int) *utf8ness))
+
+/* On platforms that have precisely one of these categories (Windows
+ * qualifies), these yield the correct one */
+#if defined(USE_LOCALE_CTYPE)
+# define WHICH_LC_INDEX LC_CTYPE_INDEX_
+#elif defined(USE_LOCALE_MESSAGES)
+# define WHICH_LC_INDEX LC_MESSAGES_INDEX_
+#endif
- /* As a last resort, look at the locale name to see if it matches
- * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the
- * return of setlocale(), is actually defined to be opaque, so we can't
- * really rely on the absence of various substrings in the name to indicate
- * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to
- * be a UTF-8 locale. Similarly for the other common names */
+/*==========================================================================*/
+/* First set of implementations, when have strerror_l() */
- {
- const Size_t final_pos = strlen(save_input_locale) - 1;
+#if defined(USE_POSIX_2008_LOCALE) && defined(HAS_STRERROR_L)
- if (final_pos >= 3) {
- const char *name = save_input_locale;
+# if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
- /* Find next 'U' or 'u' and look from there */
- while ((name += strcspn(name, "Uu") + 1)
- <= save_input_locale + final_pos - 2)
- {
- if ( isALPHA_FOLD_NE(*name, 't')
- || isALPHA_FOLD_NE(*(name + 1), 'f'))
- {
- continue;
- }
- name += 2;
- if (*(name) == '-') {
- if ((name > save_input_locale + final_pos - 1)) {
- break;
- }
- name++;
- }
- if (*(name) == '8') {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Locale %s ends with UTF-8 in name\n",
- save_input_locale));
- is_utf8 = TRUE;
- goto finish_and_return;
- }
- }
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Locale %s doesn't end with UTF-8 in name\n",
- save_input_locale));
- }
+/* Here, neither category is defined: use the C locale */
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
+{
+ PERL_ARGS_ASSERT_MY_STRERROR;
-# ifdef WIN32
+ DEBUG_STRERROR_ENTER(errnum, 0);
- /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */
- if (memENDs(save_input_locale, final_pos, "65001")) {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Locale %s ends with 65001 in name, is UTF-8 locale\n",
- save_input_locale));
- is_utf8 = TRUE;
- goto finish_and_return;
- }
+ const char *errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
+ *utf8ness = UTF8NESS_IMMATERIAL;
-# endif
- }
-# endif
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
- /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But
- * since we are about to return FALSE anyway, there is no point in doing
- * this extra work */
+ SAVEFREEPV(errstr);
+ return errstr;
+}
-# if 0
- if (instr(save_input_locale, "8859")) {
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Locale %s has 8859 in name, not UTF-8 locale\n",
- save_input_locale));
- is_utf8 = FALSE;
- goto finish_and_return;
- }
-# endif
+# elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "Assuming locale %s is not a UTF-8 locale\n",
- save_input_locale));
- is_utf8 = FALSE;
+/*--------------------------------------------------------------------------*/
-# endif /* the code that is compiled when no modern LC_CTYPE */
+/* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
+ * are not within 'use locale' scope of the only one defined, we use the C
+ * locale; otherwise use the current locale object */
- finish_and_return:
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
+{
+ PERL_ARGS_ASSERT_MY_STRERROR;
- /* Cache this result so we don't have to go through all this next time. */
- utf8ness_cache_size = sizeof(PL_locale_utf8ness)
- - (utf8ness_cache - PL_locale_utf8ness);
+ DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
- /* But we can't save it if it is too large for the total space available */
- if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) {
- Size_t utf8ness_cache_len = strlen(utf8ness_cache);
+ /* Use C if not within locale scope; Otherwise, use current locale */
+ const locale_t which_obj = (IN_LC(categories[WHICH_LC_INDEX]))
+ ? PL_C_locale_obj
+ : use_curlocale_scratch();
- /* Here it can fit, but we may need to clear out the oldest cached
- * result(s) to do so. Check */
- if (utf8ness_cache_len + input_name_len_with_overhead
- >= utf8ness_cache_size)
- {
- /* Here we have to clear something out to make room for this.
- * Start looking at the rightmost place where it could fit and find
- * the beginning of the entry that extends past that. */
- char * cutoff = (char *) my_memrchr(utf8ness_cache,
- UTF8NESS_SEP[0],
- utf8ness_cache_size
- - input_name_len_with_overhead);
-
- assert(cutoff);
- assert(cutoff >= utf8ness_cache);
-
- /* This and all subsequent entries must be removed */
- *cutoff = '\0';
- utf8ness_cache_len = strlen(utf8ness_cache);
- }
+ const char *errstr = savepv(strerror_l(errnum, which_obj));
+ *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
+ NULL, WHICH_LC_INDEX);
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
- /* Make space for the new entry */
- Move(utf8ness_cache,
- utf8ness_cache + input_name_len_with_overhead,
- utf8ness_cache_len + 1 /* Incl. trailing NUL */, char);
-
- /* And insert it */
- Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char);
- utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0';
-
- if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') {
- Perl_croak(aTHX_
- "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu,"
- " inserted_name=%s, its_len=%zu\n",
- __FILE__, __LINE__,
- PL_locale_utf8ness, strlen(PL_locale_utf8ness),
- delimited, input_name_len_with_overhead);
- }
- }
+ SAVEFREEPV(errstr);
+ return errstr;
+}
-# ifdef DEBUGGING
+/*--------------------------------------------------------------------------*/
+# else /* Are using both categories. Place them in the same CODESET,
+ * either C or the LC_MESSAGES locale */
- if (DEBUG_Lv_TEST) {
- const char * s = PL_locale_utf8ness;
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
+{
+ PERL_ARGS_ASSERT_MY_STRERROR;
- /* Audit the structure */
- while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) {
- const char *e;
+ DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
- if (*s != UTF8NESS_SEP[0]) {
- Perl_croak(aTHX_
- "panic: %s: %d: Corrupt utf8ness_cache: missing"
- " separator %.*s<-- HERE %s\n",
- __FILE__, __LINE__,
- (int) (s - PL_locale_utf8ness), PL_locale_utf8ness,
- s);
- }
- s++;
- e = strchr(s, UTF8NESS_PREFIX[0]);
- if (! e) {
- e = PL_locale_utf8ness + strlen(PL_locale_utf8ness);
- Perl_croak(aTHX_
- "panic: %s: %d: Corrupt utf8ness_cache: missing"
- " separator %.*s<-- HERE %s\n",
- __FILE__, __LINE__,
- (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
- e);
- }
- e++;
- if (*e != '0' && *e != '1') {
- Perl_croak(aTHX_
- "panic: %s: %d: Corrupt utf8ness_cache: utf8ness"
- " must be [01] %.*s<-- HERE %s\n",
- __FILE__, __LINE__,
- (int) (e + 1 - PL_locale_utf8ness),
- PL_locale_utf8ness, e + 1);
- }
- if (ninstr(PL_locale_utf8ness, s, s-1, e)) {
- Perl_croak(aTHX_
- "panic: %s: %d: Corrupt utf8ness_cache: entry"
- " has duplicate %.*s<-- HERE %s\n",
- __FILE__, __LINE__,
- (int) (e - PL_locale_utf8ness), PL_locale_utf8ness,
- e);
- }
- s = e + 1;
- }
+ const char *errstr;
+ if (! IN_LC(LC_MESSAGES)) { /* Use C if not within locale scope */
+ errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
+ *utf8ness = UTF8NESS_IMMATERIAL;
}
-
- if (DEBUG_Lv_TEST || debug_initialization) {
-
- PerlIO_printf(Perl_debug_log,
- "PL_locale_utf8ness is now %s; returning %d\n",
- PL_locale_utf8ness, is_utf8);
+ else { /* Otherwise, use the LC_MESSAGES locale, making sure LC_CTYPE
+ matches */
+ locale_t cur = duplocale(use_curlocale_scratch());
+
+ cur = newlocale(LC_CTYPE_MASK, querylocale_c(LC_MESSAGES), cur);
+ errstr = savepv(strerror_l(errnum, cur));
+ *utf8ness = get_locale_string_utf8ness_i(errstr,
+ LOCALE_UTF8NESS_UNKNOWN,
+ NULL, LC_MESSAGES_INDEX_);
+ freelocale(cur);
}
-# endif
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
- /* free only when not using the buffer */
- if ( delimited != buffer ) Safefree(delimited);
- Safefree(save_input_locale);
- return is_utf8;
+ SAVEFREEPV(errstr);
+ return errstr;
}
+# endif /* Above is using strerror_l */
+/*==========================================================================*/
+#else /* Below is not using strerror_l */
+# if ! defined(USE_LOCALE_CTYPE) && ! defined(USE_LOCALE_MESSAGES)
-#endif
+/* If not using using either of the categories, return plain, unadorned
+ * strerror */
-bool
-Perl__is_in_locale_category(pTHX_ const bool compiling, const int category)
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
{
- /* Internal function which returns if we are in the scope of a pragma that
- * enables the locale category 'category'. 'compiling' should indicate if
- * this is during the compilation phase (TRUE) or not (FALSE). */
+ PERL_ARGS_ASSERT_MY_STRERROR;
- const COP * const cop = (compiling) ? &PL_compiling : PL_curcop;
+ DEBUG_STRERROR_ENTER(errnum, 0);
- SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0);
- if (! these_categories || these_categories == &PL_sv_placeholder) {
- return FALSE;
- }
+ const char *errstr = savepv(Strerror(errnum));
+ *utf8ness = UTF8NESS_IMMATERIAL;
- /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get
- * a valid unsigned */
- assert(category >= -1);
- return cBOOL(SvUV(these_categories) & (1U << (category + 1)));
-}
-
-char *
-Perl_my_strerror(pTHX_ const int errnum)
-{
- /* Returns a mortalized copy of the text of the error message associated
- * with 'errnum'. It uses the current locale's text unless the platform
- * doesn't have the LC_MESSAGES category or we are not being called from
- * within the scope of 'use locale'. In the former case, it uses whatever
- * strerror returns; in the latter case it uses the text from the C locale.
- *
- * The function just calls strerror(), but temporarily switches, if needed,
- * to the C locale */
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
- char *errstr;
-
-#ifndef USE_LOCALE_MESSAGES
-
- /* If platform doesn't have messages category, we don't do any switching to
- * the C locale; we just use whatever strerror() returns */
+ SAVEFREEPV(errstr);
+ return errstr;
+}
- errstr = savepv(Strerror(errnum));
+/*--------------------------------------------------------------------------*/
+# elif ! defined(USE_LOCALE_CTYPE) || ! defined(USE_LOCALE_MESSAGES)
-#else /* Has locale messages */
+/* Here one or the other of CTYPE or MESSAGES is defined, but not both. If we
+ * are not within 'use locale' scope of the only one defined, we use the C
+ * locale; otherwise use the current locale */
- const bool within_locale_scope = IN_LC(LC_MESSAGES);
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
+{
+ PERL_ARGS_ASSERT_MY_STRERROR;
-# ifndef USE_ITHREADS
+ DEBUG_STRERROR_ENTER(errnum, IN_LC(categories[WHICH_LC_INDEX]));
- /* This function is trivial without threads. */
- if (within_locale_scope) {
- errstr = savepv(strerror(errnum));
+ const char *errstr;
+ if (IN_LC(categories[WHICH_LC_INDEX])) {
+ errstr = savepv(Strerror(errnum));
+ *utf8ness = get_locale_string_utf8ness_i(errstr,
+ LOCALE_UTF8NESS_UNKNOWN,
+ NULL, WHICH_LC_INDEX);
}
else {
- const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL));
- do_setlocale_c(LC_MESSAGES, "C");
- errstr = savepv(strerror(errnum));
- do_setlocale_c(LC_MESSAGES, save_locale);
- Safefree(save_locale);
- }
+ SETLOCALE_LOCK;
-# elif defined(USE_POSIX_2008_LOCALE) \
- && defined(HAS_STRERROR_L)
+ const char * orig_locale = toggle_locale_i(WHICH_LC_INDEX, "C");
- /* This function is also trivial if we don't have to worry about thread
- * safety and have strerror_l(), as it handles the switch of locales so we
- * don't have to deal with that. We don't have to worry about thread
- * safety if strerror_r() is also available. Both it and strerror_l() are
- * thread-safe. Plain strerror() isn't thread safe. But on threaded
- * builds when strerror_r() is available, the apparent call to strerror()
- * below is actually a macro that behind-the-scenes calls strerror_r(). */
+ errstr = savepv(Strerror(errnum));
-# ifdef HAS_STRERROR_R
+ restore_toggled_locale_i(WHICH_LC_INDEX, orig_locale);
- if (within_locale_scope) {
- errstr = savepv(strerror(errnum));
- }
- else {
- errstr = savepv(strerror_l(errnum, PL_C_locale_obj));
- }
+ SETLOCALE_UNLOCK;
-# else
+ *utf8ness = UTF8NESS_IMMATERIAL;
- /* Here we have strerror_l(), but not strerror_r() and we are on a
- * threaded-build. We use strerror_l() for everything, constructing a
- * locale to pass to it if necessary */
+ }
- bool do_free = FALSE;
- locale_t locale_to_use;
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
- if (within_locale_scope) {
- locale_to_use = uselocale((locale_t) 0);
- if (locale_to_use == LC_GLOBAL_LOCALE) {
- locale_to_use = duplocale(LC_GLOBAL_LOCALE);
- do_free = TRUE;
- }
- }
- else { /* Use C locale if not within 'use locale' scope */
- locale_to_use = PL_C_locale_obj;
- }
+ SAVEFREEPV(errstr);
+ return errstr;
+}
- errstr = savepv(strerror_l(errnum, locale_to_use));
+/*--------------------------------------------------------------------------*/
+# else
- if (do_free) {
- freelocale(locale_to_use);
- }
+/* Below, have both LC_CTYPE and LC_MESSAGES. Place them in the same CODESET,
+ * either C or the LC_MESSAGES locale */
-# endif
-# else /* Doesn't have strerror_l() */
+const char *
+Perl_my_strerror(pTHX_ const int errnum, utf8ness_t * utf8ness)
+{
+ PERL_ARGS_ASSERT_MY_STRERROR;
+
+ DEBUG_STRERROR_ENTER(errnum, IN_LC(LC_MESSAGES));
- const char * save_locale = NULL;
- bool locale_is_C = FALSE;
+ const char * desired_locale = (IN_LC(LC_MESSAGES))
+ ? querylocale_c(LC_MESSAGES)
+ : "C";
+ /* XXX Can fail on z/OS */
- /* We have a critical section to prevent another thread from executing this
- * same code at the same time. (On thread-safe perls, the LOCK is a
- * no-op.) Since this is the only place in core that changes LC_MESSAGES
- * (unless the user has called setlocale(), this works to prevent races. */
SETLOCALE_LOCK;
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "my_strerror called with errnum %d\n", errnum));
- if (! within_locale_scope) {
- save_locale = do_setlocale_c(LC_MESSAGES, NULL);
- if (! save_locale) {
- SETLOCALE_UNLOCK;
- Perl_croak(aTHX_
- "panic: %s: %d: Could not find current LC_MESSAGES locale,"
- " errno=%d\n", __FILE__, __LINE__, errno);
- }
- else {
- locale_is_C = isNAME_C_OR_POSIX(save_locale);
-
- /* Switch to the C locale if not already in it */
- if (! locale_is_C) {
-
- /* The setlocale() just below likely will zap 'save_locale', so
- * create a copy. */
- save_locale = savepv(save_locale);
- if (! do_setlocale_c(LC_MESSAGES, "C")) {
-
- /* If, for some reason, the locale change failed, we
- * soldier on as best as possible under the circumstances,
- * using the current locale, and clear save_locale, so we
- * don't try to change back. On z/0S, all setlocale()
- * calls fail after you've created a thread. This is their
- * way of making sure the entire process is always a single
- * locale. This means that 'use locale' is always in place
- * for messages under these circumstances. */
- Safefree(save_locale);
- save_locale = NULL;
- }
- }
- }
- } /* end of ! within_locale_scope */
- else {
- DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n",
- __FILE__, __LINE__));
- }
+ const char* orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, desired_locale);
+ const char* orig_MESSAGES_locale = toggle_locale_c(LC_MESSAGES,
+ desired_locale);
+ const char *errstr = savepv(Strerror(errnum));
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "Any locale change has been done; about to call Strerror\n"));
- errstr = savepv(Strerror(errnum));
-
- if (! within_locale_scope) {
- if (save_locale && ! locale_is_C) {
- if (! do_setlocale_c(LC_MESSAGES, save_locale)) {
- SETLOCALE_UNLOCK;
- Perl_croak(aTHX_
- "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n",
- __FILE__, __LINE__, save_locale, errno);
- }
- Safefree(save_locale);
- }
- }
+ restore_toggled_locale_c(LC_MESSAGES, orig_MESSAGES_locale);
+ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
SETLOCALE_UNLOCK;
-# endif /* End of doesn't have strerror_l */
-# ifdef DEBUGGING
-
- if (DEBUG_Lv_TEST) {
- PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '");
- print_bytes_for_locale(errstr, errstr + strlen(errstr), 0);
- PerlIO_printf(Perl_debug_log, "'\n");
- }
-
-# endif
-#endif /* End of does have locale messages */
+ *utf8ness = get_locale_string_utf8ness_i(errstr, LOCALE_UTF8NESS_UNKNOWN,
+ NULL, LC_MESSAGES_INDEX_);
+ DEBUG_STRERROR_RETURN(errstr, utf8ness);
SAVEFREEPV(errstr);
return errstr;
}
+/*--------------------------------------------------------------------------*/
+# endif /* end of not using strerror_l() */
+#endif /* end of all the my_strerror() implementations */
+
/*
=for apidoc switch_to_global_locale
-On systems without locale support, or on typical single-threaded builds, or on
-platforms that do not support per-thread locale operations, this function does
-nothing. On such systems that do have locale support, only a locale global to
-the whole program is available.
+This function copies the locale state of the calling thread into the program's
+global locale, and converts the thread to use that global locale.
+
+It is intended so that Perl can safely be used with C libraries that access the
+global locale and which can't be converted to not access it. Effectively, this
+means libraries that call C<L<setlocale(3)>> on non-Windows systems. (For
+portability, it is a good idea to use it on Windows as well.)
-On multi-threaded builds on systems that do have per-thread locale operations,
-this function converts the thread it is running in to use the global locale.
-This is for code that has not yet or cannot be updated to handle multi-threaded
-locale operation. As long as only a single thread is so-converted, everything
-works fine, as all the other threads continue to ignore the global one, so only
-this thread looks at it.
+A downside of using it is that it disables the services that Perl provides to
+hide locale gotchas from your code. The service you most likely will miss
+regards the radix character (decimal point) in floating point numbers. Code
+executed after this function is called can no longer just assume that this
+character is correct for the current circumstances.
-However, on Windows systems this isn't quite true prior to Visual Studio 15,
-at which point Microsoft fixed a bug. A race can occur if you use the
-following operations on earlier Windows platforms:
+To return to Perl control, and restart the gotcha prevention services, call
+C<L</sync_locale>>. Behavior is undefined for any pure Perl code that executes
+while the switch is in effect.
+
+The global locale and the per-thread locales are independent. As long as just
+one thread converts to the global locale, everything works smoothly. But if
+more than one does, they can easily interfere with each other, and races are
+likely. On Windows systems prior to Visual Studio 15 (at which point Microsoft
+fixed a bug), races can occur (even if only one thread has been converted to
+the global locale), but only if you use the following operations:
=over
@@ -5352,59 +6962,98 @@ following operations on earlier Windows platforms:
=back
The first item is not fixable (except by upgrading to a later Visual Studio
-release), but it would be possible to work around the latter two items by using
-the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches
+release), but it would be possible to work around the latter two items by
+having Perl change its algorithm for calculating these to use Windows API
+functions (likely C<GetNumberFormat> and C<GetCurrencyFormat>); patches
welcome.
-Without this function call, threads that use the L<C<setlocale(3)>> system
-function will not work properly, as all the locale-sensitive functions will
-look at the per-thread locale, and C<setlocale> will have no effect on this
-thread.
-
-Perl code should convert to either call
-L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system
-C<setlocale>) or use the methods given in L<perlcall> to call
+XS code should never call plain C<setlocale>, but should instead be converted
+to either call L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in
+for the system C<setlocale>) or use the methods given in L<perlcall> to call
L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly
handle all cases of single- vs multi-thread, POSIX 2008-supported or not.
-Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can
-continue to work if this function is called before transferring control to the
-library.
-
-Upon return from the code that needs to use the global locale,
-L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe
-multi-thread operation.
-
=cut
*/
void
-Perl_switch_to_global_locale()
+Perl_switch_to_global_locale(pTHX)
{
-#ifdef USE_THREAD_SAFE_LOCALE
-# ifdef WIN32
+#ifdef USE_LOCALE
- _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
+ DEBUG_L(PerlIO_printf(Perl_debug_log, "Entering switch_to_global; %s\n",
+ get_LC_ALL_display()));
+ bool perl_controls = false;
-# else
-# ifdef HAS_QUERYLOCALE
+# ifdef USE_THREAD_SAFE_LOCALE
+
+ /* In these cases, we use the system state to determine if we are in the
+ * global locale or not. */
+
+# ifdef USE_POSIX_2008_LOCALE
+
+ perl_controls = (LC_GLOBAL_LOCALE != uselocale((locale_t) 0));
- setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0)));
+# elif defined(WIN32)
+
+ perl_controls = (_configthreadlocale(0) == _ENABLE_PER_THREAD_LOCALE);
# else
+# error Unexpected Configuration
+# endif
+# endif
- {
- unsigned int i;
+ /* No-op if already in global */
+ if (! perl_controls) {
+ return;
+ }
- for (i = 0; i < LC_ALL_INDEX; i++) {
- setlocale(categories[i], do_setlocale_r(categories[i], NULL));
- }
+# ifdef USE_THREAD_SAFE_LOCALE
+# if defined(WIN32)
+
+ const char * thread_locale = posix_setlocale(LC_ALL, NULL);
+ _configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
+ posix_setlocale(LC_ALL, thread_locale);
+
+# else /* Must be USE_POSIX_2008_LOCALE) */
+
+ const char * cur_thread_locales[NOMINAL_LC_ALL_INDEX + 1];
+
+ /* Save each category's current per-thread state */
+ for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ cur_thread_locales[i] = querylocale_i(i);
+ }
+
+ /* Now switch to global */
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Switching to global locale\n"));
+
+ locale_t old_locale = uselocale(LC_GLOBAL_LOCALE);
+ if (! old_locale) {
+ locale_panic_("Could not change to global locale");
+ }
+
+ /* Free the per-thread memory */
+ if (old_locale != LC_GLOBAL_LOCALE && old_locale != PL_C_locale_obj) {
+ freelocale(old_locale);
+ }
+
+ /* Set the global to what was our per-thread state */
+ POSIX_SETLOCALE_LOCK;
+ for (unsigned int i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ posix_setlocale(categories[i], cur_thread_locales[i]);
}
+ POSIX_SETLOCALE_UNLOCK;
# endif
+# endif
+# ifdef USE_LOCALE_NUMERIC
- uselocale(LC_GLOBAL_LOCALE);
+ /* Switch to the underlying C numeric locale; the application is on its
+ * own. */
+ POSIX_SETLOCALE_LOCK;
+ posix_setlocale(LC_NUMERIC, PL_numeric_name);
+ POSIX_SETLOCALE_UNLOCK;
# endif
#endif
@@ -5415,27 +7064,45 @@ Perl_switch_to_global_locale()
=for apidoc sync_locale
+This function copies the state of the program global locale into the calling
+thread, and converts that thread to using per-thread locales, if it wasn't
+already, and the platform supports them. The LC_NUMERIC locale is toggled into
+the standard state (using the C locale's conventions), if not within the
+lexical scope of S<C<use locale>>.
+
+Perl will now consider itself to have control of the locale.
+
+Since unthreaded perls have only a global locale, this function is a no-op
+without threads.
+
+This function is intended for use with C libraries that do locale manipulation.
+It allows Perl to accommodate the use of them. Call this function before
+transferring back to Perl space so that it knows what state the C code has left
+things in.
+
+XS code should not manipulate the locale on its own. Instead,
L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or
change the locale (though changing the locale is antisocial and dangerous on
multi-threaded systems that don't have multi-thread safe locale operations.
-(See L<perllocale/Multi-threaded operation>). Using the system
-L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries
-called from XS, such as C<Gtk> do so, and this can't be changed. When the
-locale is changed by XS code that didn't use
-L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the
-locale has changed. Use this function to do so, before returning to Perl.
+(See L<perllocale/Multi-threaded operation>).
+
+Using the libc L<C<setlocale(3)>> function should be avoided. Nevertheless,
+certain non-Perl libraries called from XS, do call it, and their behavior may
+not be able to be changed. This function, along with
+C<L</switch_to_global_locale>>, can be used to get seamless behavior in these
+circumstances, as long as only one thread is involved.
+
+If the library has an option to turn off its locale manipulation, doing that is
+preferable to using this mechanism. C<Gtk> is such a library.
The return value is a boolean: TRUE if the global locale at the time of call
-was in effect; and FALSE if a per-thread locale was in effect. This can be
-used by the caller that needs to restore things as-they-were to decide whether
-or not to call
-L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>.
+was in effect for the caller; and FALSE if a per-thread locale was in effect.
=cut
*/
bool
-Perl_sync_locale()
+Perl_sync_locale(pTHX)
{
#ifndef USE_LOCALE
@@ -5444,77 +7111,52 @@ Perl_sync_locale()
#else
- const char * newlocale;
- dTHX;
-
-# ifdef USE_POSIX_2008_LOCALE
+ bool was_in_global = TRUE;
- bool was_in_global_locale = FALSE;
- locale_t cur_obj = uselocale((locale_t) 0);
+# ifdef USE_THREAD_SAFE_LOCALE
+# if defined(WIN32)
- /* On Windows, unless the foreign code has turned off the thread-safe
- * locale setting, any plain setlocale() will have affected what we see, so
- * no need to worry. Otherwise, If the foreign code has done a plain
- * setlocale(), it will only affect the global locale on POSIX systems, but
- * will affect the */
- if (cur_obj == LC_GLOBAL_LOCALE) {
+ was_in_global = _configthreadlocale(_DISABLE_PER_THREAD_LOCALE)
+ == _DISABLE_PER_THREAD_LOCALE;
-# ifdef HAS_QUERY_LOCALE
+# elif defined(USE_POSIX_2008_LOCALE)
- do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL));
+ was_in_global = (LC_GLOBAL_LOCALE == uselocale((locale_t) 0));
# else
-
- unsigned int i;
-
- /* We can't trust that we can read the LC_ALL format on the
- * platform, so do them individually */
- for (i = 0; i < LC_ALL_INDEX; i++) {
- do_setlocale_r(categories[i], setlocale(categories[i], NULL));
- }
-
+# error Unexpected Configuration
# endif
-
- was_in_global_locale = TRUE;
+# endif /* USE_THREAD_SAFE_LOCALE */
+
+ /* Here, we are in the global locale. Get and save the values for each
+ * category. */
+ const char * current_globals[NOMINAL_LC_ALL_INDEX];
+ for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ POSIX_SETLOCALE_LOCK;
+ current_globals[i] = savepv(stdized_setlocale(categories[i], NULL));
+ POSIX_SETLOCALE_UNLOCK;
}
-# else
-
- bool was_in_global_locale = TRUE;
+ /* Now we have to convert the current thread to use them */
-# endif
-# ifdef USE_LOCALE_CTYPE
-
- newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL));
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s:%d: %s\n", __FILE__, __LINE__,
- setlocale_debug_string(LC_CTYPE, NULL, newlocale)));
- new_ctype(newlocale);
- Safefree(newlocale);
+# if defined(WIN32)
-# endif /* USE_LOCALE_CTYPE */
-# ifdef USE_LOCALE_COLLATE
-
- newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL));
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s:%d: %s\n", __FILE__, __LINE__,
- setlocale_debug_string(LC_COLLATE, NULL, newlocale)));
- new_collate(newlocale);
- Safefree(newlocale);
+ /* On Windows, convert to per-thread behavior. This isn't necessary in
+ * POSIX 2008, as the conversion gets done automatically in the loop below.
+ * */
+ _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
# endif
-# ifdef USE_LOCALE_NUMERIC
- newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL));
- DEBUG_Lv(PerlIO_printf(Perl_debug_log,
- "%s:%d: %s\n", __FILE__, __LINE__,
- setlocale_debug_string(LC_NUMERIC, NULL, newlocale)));
- new_numeric(newlocale);
- Safefree(newlocale);
+ for (unsigned i = 0; i < NOMINAL_LC_ALL_INDEX; i++) {
+ setlocale_i(i, current_globals[i]);
+ Safefree(current_globals[i]);
+ }
-# endif /* USE_LOCALE_NUMERIC */
+ /* And update our remaining records. 'true' => force recalculation */
+ new_LC_ALL(NULL, true);
- return was_in_global_locale;
+ return was_in_global;
#endif
@@ -5523,75 +7165,127 @@ Perl_sync_locale()
#if defined(DEBUGGING) && defined(USE_LOCALE)
STATIC char *
-S_setlocale_debug_string(const int category, /* category number,
- like LC_ALL */
- const char* const locale, /* locale name */
+S_my_setlocale_debug_string_i(pTHX_
+ const unsigned cat_index,
+ const char* locale, /* Optional locale name */
+
+ /* return value from setlocale() when attempting
+ * to set 'category' to 'locale' */
+ const char* retval,
- /* return value from setlocale() when attempting to
- * set 'category' to 'locale' */
- const char* const retval)
+ const line_t line)
{
/* Returns a pointer to a NUL-terminated string in static storage with
* added text about the info passed in. This is not thread safe and will
* be overwritten by the next call, so this should be used just to
* formulate a string to immediately print or savepv() on. */
- static char ret[256];
+ const char * locale_quote;
+ const char * retval_quote;
- my_strlcpy(ret, "setlocale(", sizeof(ret));
- my_strlcat(ret, category_name(category), sizeof(ret));
- my_strlcat(ret, ", ", sizeof(ret));
+ assert(cat_index <= NOMINAL_LC_ALL_INDEX);
- if (locale) {
- my_strlcat(ret, "\"", sizeof(ret));
- my_strlcat(ret, locale, sizeof(ret));
- my_strlcat(ret, "\"", sizeof(ret));
+ if (locale == NULL) {
+ locale_quote = "";
+ locale = "NULL";
}
else {
- my_strlcat(ret, "NULL", sizeof(ret));
+ locale_quote = "\"";
}
- my_strlcat(ret, ") returned ", sizeof(ret));
-
- if (retval) {
- my_strlcat(ret, "\"", sizeof(ret));
- my_strlcat(ret, retval, sizeof(ret));
- my_strlcat(ret, "\"", sizeof(ret));
+ if (retval == NULL) {
+ retval_quote = "";
+ retval = "NULL";
}
else {
- my_strlcat(ret, "NULL", sizeof(ret));
+ retval_quote = "\"";
}
- assert(strlen(ret) < sizeof(ret));
+# ifdef USE_LOCALE_THREADS
+# define THREAD_FORMAT "%p:"
+# define THREAD_ARGUMENT aTHX_
+# else
+# define THREAD_FORMAT
+# define THREAD_ARGUMENT
+# endif
- return ret;
+ return Perl_form(aTHX_
+ "%s:%" LINE_Tf ": " THREAD_FORMAT
+ " setlocale(%s[%d], %s%s%s) returned %s%s%s\n",
+
+ __FILE__, line, THREAD_ARGUMENT
+ category_names[cat_index], categories[cat_index],
+ locale_quote, locale, locale_quote,
+ retval_quote, retval, retval_quote);
}
#endif
+#ifdef USE_PERL_SWITCH_LOCALE_CONTEXT
void
-Perl_thread_locale_init()
+Perl_switch_locale_context()
{
- /* Called from a thread on startup*/
+ /* libc keeps per-thread locale status information in some configurations.
+ * So, we can't just switch out aTHX to switch to a new thread. libc has
+ * to follow along. This routine does that based on per-interpreter
+ * variables we keep just for this purpose */
-#ifdef USE_THREAD_SAFE_LOCALE
+ /* Can't use pTHX, because we may be called from a place where that
+ * isn't available */
+ dTHX;
- dTHX_DEBUGGING;
+ if (UNLIKELY( aTHX == NULL
+ || PL_veto_switch_non_tTHX_context
+ || PL_phase == PERL_PHASE_CONSTRUCT))
+ {
+ return;
+ }
- /* C starts the new thread in the global C locale. If we are thread-safe,
- * we want to not be in the global locale */
+# ifdef USE_POSIX_2008_LOCALE
- DEBUG_L(PerlIO_printf(Perl_debug_log,
- "%s:%d: new thread, initial locale is %s; calling setlocale\n",
- __FILE__, __LINE__, setlocale(LC_ALL, NULL)));
+ if (! uselocale(PL_cur_locale_obj)) {
+ locale_panic_(Perl_form(aTHX_
+ "Can't uselocale(%p), LC_ALL supposed to be '%s",
+ PL_cur_locale_obj, get_LC_ALL_display()));
+ }
-# ifdef WIN32
+# elif defined(WIN32)
- _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
+ if (! bool_setlocale_c(LC_ALL, PL_cur_LC_ALL)) {
+ locale_panic_(Perl_form(aTHX_ "Can't setlocale(%s)", PL_cur_LC_ALL));
+ }
-# else
+# endif
- Perl_setlocale(LC_ALL, "C");
+}
+
+#endif
+
+void
+Perl_thread_locale_init(pTHX)
+{
+
+#ifdef USE_THREAD_SAFE_LOCALE
+# ifdef USE_POSIX_2008_LOCALE
+
+ /* Called from a thread on startup.
+ *
+ * The operations here have to be done from within the calling thread, as
+ * they affect libc's knowledge of the thread; libc has no knowledge of
+ * aTHX */
+
+ DEBUG_L(PerlIO_printf(Perl_debug_log,
+ "new thread, initial locale is %s;"
+ " calling setlocale(LC_ALL, \"C\")\n",
+ get_LC_ALL_display()));
+
+ uselocale(PL_C_locale_obj);
+
+# elif defined(WIN32)
+
+ /* On Windows, make sure new thread has per-thread locales enabled */
+ _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
+ void_setlocale_c(LC_ALL, "C");
# endif
#endif
@@ -5599,25 +7293,36 @@ Perl_thread_locale_init()
}
void
-Perl_thread_locale_term()
+Perl_thread_locale_term(pTHX)
{
- /* Called from a thread as it gets ready to terminate */
+ /* Called from a thread as it gets ready to terminate.
+ *
+ * The operations here have to be done from within the calling thread, as
+ * they affect libc's knowledge of the thread; libc has no knowledge of
+ * aTHX */
-#ifdef USE_THREAD_SAFE_LOCALE
+#ifdef USE_POSIX_2008_LOCALE
/* C starts the new thread in the global C locale. If we are thread-safe,
* we want to not be in the global locale */
-# ifndef WIN32
+ /* Free up */
+ locale_t actual_obj = uselocale(LC_GLOBAL_LOCALE);
+ if (actual_obj != LC_GLOBAL_LOCALE && actual_obj != PL_C_locale_obj) {
+ freelocale(actual_obj);
+ }
- { /* Free up */
- locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE);
- if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) {
- freelocale(cur_obj);
- }
+ /* Prevent leaks even if something has gone wrong */
+ locale_t expected_obj = PL_cur_locale_obj;
+ if (UNLIKELY( expected_obj != actual_obj
+ && expected_obj != LC_GLOBAL_LOCALE
+ && expected_obj != PL_C_locale_obj))
+ {
+ freelocale(expected_obj);
}
-# endif
+ PL_cur_locale_obj = LC_GLOBAL_LOCALE;
+
#endif
}