PostgreSQL/src/include/utils/pg_locale.h
Jeff Davis ea1db8ae70 Canonicalize ICU locale names to language tags.
Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut
2023-04-04 10:38:58 -07:00

137 lines
4.2 KiB
C

/*-----------------------------------------------------------------------
*
* PostgreSQL locale utilities
*
* src/include/utils/pg_locale.h
*
* Copyright (c) 2002-2023, PostgreSQL Global Development Group
*
*-----------------------------------------------------------------------
*/
#ifndef _PG_LOCALE_
#define _PG_LOCALE_
#if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE)
#include <xlocale.h>
#endif
#ifdef USE_ICU
#include <unicode/ucol.h>
#endif
#ifdef USE_ICU
/*
* ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
* (see
* <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
*/
#if U_ICU_VERSION_MAJOR_NUM >= 53
#define HAVE_UCOL_STRCOLLUTF8 1
#else
#undef HAVE_UCOL_STRCOLLUTF8
#endif
#endif
/* use for libc locale names */
#define LOCALE_NAME_BUFLEN 128
/* GUC settings */
extern PGDLLIMPORT char *locale_messages;
extern PGDLLIMPORT char *locale_monetary;
extern PGDLLIMPORT char *locale_numeric;
extern PGDLLIMPORT char *locale_time;
extern PGDLLIMPORT int icu_validation_level;
/* lc_time localization cache */
extern PGDLLIMPORT char *localized_abbrev_days[];
extern PGDLLIMPORT char *localized_full_days[];
extern PGDLLIMPORT char *localized_abbrev_months[];
extern PGDLLIMPORT char *localized_full_months[];
/* is the databases's LC_CTYPE the C locale? */
extern PGDLLIMPORT bool database_ctype_is_c;
extern bool check_locale(int category, const char *locale, char **canonname);
extern char *pg_perm_setlocale(int category, const char *locale);
extern void check_strxfrm_bug(void);
extern bool lc_collate_is_c(Oid collation);
extern bool lc_ctype_is_c(Oid collation);
/*
* Return the POSIX lconv struct (contains number/money formatting
* information) with locale information for all categories.
*/
extern struct lconv *PGLC_localeconv(void);
extern void cache_locale_time(void);
/*
* We define our own wrapper around locale_t so we can keep the same
* function signatures for all builds, while not having to create a
* fake version of the standard type locale_t in the global namespace.
* pg_locale_t is occasionally checked for truth, so make it a pointer.
*/
struct pg_locale_struct
{
char provider;
bool deterministic;
union
{
#ifdef HAVE_LOCALE_T
locale_t lt;
#endif
#ifdef USE_ICU
struct
{
const char *locale;
UCollator *ucol;
} icu;
#endif
int dummy; /* in case we have neither LOCALE_T nor ICU */
} info;
};
typedef struct pg_locale_struct *pg_locale_t;
extern PGDLLIMPORT struct pg_locale_struct default_locale;
extern void make_icu_collator(const char *iculocstr,
const char *icurules,
struct pg_locale_struct *resultp);
extern bool pg_locale_deterministic(pg_locale_t locale);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
extern int pg_strncoll(const char *arg1, size_t len1,
const char *arg2, size_t len2, pg_locale_t locale);
extern bool pg_strxfrm_enabled(pg_locale_t locale);
extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
pg_locale_t locale);
extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
size_t srclen, pg_locale_t locale);
extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale);
extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
size_t srclen, pg_locale_t locale);
extern void icu_validate_locale(const char *loc_str);
extern char *icu_language_tag(const char *loc_str, int elevel);
#ifdef USE_ICU
extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
#endif
/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
pg_locale_t locale);
extern size_t char2wchar(wchar_t *to, size_t tolen,
const char *from, size_t fromlen, pg_locale_t locale);
#endif /* _PG_LOCALE_ */