Add support for collation attributes on older ICU versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.

Also add some tests for ICU collation customization.

Reported-by: Daniel Verite <daniel@manitou-mail.org>
Discussion: https://www.postgresql.org/message-id/0270ebd4-f67c-8774-1a5a-91adfb9bb41f@2ndquadrant.com
This commit is contained in:
Peter Eisentraut 2019-03-17 08:16:33 +01:00
parent 042162d628
commit b8f9a2a69a
3 changed files with 164 additions and 0 deletions

View File

@ -58,6 +58,7 @@
#include "catalog/pg_control.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "utils/formatting.h"
#include "utils/hsearch.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
@ -132,6 +133,9 @@ static HTAB *collation_cache = NULL;
static char *IsoLocaleName(const char *); /* MSVC specific */
#endif
#ifdef USE_ICU
/*
 * Forward declaration; the definition appears later in this file.  Parses
 * the "@key=value;..." part of the locale string "loc" and applies the
 * recognized collation attributes to "collator".
 */
static void icu_set_collation_attributes(UCollator *collator, const char *loc);
#endif
/*
* pg_perm_setlocale
@ -1380,6 +1384,9 @@ pg_newlocale_from_collation(Oid collid)
(errmsg("could not open collator for locale \"%s\": %s",
collcollate, u_errorName(status))));
if (U_ICU_VERSION_MAJOR_NUM < 54)
icu_set_collation_attributes(collator, collcollate);
/* We will leak this string if we get an error below :-( */
result.info.icu.locale = MemoryContextStrdup(TopMemoryContext,
collcollate);
@ -1588,6 +1595,103 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
return len_result;
}
/*
 * Parse collation attributes and apply them to the open collator.  This takes
 * a string like "und@colStrength=primary;colCaseLevel=yes" and parses and
 * applies the key-value arguments.
 *
 * collator: an already-open collator to which the attributes are applied.
 * loc: the locale string; it is not modified (a lowercased copy is parsed).
 *
 * Everything after the first '@' is split on ';' into "name=value" items.
 * Items without '=' and items whose name is not a known collation attribute
 * are silently skipped, so that other locale keywords do not cause failures.
 * A known attribute with an unrecognized value, or a failure reported by
 * ucol_setAttribute(), raises an ERROR.
 *
 * Starting with ICU version 54, the attributes are processed automatically by
 * ucol_open(), so this is only necessary for emulating this behavior on older
 * versions.
 */
pg_attribute_unused()
static void
icu_set_collation_attributes(UCollator *collator, const char *loc)
{
	/*
	 * Work on a lowercased copy so that name/value matching is
	 * case-insensitive and we can write terminators into it.
	 * NOTE(review): the copy is not released here; presumably asc_tolower()
	 * allocates it in the current memory context -- confirm.
	 */
	char	   *str = asc_tolower(loc, strlen(loc));
	char	   *token;
	char	   *next;

	str = strchr(str, '@');
	if (!str)
		return;
	str++;

	/*
	 * Split on ';' by hand rather than with strtok(), which keeps hidden
	 * static state and is therefore not reentrant.
	 */
	for (token = str; token != NULL; token = next)
	{
		char	   *semi = strchr(token, ';');
		char	   *e;
		char	   *name;
		char	   *value;
		UColAttribute uattr;
		UColAttributeValue uvalue = -1;
		UErrorCode	status = U_ZERO_ERROR;

		/* Terminate this item and remember where the next one starts. */
		if (semi)
		{
			*semi = '\0';
			next = semi + 1;
		}
		else
			next = NULL;

		/* Items that are not of the form name=value are ignored. */
		e = strchr(token, '=');
		if (!e)
			continue;

		*e = '\0';
		name = token;
		value = e + 1;

		/*
		 * See attribute name and value lists in ICU i18n/coll.cpp
		 */
		if (strcmp(name, "colstrength") == 0)
			uattr = UCOL_STRENGTH;
		else if (strcmp(name, "colbackwards") == 0)
			uattr = UCOL_FRENCH_COLLATION;
		else if (strcmp(name, "colcaselevel") == 0)
			uattr = UCOL_CASE_LEVEL;
		else if (strcmp(name, "colcasefirst") == 0)
			uattr = UCOL_CASE_FIRST;
		else if (strcmp(name, "colalternate") == 0)
			uattr = UCOL_ALTERNATE_HANDLING;
		else if (strcmp(name, "colnormalization") == 0)
			uattr = UCOL_NORMALIZATION_MODE;
		else if (strcmp(name, "colnumeric") == 0)
			uattr = UCOL_NUMERIC_COLLATION;
		else
		{
			/*
			 * Ignore if unknown.  We must not even look at the value here:
			 * it belongs to some keyword we do not handle, so failing to
			 * recognize it is not an error.
			 */
			continue;
		}

		if (strcmp(value, "primary") == 0)
			uvalue = UCOL_PRIMARY;
		else if (strcmp(value, "secondary") == 0)
			uvalue = UCOL_SECONDARY;
		else if (strcmp(value, "tertiary") == 0)
			uvalue = UCOL_TERTIARY;
		else if (strcmp(value, "quaternary") == 0)
			uvalue = UCOL_QUATERNARY;
		else if (strcmp(value, "identical") == 0)
			uvalue = UCOL_IDENTICAL;
		else if (strcmp(value, "no") == 0)
			uvalue = UCOL_OFF;
		else if (strcmp(value, "yes") == 0)
			uvalue = UCOL_ON;
		else if (strcmp(value, "shifted") == 0)
			uvalue = UCOL_SHIFTED;
		else if (strcmp(value, "non-ignorable") == 0)
			uvalue = UCOL_NON_IGNORABLE;
		else if (strcmp(value, "lower") == 0)
			uvalue = UCOL_LOWER_FIRST;
		else if (strcmp(value, "upper") == 0)
			uvalue = UCOL_UPPER_FIRST;
		else
			status = U_ILLEGAL_ARGUMENT_ERROR;

		/*
		 * Only call into ICU when the value was recognized; ICU itself
		 * rejects values that do not fit the attribute.
		 */
		if (!U_FAILURE(status))
			ucol_setAttribute(collator, uattr, uvalue, &status);

		/*
		 * Pretend the error came from ucol_open(), for consistent error
		 * message across ICU versions.
		 */
		if (U_FAILURE(status))
			ereport(ERROR,
					(errmsg("could not open collator for locale \"%s\": %s",
							loc, u_errorName(status))));
	}
}
#endif /* USE_ICU */
/*

View File

@ -1100,6 +1100,45 @@ select textrange_en_us('A','Z') @> 'b'::text;
drop type textrange_c;
drop type textrange_en_us;
-- test ICU collation customization
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
?column? | ?column?
----------+----------
t | t
(1 row)
CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
?column? | ?column?
----------+----------
t | t
(1 row)
CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
?column? | ?column?
----------+----------
t | t
(1 row)
CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
?column? | ?column?
----------+----------
t | t
(1 row)
CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
?column? | ?column?
----------+----------
t | t
(1 row)
CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
ERROR: could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR
-- cleanup
SET client_min_messages TO warning;
DROP SCHEMA collate_tests CASCADE;

View File

@ -425,6 +425,27 @@ drop type textrange_c;
drop type textrange_en_us;
-- test ICU collation customization
CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
-- cleanup
SET client_min_messages TO warning;
DROP SCHEMA collate_tests CASCADE;