mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-30 00:04:49 -04:00 
			
		
		
		
	Fix INITCAP() word boundaries for PG_UNICODE_FAST.
Word boundaries are based on whether a character is alphanumeric or not. For the PG_UNICODE_FAST collation, alphanumeric includes non-ASCII digits; whereas for the PG_C_UTF8 collation, it only includes digits 0-9. Pass down the right information from the pg_locale_t into initcap_wbnext to differentiate the behavior.

Reported-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
This commit is contained in:
		
							parent
							
								
									80b727eb9d
								
							
						
					
					
						commit
						90260e2ec6
					
				| @ -40,6 +40,7 @@ struct WordBoundaryState | ||||
| 	const char *str; | ||||
| 	size_t		len; | ||||
| 	size_t		offset; | ||||
| 	bool		posix; | ||||
| 	bool		init; | ||||
| 	bool		prev_alnum; | ||||
| }; | ||||
| @ -58,7 +59,7 @@ initcap_wbnext(void *state) | ||||
| 	{ | ||||
| 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str + | ||||
| 										wbstate->offset); | ||||
| 		bool		curr_alnum = pg_u_isalnum(u, true); | ||||
| 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix); | ||||
| 
 | ||||
| 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum) | ||||
| 		{ | ||||
| @ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, | ||||
| 		.str = src, | ||||
| 		.len = srclen, | ||||
| 		.offset = 0, | ||||
| 		.posix = !locale->info.builtin.casemap_full, | ||||
| 		.init = false, | ||||
| 		.prev_alnum = false, | ||||
| 	}; | ||||
|  | ||||
| @ -41,6 +41,7 @@ struct WordBoundaryState | ||||
| 	const char *str; | ||||
| 	size_t		len; | ||||
| 	size_t		offset; | ||||
| 	bool		posix; | ||||
| 	bool		init; | ||||
| 	bool		prev_alnum; | ||||
| }; | ||||
| @ -55,7 +56,7 @@ initcap_wbnext(void *state) | ||||
| 	{ | ||||
| 		pg_wchar	u = utf8_to_unicode((unsigned char *) wbstate->str + | ||||
| 										wbstate->offset); | ||||
| 		bool		curr_alnum = pg_u_isalnum(u, true); | ||||
| 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix); | ||||
| 
 | ||||
| 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum) | ||||
| 		{ | ||||
| @ -112,10 +113,13 @@ icu_test_full(char *str) | ||||
| 	char		icu_upper[BUFSZ]; | ||||
| 	char		icu_fold[BUFSZ]; | ||||
| 	UErrorCode	status; | ||||
| 
 | ||||
| 	/* full case mapping doesn't use posix semantics */ | ||||
| 	struct WordBoundaryState wbstate = { | ||||
| 		.str = str, | ||||
| 		.len = strlen(str), | ||||
| 		.offset = 0, | ||||
| 		.posix = false, | ||||
| 		.init = false, | ||||
| 		.prev_alnum = false, | ||||
| 	}; | ||||
| @ -344,6 +348,12 @@ test_convert_case() | ||||
| 	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'"); | ||||
| 	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς"); | ||||
| 	test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ"); | ||||
| 	/* test that alphanumerics are word characters */ | ||||
| 	test_convert(tfunc_title, "λλ", "Λλ"); | ||||
| 	test_convert(tfunc_title, "1a", "1a"); | ||||
| 	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */ | ||||
| 	test_convert(tfunc_title, "\uFF11a", "\uFF11a"); | ||||
| 
 | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| 	icu_test_full(""); | ||||
| @ -354,6 +364,7 @@ test_convert_case() | ||||
| 	icu_test_full("abc 123xyz"); | ||||
| 	icu_test_full("σςΣ ΣΣΣ"); | ||||
| 	icu_test_full("ıiIİ"); | ||||
| 	icu_test_full("\uFF11a"); | ||||
| 	/* test <alpha><iota_subscript><acute> */ | ||||
| 	icu_test_full("\u0391\u0345\u0301"); | ||||
| #endif | ||||
|  | ||||
| @ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES | ||||
|   ('abc DEF 123abc'), | ||||
|   ('ábc sßs ßss DÉF'), | ||||
|   ('DŽxxDŽ džxxDž Džxxdž'), | ||||
|   (U&'Λλ 1a \FF11a'), | ||||
|   ('ȺȺȺ'), | ||||
|   ('ⱥⱥⱥ'), | ||||
|   ('ⱥȺ'); | ||||
| @ -67,10 +68,11 @@ SELECT | ||||
|  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc  | ABC DEF 123ABC  |      14 |            14 |              14 |            14 | ||||
|  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF |      19 |            19 |              19 |            19 | ||||
|  DŽxxDŽ džxxDž Džxxdž  | džxxdž džxxdž džxxdž  | DŽxxdž DŽxxdž DŽxxdž  | DŽXXDŽ DŽXXDŽ DŽXXDŽ  |      20 |            20 |              20 |            20 | ||||
|  Λλ 1a １a       | λλ 1a １a       | Λλ 1a １A       | ΛΛ 1A １A       |      12 |            12 |              12 |            12 | ||||
|  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       6 |             9 |               8 |             6 | ||||
|  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ             | ȺȺȺ             |       9 |             9 |               8 |             6 | ||||
|  ⱥȺ              | ⱥⱥ              | Ⱥⱥ              | ȺȺ              |       5 |             6 |               5 |             4 | ||||
| (6 rows) | ||||
| (7 rows) | ||||
| 
 | ||||
| DROP TABLE test_pg_c_utf8; | ||||
| -- negative test: Final_Sigma not used for builtin locale C.UTF-8 | ||||
| @ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES | ||||
|   ('abc DEF 123abc'), | ||||
|   ('ábc sßs ßss DÉF'), | ||||
|   ('DŽxxDŽ džxxDž Džxxdž'), | ||||
|   (U&'Λλ 1a \FF11a'), | ||||
|   ('ȺȺȺ'), | ||||
|   ('ⱥⱥⱥ'), | ||||
|   ('ⱥȺ'); | ||||
| @ -197,10 +200,11 @@ SELECT | ||||
|  abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14 | ||||
|  ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19 | ||||
|  DŽxxDŽ džxxDž Džxxdž  | džxxdž džxxdž džxxdž  | Džxxdž Džxxdž Džxxdž   | DŽXXDŽ DŽXXDŽ DŽXXDŽ    |      20 |            20 |              20 |            20 | ||||
|  Λλ 1a １a       | λλ 1a １a       | Λλ 1a １a        | ΛΛ 1A １A         |      12 |            12 |              12 |            12 | ||||
|  ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6 | ||||
|  ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6 | ||||
|  ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4 | ||||
| (6 rows) | ||||
| (7 rows) | ||||
| 
 | ||||
| DROP TABLE test_pg_unicode_fast; | ||||
| -- test Final_Sigma | ||||
|  | ||||
| @ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES | ||||
|   ('abc DEF 123abc'), | ||||
|   ('ábc sßs ßss DÉF'), | ||||
|   ('DŽxxDŽ džxxDž Džxxdž'), | ||||
|   (U&'Λλ 1a \FF11a'), | ||||
|   ('ȺȺȺ'), | ||||
|   ('ⱥⱥⱥ'), | ||||
|   ('ⱥȺ'); | ||||
| @ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES | ||||
|   ('abc DEF 123abc'), | ||||
|   ('ábc sßs ßss DÉF'), | ||||
|   ('DŽxxDŽ džxxDž Džxxdž'), | ||||
|   (U&'Λλ 1a \FF11a'), | ||||
|   ('ȺȺȺ'), | ||||
|   ('ⱥⱥⱥ'), | ||||
|   ('ⱥȺ'); | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user