mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 00:03:57 -04:00 
			
		
		
		
	Refactor to add pg_strcoll(), pg_strxfrm(), and variants.
Offers a generally better separation of responsibilities for collation code. Also, a step towards multi-lib ICU, which should be based on a clean separation of the routines required for collation providers.

Callers with NUL-terminated strings should call pg_strcoll() or pg_strxfrm(); callers with strings and their length should call the variants pg_strncoll() or pg_strnxfrm().

Reviewed-by: Peter Eisentraut, Peter Geoghegan
Discussion: https://postgr.es/m/a581136455c940d7bd0ff482d3a2bd51af25a94f.camel%40j-davis.com
This commit is contained in:
		
							parent
							
								
									e9960732a9
								
							
						
					
					
						commit
						d87d548cd0
					
				| @ -292,21 +292,24 @@ hashtext(PG_FUNCTION_ARGS) | ||||
| #ifdef USE_ICU | ||||
| 		if (mylocale->provider == COLLPROVIDER_ICU) | ||||
| 		{ | ||||
| 			int32_t		ulen = -1; | ||||
| 			UChar	   *uchar = NULL; | ||||
| 			Size		bsize; | ||||
| 			uint8_t    *buf; | ||||
| 			Size		bsize, rsize; | ||||
| 			char	   *buf; | ||||
| 			const char *keydata = VARDATA_ANY(key); | ||||
| 			size_t		keylen = VARSIZE_ANY_EXHDR(key); | ||||
| 
 | ||||
| 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); | ||||
| 			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); | ||||
| 			buf = palloc(bsize + 1); | ||||
| 
 | ||||
| 			bsize = ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 									uchar, ulen, NULL, 0); | ||||
| 			buf = palloc(bsize); | ||||
| 			ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 							uchar, ulen, buf, bsize); | ||||
| 			pfree(uchar); | ||||
| 			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); | ||||
| 			if (rsize != bsize) | ||||
| 				elog(ERROR, "pg_strnxfrm() returned unexpected result"); | ||||
| 
 | ||||
| 			result = hash_any(buf, bsize); | ||||
| 			/*
 | ||||
| 			 * In principle, there's no reason to include the terminating NUL | ||||
| 			 * character in the hash, but it was done before and the behavior | ||||
| 			 * must be preserved. | ||||
| 			 */ | ||||
| 			result = hash_any((uint8_t *) buf, bsize + 1); | ||||
| 
 | ||||
| 			pfree(buf); | ||||
| 		} | ||||
| @ -350,21 +353,25 @@ hashtextextended(PG_FUNCTION_ARGS) | ||||
| #ifdef USE_ICU | ||||
| 		if (mylocale->provider == COLLPROVIDER_ICU) | ||||
| 		{ | ||||
| 			int32_t		ulen = -1; | ||||
| 			UChar	   *uchar = NULL; | ||||
| 			Size		bsize; | ||||
| 			uint8_t    *buf; | ||||
| 			Size		bsize, rsize; | ||||
| 			char	   *buf; | ||||
| 			const char *keydata = VARDATA_ANY(key); | ||||
| 			size_t		keylen = VARSIZE_ANY_EXHDR(key); | ||||
| 
 | ||||
| 			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); | ||||
| 			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); | ||||
| 			buf = palloc(bsize + 1); | ||||
| 
 | ||||
| 			bsize = ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 									uchar, ulen, NULL, 0); | ||||
| 			buf = palloc(bsize); | ||||
| 			ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 							uchar, ulen, buf, bsize); | ||||
| 			pfree(uchar); | ||||
| 			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); | ||||
| 			if (rsize != bsize) | ||||
| 				elog(ERROR, "pg_strnxfrm() returned unexpected result"); | ||||
| 
 | ||||
| 			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); | ||||
| 			/*
 | ||||
| 			 * In principle, there's no reason to include the terminating NUL | ||||
| 			 * character in the hash, but it was done before and the behavior | ||||
| 			 * must be preserved. | ||||
| 			 */ | ||||
| 			result = hash_any_extended((uint8_t *) buf, bsize + 1, | ||||
| 									   PG_GETARG_INT64(1)); | ||||
| 
 | ||||
| 			pfree(buf); | ||||
| 		} | ||||
|  | ||||
| @ -79,6 +79,12 @@ | ||||
| #include <shlwapi.h> | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * This should be large enough that most strings will fit, but small enough | ||||
|  * that we feel comfortable putting it on the stack | ||||
|  */ | ||||
| #define		TEXTBUFLEN			1024 | ||||
| 
 | ||||
| #define		MAX_L10N_DATA		80 | ||||
| 
 | ||||
| 
 | ||||
| @ -123,6 +129,19 @@ static char *IsoLocaleName(const char *); | ||||
| #endif | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| /*
 | ||||
|  * Converter object for converting between ICU's UChar strings and C strings | ||||
|  * in database encoding.  Since the database encoding doesn't change, we only | ||||
|  * need one of these per session. | ||||
|  */ | ||||
| static UConverter *icu_converter = NULL; | ||||
| 
 | ||||
| static void init_icu_converter(void); | ||||
| static size_t uchar_length(UConverter *converter, | ||||
| 						   const char *str, int32_t len); | ||||
| static int32_t uchar_convert(UConverter *converter, | ||||
| 							 UChar *dest, int32_t destlen, | ||||
| 							 const char *str, int32_t srclen); | ||||
| static void icu_set_collation_attributes(UCollator *collator, const char *loc); | ||||
| #endif | ||||
| 
 | ||||
| @ -1731,15 +1750,705 @@ get_collation_actual_version(char collprovider, const char *collcollate) | ||||
| 	return collversion; | ||||
| } | ||||
| 
 | ||||
/*
 * pg_strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll() or wcscoll_l().
 *
 * 'arg1'/'arg2' are given with explicit byte lengths and need not be
 * nul-terminated. If 'locale' is NULL, wcscoll() is used; otherwise
 * wcscoll_l() is used when HAVE_LOCALE_T is defined.
 *
 * Returns the comparison result of the wide-character collation routine.
 * Raises an error if the conversion to UTF-16 or the comparison fails.
 */
#ifdef WIN32
static int
pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
							size_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;

	/* two bytes per input byte, plus two bytes for the wide terminator */
	int			a1len = len1 * 2 + 2;
	int			a2len = len2 * 2 + 2;
	int			r;
	int			result;

	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	/* use the stack buffer when both converted strings fit in it */
	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* cleared so that %m below reports an error set by the comparison */
	errno = 0;
#ifdef HAVE_LOCALE_T
	if (locale)
		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	else
#endif
		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
								 * headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
| 
 | ||||
| /*
 | ||||
|  * pg_strcoll_libc | ||||
|  * | ||||
|  * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for | ||||
|  * the given locale, platform, and database encoding. If the locale is NULL, | ||||
|  * use the database collation. | ||||
|  * | ||||
|  * Arguments must be encoded in the database encoding and nul-terminated. | ||||
|  */ | ||||
| static int | ||||
| pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) | ||||
| { | ||||
| 	int result; | ||||
| 
 | ||||
| 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC); | ||||
| #ifdef WIN32 | ||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 	{ | ||||
| 		size_t len1 = strlen(arg1); | ||||
| 		size_t len2 = strlen(arg2); | ||||
| 		result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); | ||||
| 	} | ||||
| 	else | ||||
| #endif							/* WIN32 */ | ||||
| 	if (locale) | ||||
| 	{ | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 		result = strcoll_l(arg1, arg2, locale->info.lt); | ||||
| #else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| #endif | ||||
| 	} | ||||
| 	else | ||||
| 		result = strcoll(arg1, arg2); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strncoll_libc | ||||
|  * | ||||
|  * Nul-terminate the arguments and call pg_strcoll_libc(). | ||||
|  */ | ||||
| static int | ||||
| pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, | ||||
| 				 pg_locale_t locale) | ||||
| { | ||||
| 	char	 sbuf[TEXTBUFLEN]; | ||||
| 	char	*buf	  = sbuf; | ||||
| 	size_t	 bufsize1 = len1 + 1; | ||||
| 	size_t	 bufsize2 = len2 + 1; | ||||
| 	char	*arg1n; | ||||
| 	char	*arg2n; | ||||
| 	int		 result; | ||||
| 
 | ||||
| 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC); | ||||
| 
 | ||||
| #ifdef WIN32 | ||||
| 	/* check for this case before doing the work for nul-termination */ | ||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 		return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); | ||||
| #endif							/* WIN32 */ | ||||
| 
 | ||||
| 	if (bufsize1 + bufsize2 > TEXTBUFLEN) | ||||
| 		buf = palloc(bufsize1 + bufsize2); | ||||
| 
 | ||||
| 	arg1n = buf; | ||||
| 	arg2n = buf + bufsize1; | ||||
| 
 | ||||
| 	/* nul-terminate arguments */ | ||||
| 	memcpy(arg1n, arg1, len1); | ||||
| 	arg1n[len1] = '\0'; | ||||
| 	memcpy(arg2n, arg2, len2); | ||||
| 	arg2n[len2] = '\0'; | ||||
| 
 | ||||
| 	result = pg_strcoll_libc(arg1n, arg2n, locale); | ||||
| 
 | ||||
| 	if (buf != sbuf) | ||||
| 		pfree(buf); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| /*
 | ||||
|  * Converter object for converting between ICU's UChar strings and C strings | ||||
|  * in database encoding.  Since the database encoding doesn't change, we only | ||||
|  * need one of these per session. | ||||
|  */ | ||||
| static UConverter *icu_converter = NULL; | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strncoll_icu_no_utf8 | ||||
|  * | ||||
|  * Convert the arguments from the database encoding to UChar strings, then | ||||
|  * call ucol_strcoll(). An argument length of -1 means that the string is | ||||
|  * NUL-terminated. | ||||
|  * | ||||
|  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), | ||||
|  * caller should call that instead. | ||||
|  */ | ||||
| static int | ||||
| pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1, | ||||
| 						const char *arg2, int32_t len2, pg_locale_t locale) | ||||
| { | ||||
| 	char	 sbuf[TEXTBUFLEN]; | ||||
| 	char	*buf = sbuf; | ||||
| 	int32_t	 ulen1; | ||||
| 	int32_t	 ulen2; | ||||
| 	size_t   bufsize1; | ||||
| 	size_t   bufsize2; | ||||
| 	UChar	*uchar1, | ||||
| 			*uchar2; | ||||
| 	int		 result; | ||||
| 
 | ||||
| 	Assert(locale->provider == COLLPROVIDER_ICU); | ||||
| #ifdef HAVE_UCOL_STRCOLLUTF8 | ||||
| 	Assert(GetDatabaseEncoding() != PG_UTF8); | ||||
| #endif | ||||
| 
 | ||||
| 	init_icu_converter(); | ||||
| 
 | ||||
| 	ulen1 = uchar_length(icu_converter, arg1, len1); | ||||
| 	ulen2 = uchar_length(icu_converter, arg2, len2); | ||||
| 
 | ||||
| 	bufsize1 = (ulen1 + 1) * sizeof(UChar); | ||||
| 	bufsize2 = (ulen2 + 1) * sizeof(UChar); | ||||
| 
 | ||||
| 	if (bufsize1 + bufsize2 > TEXTBUFLEN) | ||||
| 		buf = palloc(bufsize1 + bufsize2); | ||||
| 
 | ||||
| 	uchar1 = (UChar *) buf; | ||||
| 	uchar2 = (UChar *) (buf + bufsize1); | ||||
| 
 | ||||
| 	ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); | ||||
| 	ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); | ||||
| 
 | ||||
| 	result = ucol_strcoll(locale->info.icu.ucol, | ||||
| 						  uchar1, ulen1, | ||||
| 						  uchar2, ulen2); | ||||
| 
 | ||||
| 	if (buf != sbuf) | ||||
| 		pfree(buf); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strncoll_icu | ||||
|  * | ||||
|  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given | ||||
|  * database encoding. An argument length of -1 means the string is | ||||
|  * NUL-terminated. | ||||
|  * | ||||
|  * Arguments must be encoded in the database encoding. | ||||
|  */ | ||||
| static int | ||||
| pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, | ||||
| 				pg_locale_t locale) | ||||
| { | ||||
| 	int result; | ||||
| 
 | ||||
| 	Assert(locale->provider == COLLPROVIDER_ICU); | ||||
| 
 | ||||
| #ifdef HAVE_UCOL_STRCOLLUTF8 | ||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 	{ | ||||
| 		UErrorCode	status; | ||||
| 
 | ||||
| 		status = U_ZERO_ERROR; | ||||
| 		result = ucol_strcollUTF8(locale->info.icu.ucol, | ||||
| 								  arg1, len1, | ||||
| 								  arg2, len2, | ||||
| 								  &status); | ||||
| 		if (U_FAILURE(status)) | ||||
| 			ereport(ERROR, | ||||
| 					(errmsg("collation failed: %s", u_errorName(status)))); | ||||
| 	} | ||||
| 	else | ||||
| #endif | ||||
| 	{ | ||||
| 		result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); | ||||
| 	} | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| #endif							/* USE_ICU */ | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strcoll | ||||
|  * | ||||
|  * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), | ||||
|  * or wcscoll_l() as appropriate for the given locale, platform, and database | ||||
|  * encoding. If the locale is not specified, use the database collation. | ||||
|  * | ||||
|  * Arguments must be encoded in the database encoding and nul-terminated. | ||||
|  * | ||||
|  * The caller is responsible for breaking ties if the collation is | ||||
|  * deterministic; this maintains consistency with pg_strxfrm(), which cannot | ||||
|  * easily account for deterministic collations. | ||||
|  */ | ||||
| int | ||||
| pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) | ||||
| { | ||||
| 	int			result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		result = pg_strcoll_libc(arg1, arg2, locale); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strncoll_icu(arg1, -1, arg2, -1, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strncoll | ||||
|  * | ||||
|  * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(), | ||||
|  * or wcscoll_l() as appropriate for the given locale, platform, and database | ||||
|  * encoding. If the locale is not specified, use the database collation. | ||||
|  * | ||||
|  * Arguments must be encoded in the database encoding. | ||||
|  * | ||||
|  * This function may need to nul-terminate the arguments for libc functions; | ||||
|  * so if the caller already has nul-terminated strings, it should call | ||||
|  * pg_strcoll() instead. | ||||
|  * | ||||
|  * The caller is responsible for breaking ties if the collation is | ||||
|  * deterministic; this maintains consistency with pg_strnxfrm(), which cannot | ||||
|  * easily account for deterministic collations. | ||||
|  */ | ||||
| int | ||||
| pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, | ||||
| 			pg_locale_t locale) | ||||
| { | ||||
| 	int		 result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		result = pg_strncoll_libc(arg1, len1, arg2, len2, locale); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strncoll_icu(arg1, len1, arg2, len2, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| static size_t | ||||
| pg_strxfrm_libc(char *dest, const char *src, size_t destsize, | ||||
| 				pg_locale_t locale) | ||||
| { | ||||
| 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC); | ||||
| 
 | ||||
| #ifdef TRUST_STRXFRM | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 	if (locale) | ||||
| 		return strxfrm_l(dest, src, destsize, locale->info.lt); | ||||
| 	else | ||||
| #endif | ||||
| 		return strxfrm(dest, src, destsize); | ||||
| #else | ||||
| 	/* shouldn't happen */ | ||||
| 	elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| static size_t | ||||
| pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, | ||||
| 				 pg_locale_t locale) | ||||
| { | ||||
| 	char	 sbuf[TEXTBUFLEN]; | ||||
| 	char	*buf	 = sbuf; | ||||
| 	size_t	 bufsize = srclen + 1; | ||||
| 	size_t	 result; | ||||
| 
 | ||||
| 	Assert(!locale || locale->provider == COLLPROVIDER_LIBC); | ||||
| 
 | ||||
| 	if (bufsize > TEXTBUFLEN) | ||||
| 		buf = palloc(bufsize); | ||||
| 
 | ||||
| 	/* nul-terminate arguments */ | ||||
| 	memcpy(buf, src, srclen); | ||||
| 	buf[srclen] = '\0'; | ||||
| 
 | ||||
| 	result = pg_strxfrm_libc(dest, buf, destsize, locale); | ||||
| 
 | ||||
| 	if (buf != sbuf) | ||||
| 		pfree(buf); | ||||
| 
 | ||||
| 	/* if dest is defined, it should be nul-terminated */ | ||||
| 	Assert(result >= destsize || dest[result] == '\0'); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| 
 | ||||
| /* 'srclen' of -1 means the strings are NUL-terminated */ | ||||
| static size_t | ||||
| pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize, | ||||
| 				pg_locale_t locale) | ||||
| { | ||||
| 	char	 sbuf[TEXTBUFLEN]; | ||||
| 	char	*buf	= sbuf; | ||||
| 	UChar	*uchar; | ||||
| 	int32_t	 ulen; | ||||
| 	size_t   uchar_bsize; | ||||
| 	Size	 result_bsize; | ||||
| 
 | ||||
| 	Assert(locale->provider == COLLPROVIDER_ICU); | ||||
| 
 | ||||
| 	init_icu_converter(); | ||||
| 
 | ||||
| 	ulen = uchar_length(icu_converter, src, srclen); | ||||
| 
 | ||||
| 	uchar_bsize = (ulen + 1) * sizeof(UChar); | ||||
| 
 | ||||
| 	if (uchar_bsize > TEXTBUFLEN) | ||||
| 		buf = palloc(uchar_bsize); | ||||
| 
 | ||||
| 	uchar = (UChar *) buf; | ||||
| 
 | ||||
| 	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); | ||||
| 
 | ||||
| 	result_bsize = ucol_getSortKey(locale->info.icu.ucol, | ||||
| 								   uchar, ulen, | ||||
| 								   (uint8_t *) dest, destsize); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * ucol_getSortKey() counts the nul-terminator in the result length, but | ||||
| 	 * this function should not. | ||||
| 	 */ | ||||
| 	Assert(result_bsize > 0); | ||||
| 	result_bsize--; | ||||
| 
 | ||||
| 	if (buf != sbuf) | ||||
| 		pfree(buf); | ||||
| 
 | ||||
| 	/* if dest is defined, it should be nul-terminated */ | ||||
| 	Assert(result_bsize >= destsize || dest[result_bsize] == '\0'); | ||||
| 
 | ||||
| 	return result_bsize; | ||||
| } | ||||
| 
 | ||||
| /* 'srclen' of -1 means the strings are NUL-terminated */ | ||||
| static size_t | ||||
| pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen, | ||||
| 							   int32_t destsize, pg_locale_t locale) | ||||
| { | ||||
| 	char			 sbuf[TEXTBUFLEN]; | ||||
| 	char			*buf   = sbuf; | ||||
| 	UCharIterator	 iter; | ||||
| 	uint32_t		 state[2]; | ||||
| 	UErrorCode		 status; | ||||
| 	int32_t			 ulen  = -1; | ||||
| 	UChar			*uchar = NULL; | ||||
| 	size_t			 uchar_bsize; | ||||
| 	Size			 result_bsize; | ||||
| 
 | ||||
| 	Assert(locale->provider == COLLPROVIDER_ICU); | ||||
| 	Assert(GetDatabaseEncoding() != PG_UTF8); | ||||
| 
 | ||||
| 	init_icu_converter(); | ||||
| 
 | ||||
| 	ulen = uchar_length(icu_converter, src, srclen); | ||||
| 
 | ||||
| 	uchar_bsize = (ulen + 1) * sizeof(UChar); | ||||
| 
 | ||||
| 	if (uchar_bsize > TEXTBUFLEN) | ||||
| 		buf = palloc(uchar_bsize); | ||||
| 
 | ||||
| 	uchar = (UChar *) buf; | ||||
| 
 | ||||
| 	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); | ||||
| 
 | ||||
| 	uiter_setString(&iter, uchar, ulen); | ||||
| 	state[0] = state[1] = 0;	/* won't need that again */ | ||||
| 	status = U_ZERO_ERROR; | ||||
| 	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, | ||||
| 										&iter, | ||||
| 										state, | ||||
| 										(uint8_t *) dest, | ||||
| 										destsize, | ||||
| 										&status); | ||||
| 	if (U_FAILURE(status)) | ||||
| 		ereport(ERROR, | ||||
| 				(errmsg("sort key generation failed: %s", | ||||
| 						u_errorName(status)))); | ||||
| 
 | ||||
| 	return result_bsize; | ||||
| } | ||||
| 
 | ||||
| /* 'srclen' of -1 means the strings are NUL-terminated */ | ||||
| static size_t | ||||
| pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen, | ||||
| 					   int32_t destsize, pg_locale_t locale) | ||||
| { | ||||
| 	size_t result; | ||||
| 
 | ||||
| 	Assert(locale->provider == COLLPROVIDER_ICU); | ||||
| 
 | ||||
| 	if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 	{ | ||||
| 		UCharIterator iter; | ||||
| 		uint32_t	state[2]; | ||||
| 		UErrorCode	status; | ||||
| 
 | ||||
| 		uiter_setUTF8(&iter, src, srclen); | ||||
| 		state[0] = state[1] = 0;	/* won't need that again */ | ||||
| 		status = U_ZERO_ERROR; | ||||
| 		result = ucol_nextSortKeyPart(locale->info.icu.ucol, | ||||
| 									  &iter, | ||||
| 									  state, | ||||
| 									  (uint8_t *) dest, | ||||
| 									  destsize, | ||||
| 									  &status); | ||||
| 		if (U_FAILURE(status)) | ||||
| 			ereport(ERROR, | ||||
| 					(errmsg("sort key generation failed: %s", | ||||
| 							u_errorName(status)))); | ||||
| 	} | ||||
| 	else | ||||
| 		result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize, | ||||
| 												locale); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * Return true if the collation provider supports pg_strxfrm() and | ||||
|  * pg_strnxfrm(); otherwise false. | ||||
|  * | ||||
|  * Unfortunately, it seems that strxfrm() for non-C collations is broken on | ||||
|  * many common platforms; testing of multiple versions of glibc reveals that, | ||||
|  * for many locales, strcoll() and strxfrm() do not return consistent | ||||
|  * results. While no other libc other than Cygwin has so far been shown to | ||||
|  * have a problem, we take the conservative course of action for right now and | ||||
|  * disable this categorically.  (Users who are certain this isn't a problem on | ||||
|  * their system can define TRUST_STRXFRM.) | ||||
|  * | ||||
|  * No similar problem is known for the ICU provider. | ||||
|  */ | ||||
| bool | ||||
| pg_strxfrm_enabled(pg_locale_t locale) | ||||
| { | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| #ifdef TRUST_STRXFRM | ||||
| 		return true; | ||||
| #else | ||||
| 		return false; | ||||
| #endif | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		return true; | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strxfrm | ||||
|  * | ||||
|  * Transforms 'src' to a nul-terminated string stored in 'dest' such that | ||||
|  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on | ||||
|  * untransformed strings. | ||||
|  * | ||||
|  * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest' | ||||
|  * may be NULL. | ||||
|  * | ||||
|  * Returns the number of bytes needed to store the transformed string, | ||||
|  * excluding the terminating nul byte. If the value returned is 'destsize' or | ||||
|  * greater, the resulting contents of 'dest' are undefined. | ||||
|  */ | ||||
| size_t | ||||
| pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) | ||||
| { | ||||
| 	size_t result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		result = pg_strxfrm_libc(dest, src, destsize, locale); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strnxfrm_icu(dest, src, -1, destsize, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strnxfrm | ||||
|  * | ||||
|  * Transforms 'src' to a nul-terminated string stored in 'dest' such that | ||||
|  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on | ||||
|  * untransformed strings. | ||||
|  * | ||||
|  * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may | ||||
|  * be NULL. | ||||
|  * | ||||
|  * Returns the number of bytes needed to store the transformed string, | ||||
|  * excluding the terminating nul byte. If the value returned is 'destsize' or | ||||
|  * greater, the resulting contents of 'dest' are undefined. | ||||
|  * | ||||
|  * This function may need to nul-terminate the argument for libc functions; | ||||
|  * so if the caller already has a nul-terminated string, it should call | ||||
|  * pg_strxfrm() instead. | ||||
|  */ | ||||
| size_t | ||||
| pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, | ||||
| 			pg_locale_t locale) | ||||
| { | ||||
| 	size_t result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return true if the collation provider supports pg_strxfrm_prefix() and | ||||
|  * pg_strnxfrm_prefix(); otherwise false. | ||||
|  */ | ||||
| bool | ||||
| pg_strxfrm_prefix_enabled(pg_locale_t locale) | ||||
| { | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		return false; | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		return true; | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strxfrm_prefix | ||||
|  * | ||||
|  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary | ||||
|  * memcmp() on the byte sequence is equivalent to pg_strcoll() on | ||||
|  * untransformed strings. The result is not nul-terminated. | ||||
|  * | ||||
|  * The provided 'src' must be nul-terminated. | ||||
|  * | ||||
|  * If destsize is not large enough to hold the resulting byte sequence, stores | ||||
|  * only the first destsize bytes in 'dest'. Returns the number of bytes | ||||
|  * actually copied to 'dest'. | ||||
|  */ | ||||
| size_t | ||||
| pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, | ||||
| 				  pg_locale_t locale) | ||||
| { | ||||
| 	size_t result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()", | ||||
| 			 locale->provider); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * pg_strnxfrm_prefix | ||||
|  * | ||||
|  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary | ||||
|  * memcmp() on the byte sequence is equivalent to pg_strcoll() on | ||||
|  * untransformed strings. The result is not nul-terminated. | ||||
|  * | ||||
|  * The provided 'src' must be nul-terminated. | ||||
|  * | ||||
|  * If destsize is not large enough to hold the resulting byte sequence, stores | ||||
|  * only the first destsize bytes in 'dest'. Returns the number of bytes | ||||
|  * actually copied to 'dest'. | ||||
|  * | ||||
|  * This function may need to nul-terminate the argument for libc functions; | ||||
|  * so if the caller already has a nul-terminated string, it should call | ||||
|  * pg_strxfrm_prefix() instead. | ||||
|  */ | ||||
| size_t | ||||
| pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, | ||||
| 				   size_t srclen, pg_locale_t locale) | ||||
| { | ||||
| 	size_t result; | ||||
| 
 | ||||
| 	if (!locale || locale->provider == COLLPROVIDER_LIBC) | ||||
| 		elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()", | ||||
| 			 locale->provider); | ||||
| #ifdef USE_ICU | ||||
| 	else if (locale->provider == COLLPROVIDER_ICU) | ||||
| 		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); | ||||
| #endif | ||||
| 	else | ||||
| 		/* shouldn't happen */ | ||||
| 		elog(ERROR, "unsupported collprovider: %c", locale->provider); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| static void | ||||
| init_icu_converter(void) | ||||
| { | ||||
| @ -1767,6 +2476,39 @@ init_icu_converter(void) | ||||
| 	icu_converter = conv; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Find length, in UChars, of given string if converted to UChar string. | ||||
|  */ | ||||
| static size_t | ||||
| uchar_length(UConverter *converter, const char *str, int32_t len) | ||||
| { | ||||
| 	UErrorCode	status = U_ZERO_ERROR; | ||||
| 	int32_t		ulen; | ||||
| 	ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status); | ||||
| 	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) | ||||
| 		ereport(ERROR, | ||||
| 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); | ||||
| 	return ulen; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Convert the given source string into a UChar string, stored in dest, and | ||||
|  * return the length (in UChars). | ||||
|  */ | ||||
| static int32_t | ||||
| uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, | ||||
| 			  const char *src, int32_t srclen) | ||||
| { | ||||
| 	UErrorCode	status = U_ZERO_ERROR; | ||||
| 	int32_t		ulen; | ||||
| 	status = U_ZERO_ERROR; | ||||
| 	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status); | ||||
| 	if (U_FAILURE(status)) | ||||
| 		ereport(ERROR, | ||||
| 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); | ||||
| 	return ulen; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Convert a string in the database encoding into a string of UChars. | ||||
|  * | ||||
| @ -1782,26 +2524,15 @@ init_icu_converter(void) | ||||
| int32_t | ||||
| icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) | ||||
| { | ||||
| 	UErrorCode	status; | ||||
| 	int32_t		len_uchar; | ||||
| 	int32_t len_uchar; | ||||
| 
 | ||||
| 	init_icu_converter(); | ||||
| 
 | ||||
| 	status = U_ZERO_ERROR; | ||||
| 	len_uchar = ucnv_toUChars(icu_converter, NULL, 0, | ||||
| 							  buff, nbytes, &status); | ||||
| 	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) | ||||
| 		ereport(ERROR, | ||||
| 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); | ||||
| 	len_uchar = uchar_length(icu_converter, buff, nbytes); | ||||
| 
 | ||||
| 	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar)); | ||||
| 
 | ||||
| 	status = U_ZERO_ERROR; | ||||
| 	len_uchar = ucnv_toUChars(icu_converter, *buff_uchar, len_uchar + 1, | ||||
| 							  buff, nbytes, &status); | ||||
| 	if (U_FAILURE(status)) | ||||
| 		ereport(ERROR, | ||||
| 				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); | ||||
| 	len_uchar = uchar_convert(icu_converter, | ||||
| 							  *buff_uchar, len_uchar + 1, buff, nbytes); | ||||
| 
 | ||||
| 	return len_uchar; | ||||
| } | ||||
|  | ||||
| @ -1024,21 +1024,22 @@ hashbpchar(PG_FUNCTION_ARGS) | ||||
| #ifdef USE_ICU | ||||
| 		if (mylocale->provider == COLLPROVIDER_ICU) | ||||
| 		{ | ||||
| 			int32_t		ulen = -1; | ||||
| 			UChar	   *uchar = NULL; | ||||
| 			Size		bsize; | ||||
| 			uint8_t    *buf; | ||||
| 			Size		bsize, rsize; | ||||
| 			char	   *buf; | ||||
| 
 | ||||
| 			ulen = icu_to_uchar(&uchar, keydata, keylen); | ||||
| 			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); | ||||
| 			buf = palloc(bsize + 1); | ||||
| 
 | ||||
| 			bsize = ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 									uchar, ulen, NULL, 0); | ||||
| 			buf = palloc(bsize); | ||||
| 			ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 							uchar, ulen, buf, bsize); | ||||
| 			pfree(uchar); | ||||
| 			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); | ||||
| 			if (rsize != bsize) | ||||
| 				elog(ERROR, "pg_strnxfrm() returned unexpected result"); | ||||
| 
 | ||||
| 			result = hash_any(buf, bsize); | ||||
| 			/*
 | ||||
| 			 * In principle, there's no reason to include the terminating NUL | ||||
| 			 * character in the hash, but it was done before and the behavior | ||||
| 			 * must be preserved. | ||||
| 			 */ | ||||
| 			result = hash_any((uint8_t *) buf, bsize + 1); | ||||
| 
 | ||||
| 			pfree(buf); | ||||
| 		} | ||||
| @ -1086,21 +1087,23 @@ hashbpcharextended(PG_FUNCTION_ARGS) | ||||
| #ifdef USE_ICU | ||||
| 		if (mylocale->provider == COLLPROVIDER_ICU) | ||||
| 		{ | ||||
| 			int32_t		ulen = -1; | ||||
| 			UChar	   *uchar = NULL; | ||||
| 			Size		bsize; | ||||
| 			uint8_t    *buf; | ||||
| 			Size		bsize, rsize; | ||||
| 			char	   *buf; | ||||
| 
 | ||||
| 			ulen = icu_to_uchar(&uchar, keydata, keylen); | ||||
| 			bsize = pg_strnxfrm(NULL, 0, keydata, keylen, mylocale); | ||||
| 			buf = palloc(bsize + 1); | ||||
| 
 | ||||
| 			bsize = ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 									uchar, ulen, NULL, 0); | ||||
| 			buf = palloc(bsize); | ||||
| 			ucol_getSortKey(mylocale->info.icu.ucol, | ||||
| 							uchar, ulen, buf, bsize); | ||||
| 			pfree(uchar); | ||||
| 			rsize = pg_strnxfrm(buf, bsize + 1, keydata, keylen, mylocale); | ||||
| 			if (rsize != bsize) | ||||
| 				elog(ERROR, "pg_strnxfrm() returned unexpected result"); | ||||
| 
 | ||||
| 			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); | ||||
| 			/*
 | ||||
| 			 * In principle, there's no reason to include the terminating NUL | ||||
| 			 * character in the hash, but it was done before and the behavior | ||||
| 			 * must be preserved. | ||||
| 			 */ | ||||
| 			result = hash_any_extended((uint8_t *) buf, bsize + 1, | ||||
| 									   PG_GETARG_INT64(1)); | ||||
| 
 | ||||
| 			pfree(buf); | ||||
| 		} | ||||
|  | ||||
| @ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) | ||||
| 	} | ||||
| 	else | ||||
| 	{ | ||||
| 		char		a1buf[TEXTBUFLEN]; | ||||
| 		char		a2buf[TEXTBUFLEN]; | ||||
| 		char	   *a1p, | ||||
| 				   *a2p; | ||||
| 		pg_locale_t mylocale; | ||||
| 
 | ||||
| 		mylocale = pg_newlocale_from_collation(collid); | ||||
| @ -1573,171 +1569,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) | ||||
| 		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) | ||||
| 			return 0; | ||||
| 
 | ||||
| #ifdef WIN32 | ||||
| 		/* Win32 does not have UTF-8, so we need to map to UTF-16 */ | ||||
| 		if (GetDatabaseEncoding() == PG_UTF8 | ||||
| 			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC)) | ||||
| 		{ | ||||
| 			int			a1len; | ||||
| 			int			a2len; | ||||
| 			int			r; | ||||
| 
 | ||||
| 			if (len1 >= TEXTBUFLEN / 2) | ||||
| 			{ | ||||
| 				a1len = len1 * 2 + 2; | ||||
| 				a1p = palloc(a1len); | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				a1len = TEXTBUFLEN; | ||||
| 				a1p = a1buf; | ||||
| 			} | ||||
| 			if (len2 >= TEXTBUFLEN / 2) | ||||
| 			{ | ||||
| 				a2len = len2 * 2 + 2; | ||||
| 				a2p = palloc(a2len); | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				a2len = TEXTBUFLEN; | ||||
| 				a2p = a2buf; | ||||
| 			} | ||||
| 
 | ||||
| 			/* stupid Microsloth API does not work for zero-length input */ | ||||
| 			if (len1 == 0) | ||||
| 				r = 0; | ||||
| 			else | ||||
| 			{ | ||||
| 				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, | ||||
| 										(LPWSTR) a1p, a1len / 2); | ||||
| 				if (!r) | ||||
| 					ereport(ERROR, | ||||
| 							(errmsg("could not convert string to UTF-16: error code %lu", | ||||
| 									GetLastError()))); | ||||
| 			} | ||||
| 			((LPWSTR) a1p)[r] = 0; | ||||
| 
 | ||||
| 			if (len2 == 0) | ||||
| 				r = 0; | ||||
| 			else | ||||
| 			{ | ||||
| 				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, | ||||
| 										(LPWSTR) a2p, a2len / 2); | ||||
| 				if (!r) | ||||
| 					ereport(ERROR, | ||||
| 							(errmsg("could not convert string to UTF-16: error code %lu", | ||||
| 									GetLastError()))); | ||||
| 			} | ||||
| 			((LPWSTR) a2p)[r] = 0; | ||||
| 
 | ||||
| 			errno = 0; | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 			if (mylocale) | ||||
| 				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt); | ||||
| 			else | ||||
| #endif | ||||
| 				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); | ||||
| 			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
 | ||||
| 										 * headers */ | ||||
| 				ereport(ERROR, | ||||
| 						(errmsg("could not compare Unicode strings: %m"))); | ||||
| 
 | ||||
| 			/* Break tie if necessary. */ | ||||
| 			if (result == 0 && | ||||
| 				(!mylocale || mylocale->deterministic)) | ||||
| 			{ | ||||
| 				result = memcmp(arg1, arg2, Min(len1, len2)); | ||||
| 				if ((result == 0) && (len1 != len2)) | ||||
| 					result = (len1 < len2) ? -1 : 1; | ||||
| 			} | ||||
| 
 | ||||
| 			if (a1p != a1buf) | ||||
| 				pfree(a1p); | ||||
| 			if (a2p != a2buf) | ||||
| 				pfree(a2p); | ||||
| 
 | ||||
| 			return result; | ||||
| 		} | ||||
| #endif							/* WIN32 */ | ||||
| 
 | ||||
| 		if (len1 >= TEXTBUFLEN) | ||||
| 			a1p = (char *) palloc(len1 + 1); | ||||
| 		else | ||||
| 			a1p = a1buf; | ||||
| 		if (len2 >= TEXTBUFLEN) | ||||
| 			a2p = (char *) palloc(len2 + 1); | ||||
| 		else | ||||
| 			a2p = a2buf; | ||||
| 
 | ||||
| 		memcpy(a1p, arg1, len1); | ||||
| 		a1p[len1] = '\0'; | ||||
| 		memcpy(a2p, arg2, len2); | ||||
| 		a2p[len2] = '\0'; | ||||
| 
 | ||||
| 		if (mylocale) | ||||
| 		{ | ||||
| 			if (mylocale->provider == COLLPROVIDER_ICU) | ||||
| 			{ | ||||
| #ifdef USE_ICU | ||||
| #ifdef HAVE_UCOL_STRCOLLUTF8 | ||||
| 				if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 				{ | ||||
| 					UErrorCode	status; | ||||
| 
 | ||||
| 					status = U_ZERO_ERROR; | ||||
| 					result = ucol_strcollUTF8(mylocale->info.icu.ucol, | ||||
| 											  arg1, len1, | ||||
| 											  arg2, len2, | ||||
| 											  &status); | ||||
| 					if (U_FAILURE(status)) | ||||
| 						ereport(ERROR, | ||||
| 								(errmsg("collation failed: %s", u_errorName(status)))); | ||||
| 				} | ||||
| 				else | ||||
| #endif | ||||
| 				{ | ||||
| 					int32_t		ulen1, | ||||
| 								ulen2; | ||||
| 					UChar	   *uchar1, | ||||
| 							   *uchar2; | ||||
| 
 | ||||
| 					ulen1 = icu_to_uchar(&uchar1, arg1, len1); | ||||
| 					ulen2 = icu_to_uchar(&uchar2, arg2, len2); | ||||
| 
 | ||||
| 					result = ucol_strcoll(mylocale->info.icu.ucol, | ||||
| 										  uchar1, ulen1, | ||||
| 										  uchar2, ulen2); | ||||
| 
 | ||||
| 					pfree(uchar1); | ||||
| 					pfree(uchar2); | ||||
| 				} | ||||
| #else							/* not USE_ICU */ | ||||
| 				/* shouldn't happen */ | ||||
| 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider); | ||||
| #endif							/* not USE_ICU */ | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 				result = strcoll_l(a1p, a2p, mylocale->info.lt); | ||||
| #else | ||||
| 				/* shouldn't happen */ | ||||
| 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider); | ||||
| #endif | ||||
| 			} | ||||
| 		} | ||||
| 		else | ||||
| 			result = strcoll(a1p, a2p); | ||||
| 		result = pg_strncoll(arg1, len1, arg2, len2, mylocale); | ||||
| 
 | ||||
| 		/* Break tie if necessary. */ | ||||
| 		if (result == 0 && | ||||
| 			(!mylocale || mylocale->deterministic)) | ||||
| 			result = strcmp(a1p, a2p); | ||||
| 
 | ||||
| 		if (a1p != a1buf) | ||||
| 			pfree(a1p); | ||||
| 		if (a2p != a2buf) | ||||
| 			pfree(a2p); | ||||
| 		{ | ||||
| 			result = memcmp(arg1, arg2, Min(len1, len2)); | ||||
| 			if ((result == 0) && (len1 != len2)) | ||||
| 				result = (len1 < len2) ? -1 : 1; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return result; | ||||
| @ -2073,20 +1914,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) | ||||
| 		 */ | ||||
| 		locale = pg_newlocale_from_collation(collid); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * There is a further exception on Windows.  When the database | ||||
| 		 * encoding is UTF-8 and we are not using the C collation, complex | ||||
| 		 * hacks are required.  We don't currently have a comparator that | ||||
| 		 * handles that case, so we fall back on the slow method of having the | ||||
| 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr | ||||
| 		 * trampoline.  ICU locales work just the same on Windows, however. | ||||
| 		 */ | ||||
| #ifdef WIN32 | ||||
| 		if (GetDatabaseEncoding() == PG_UTF8 && | ||||
| 			!(locale && locale->provider == COLLPROVIDER_ICU)) | ||||
| 			return; | ||||
| #endif | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We use varlenafastcmp_locale except for type NAME. | ||||
| 		 */ | ||||
| @ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Unfortunately, it seems that abbreviation for non-C collations is | ||||
| 	 * broken on many common platforms; testing of multiple versions of glibc | ||||
| 	 * reveals that, for many locales, strcoll() and strxfrm() do not return | ||||
| 	 * consistent results, which is fatal to this optimization.  While no | ||||
| 	 * other libc other than Cygwin has so far been shown to have a problem, | ||||
| 	 * we take the conservative course of action for right now and disable | ||||
| 	 * this categorically.  (Users who are certain this isn't a problem on | ||||
| 	 * their system can define TRUST_STRXFRM.) | ||||
| 	 * broken on many common platforms; see pg_strxfrm_enabled(). | ||||
| 	 * | ||||
| 	 * Even apart from the risk of broken locales, it's possible that there | ||||
| 	 * are platforms where the use of abbreviated keys should be disabled at | ||||
| @ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) | ||||
| 	 * categorically, we may still want or need to disable it for particular | ||||
| 	 * platforms. | ||||
| 	 */ | ||||
| #ifndef TRUST_STRXFRM | ||||
| 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) | ||||
| 	if (!collate_c && !pg_strxfrm_enabled(locale)) | ||||
| 		abbreviate = false; | ||||
| #endif | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we're using abbreviated keys, or if we're using a locale-aware | ||||
| @ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) | ||||
| 		return sss->last_returned; | ||||
| 	} | ||||
| 
 | ||||
| 	if (sss->locale) | ||||
| 	{ | ||||
| 		if (sss->locale->provider == COLLPROVIDER_ICU) | ||||
| 		{ | ||||
| #ifdef USE_ICU | ||||
| #ifdef HAVE_UCOL_STRCOLLUTF8 | ||||
| 			if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 			{ | ||||
| 				UErrorCode	status; | ||||
| 
 | ||||
| 				status = U_ZERO_ERROR; | ||||
| 				result = ucol_strcollUTF8(sss->locale->info.icu.ucol, | ||||
| 										  a1p, len1, | ||||
| 										  a2p, len2, | ||||
| 										  &status); | ||||
| 				if (U_FAILURE(status)) | ||||
| 					ereport(ERROR, | ||||
| 							(errmsg("collation failed: %s", u_errorName(status)))); | ||||
| 			} | ||||
| 			else | ||||
| #endif | ||||
| 			{ | ||||
| 				int32_t		ulen1, | ||||
| 							ulen2; | ||||
| 				UChar	   *uchar1, | ||||
| 						   *uchar2; | ||||
| 
 | ||||
| 				ulen1 = icu_to_uchar(&uchar1, a1p, len1); | ||||
| 				ulen2 = icu_to_uchar(&uchar2, a2p, len2); | ||||
| 
 | ||||
| 				result = ucol_strcoll(sss->locale->info.icu.ucol, | ||||
| 									  uchar1, ulen1, | ||||
| 									  uchar2, ulen2); | ||||
| 
 | ||||
| 				pfree(uchar1); | ||||
| 				pfree(uchar2); | ||||
| 			} | ||||
| #else							/* not USE_ICU */ | ||||
| 			/* shouldn't happen */ | ||||
| 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); | ||||
| #endif							/* not USE_ICU */ | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); | ||||
| #else | ||||
| 			/* shouldn't happen */ | ||||
| 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); | ||||
| #endif | ||||
| 		} | ||||
| 	} | ||||
| 	else | ||||
| 		result = strcoll(sss->buf1, sss->buf2); | ||||
| 	result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); | ||||
| 
 | ||||
| 	/* Break tie if necessary. */ | ||||
| 	if (result == 0 && | ||||
| @ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) | ||||
| static Datum | ||||
| varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| { | ||||
| 	const size_t max_prefix_bytes = sizeof(Datum); | ||||
| 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; | ||||
| 	VarString  *authoritative = DatumGetVarStringPP(original); | ||||
| 	char	   *authoritative_data = VARDATA_ANY(authoritative); | ||||
| @ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| 
 | ||||
| 	pres = (char *) &res; | ||||
| 	/* memset(), so any non-overwritten bytes are NUL */ | ||||
| 	memset(pres, 0, sizeof(Datum)); | ||||
| 	memset(pres, 0, max_prefix_bytes); | ||||
| 	len = VARSIZE_ANY_EXHDR(authoritative); | ||||
| 
 | ||||
| 	/* Get number of bytes, ignoring trailing spaces */ | ||||
| @ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| 	 * thing: explicitly consider string length. | ||||
| 	 */ | ||||
| 	if (sss->collate_c) | ||||
| 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); | ||||
| 		memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); | ||||
| 	else | ||||
| 	{ | ||||
| 		Size		bsize; | ||||
| #ifdef USE_ICU | ||||
| 		int32_t		ulen = -1; | ||||
| 		UChar	   *uchar = NULL; | ||||
| #endif | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We're not using the C collation, so fall back on strxfrm or ICU | ||||
| @ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| 		if (sss->last_len1 == len && sss->cache_blob && | ||||
| 			memcmp(sss->buf1, authoritative_data, len) == 0) | ||||
| 		{ | ||||
| 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); | ||||
| 			memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2)); | ||||
| 			/* No change affecting cardinality, so no hashing required */ | ||||
| 			goto done; | ||||
| 		} | ||||
| @ -2551,81 +2314,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| 		memcpy(sss->buf1, authoritative_data, len); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not | ||||
| 		 * necessary for ICU, but doesn't hurt. | ||||
| 		 * pg_strxfrm() and pg_strxfrm_prefix() expect NUL-terminated | ||||
| 		 * strings. | ||||
| 		 */ | ||||
| 		sss->buf1[len] = '\0'; | ||||
| 		sss->last_len1 = len; | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| 		/* When using ICU and not UTF8, convert string to UChar. */ | ||||
| 		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && | ||||
| 			GetDatabaseEncoding() != PG_UTF8) | ||||
| 			ulen = icu_to_uchar(&uchar, sss->buf1, len); | ||||
| #endif | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, | ||||
| 		 * and try again.  Both of these functions have the result buffer | ||||
| 		 * content undefined if the result did not fit, so we need to retry | ||||
| 		 * until everything fits, even though we only need the first few bytes | ||||
| 		 * in the end.  When using ucol_nextSortKeyPart(), however, we only | ||||
| 		 * ask for as many bytes as we actually need. | ||||
| 		 */ | ||||
| 		for (;;) | ||||
| 		if (pg_strxfrm_prefix_enabled(sss->locale)) | ||||
| 		{ | ||||
| #ifdef USE_ICU | ||||
| 			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) | ||||
| 			if (sss->buflen2 < max_prefix_bytes) | ||||
| 			{ | ||||
| 				/*
 | ||||
| 				 * When using UTF8, use the iteration interface so we only | ||||
| 				 * need to produce as many bytes as we actually need. | ||||
| 				 */ | ||||
| 				if (GetDatabaseEncoding() == PG_UTF8) | ||||
| 				{ | ||||
| 					UCharIterator iter; | ||||
| 					uint32_t	state[2]; | ||||
| 					UErrorCode	status; | ||||
| 
 | ||||
| 					uiter_setUTF8(&iter, sss->buf1, len); | ||||
| 					state[0] = state[1] = 0;	/* won't need that again */ | ||||
| 					status = U_ZERO_ERROR; | ||||
| 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, | ||||
| 												 &iter, | ||||
| 												 state, | ||||
| 												 (uint8_t *) sss->buf2, | ||||
| 												 Min(sizeof(Datum), sss->buflen2), | ||||
| 												 &status); | ||||
| 					if (U_FAILURE(status)) | ||||
| 						ereport(ERROR, | ||||
| 								(errmsg("sort key generation failed: %s", | ||||
| 										u_errorName(status)))); | ||||
| 				} | ||||
| 				else | ||||
| 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol, | ||||
| 											uchar, ulen, | ||||
| 											(uint8_t *) sss->buf2, sss->buflen2); | ||||
| 				sss->buflen2 = Max(max_prefix_bytes, | ||||
| 								   Min(sss->buflen2 * 2, MaxAllocSize)); | ||||
| 				sss->buf2 = repalloc(sss->buf2, sss->buflen2); | ||||
| 			} | ||||
| 			else | ||||
| #endif | ||||
| #ifdef HAVE_LOCALE_T | ||||
| 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) | ||||
| 				bsize = strxfrm_l(sss->buf2, sss->buf1, | ||||
| 								  sss->buflen2, sss->locale->info.lt); | ||||
| 			else | ||||
| #endif | ||||
| 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); | ||||
| 
 | ||||
| 			sss->last_len2 = bsize; | ||||
| 			if (bsize < sss->buflen2) | ||||
| 				break; | ||||
| 
 | ||||
| 			bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1, | ||||
| 									  max_prefix_bytes, sss->locale); | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| 			/*
 | ||||
| 			 * Grow buffer and retry. | ||||
| 			 * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try | ||||
| 			 * again.  The pg_strxfrm() function leaves the result buffer | ||||
| 			 * content undefined if the result did not fit, so we need to | ||||
| 			 * retry until everything fits, even though we only need the first | ||||
| 			 * few bytes in the end. | ||||
| 			 */ | ||||
| 			sss->buflen2 = Max(bsize + 1, | ||||
| 							   Min(sss->buflen2 * 2, MaxAllocSize)); | ||||
| 			sss->buf2 = repalloc(sss->buf2, sss->buflen2); | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2, | ||||
| 								   sss->locale); | ||||
| 
 | ||||
| 				sss->last_len2 = bsize; | ||||
| 				if (bsize < sss->buflen2) | ||||
| 					break; | ||||
| 
 | ||||
| 				/*
 | ||||
| 				 * Grow buffer and retry. | ||||
| 				 */ | ||||
| 				sss->buflen2 = Max(bsize + 1, | ||||
| 								   Min(sss->buflen2 * 2, MaxAllocSize)); | ||||
| 				sss->buf2 = repalloc(sss->buf2, sss->buflen2); | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| @ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) | ||||
| 		 * (Actually, even if there were NUL bytes in the blob it would be | ||||
| 		 * okay.  See remarks on bytea case above.) | ||||
| 		 */ | ||||
| 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| 		if (uchar) | ||||
| 			pfree(uchar); | ||||
| #endif | ||||
| 		memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
|  | ||||
| @ -100,6 +100,19 @@ extern void make_icu_collator(const char *iculocstr, | ||||
| extern pg_locale_t pg_newlocale_from_collation(Oid collid); | ||||
| 
 | ||||
| extern char *get_collation_actual_version(char collprovider, const char *collcollate); | ||||
| extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); | ||||
| extern int pg_strncoll(const char *arg1, size_t len1, | ||||
| 					   const char *arg2, size_t len2, pg_locale_t locale); | ||||
| extern bool pg_strxfrm_enabled(pg_locale_t locale); | ||||
| extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize, | ||||
| 						 pg_locale_t locale); | ||||
| extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, | ||||
| 						  size_t srclen, pg_locale_t locale); | ||||
| extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale); | ||||
| extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, | ||||
| 								pg_locale_t locale); | ||||
| extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, | ||||
| 								 size_t srclen, pg_locale_t locale); | ||||
| 
 | ||||
| #ifdef USE_ICU | ||||
| extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user