mirror of
https://github.com/postgres/postgres.git
synced 2025-05-15 00:02:24 -04:00
Allow Unicode escapes in any server encoding, not only UTF-8.
SQL includes provisions for numeric Unicode escapes in string literals and identifiers. Previously we only accepted those if they represented ASCII characters or the server encoding was UTF-8, making the conversion to internal form trivial. This patch adjusts things so that we'll call the appropriate encoding conversion function in less-trivial cases, allowing the escape sequence to be accepted so long as it corresponds to some character available in the server encoding.

This also applies to processing of Unicode escapes in JSONB. However, the old restriction still applies to client-side JSON processing, since that hasn't got access to the server's encoding conversion infrastructure.

This patch includes some lexer infrastructure that simplifies throwing errors with error cursors pointing into the middle of a string (or other complex token). For the moment I only used it for errors relating to Unicode escapes, but we might later expand the usage to some other cases.

Patch by me, reviewed by John Naylor.

Discussion: https://postgr.es/m/2393.1578958316@sss.pgh.pa.us
This commit is contained in:
parent
fe30e7ebfa
commit
a6525588b7
@ -61,8 +61,8 @@
|
|||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
<productname>PostgreSQL</productname> allows only one character set
|
RFC 7159 specifies that JSON strings should be encoded in UTF8.
|
||||||
encoding per database. It is therefore not possible for the JSON
|
It is therefore not possible for the JSON
|
||||||
types to conform rigidly to the JSON specification unless the database
|
types to conform rigidly to the JSON specification unless the database
|
||||||
encoding is UTF8. Attempts to directly include characters that
|
encoding is UTF8. Attempts to directly include characters that
|
||||||
cannot be represented in the database encoding will fail; conversely,
|
cannot be represented in the database encoding will fail; conversely,
|
||||||
@ -77,13 +77,13 @@
|
|||||||
regardless of the database encoding, and are checked only for syntactic
|
regardless of the database encoding, and are checked only for syntactic
|
||||||
correctness (that is, that four hex digits follow <literal>\u</literal>).
|
correctness (that is, that four hex digits follow <literal>\u</literal>).
|
||||||
However, the input function for <type>jsonb</type> is stricter: it disallows
|
However, the input function for <type>jsonb</type> is stricter: it disallows
|
||||||
Unicode escapes for non-ASCII characters (those above <literal>U+007F</literal>)
|
Unicode escapes for characters that cannot be represented in the database
|
||||||
unless the database encoding is UTF8. The <type>jsonb</type> type also
|
encoding. The <type>jsonb</type> type also
|
||||||
rejects <literal>\u0000</literal> (because that cannot be represented in
|
rejects <literal>\u0000</literal> (because that cannot be represented in
|
||||||
<productname>PostgreSQL</productname>'s <type>text</type> type), and it insists
|
<productname>PostgreSQL</productname>'s <type>text</type> type), and it insists
|
||||||
that any use of Unicode surrogate pairs to designate characters outside
|
that any use of Unicode surrogate pairs to designate characters outside
|
||||||
the Unicode Basic Multilingual Plane be correct. Valid Unicode escapes
|
the Unicode Basic Multilingual Plane be correct. Valid Unicode escapes
|
||||||
are converted to the equivalent ASCII or UTF8 character for storage;
|
are converted to the equivalent single character for storage;
|
||||||
this includes folding surrogate pairs into a single character.
|
this includes folding surrogate pairs into a single character.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
@ -96,9 +96,8 @@
|
|||||||
not <type>jsonb</type>. The fact that the <type>json</type> input function does
|
not <type>jsonb</type>. The fact that the <type>json</type> input function does
|
||||||
not make these checks may be considered a historical artifact, although
|
not make these checks may be considered a historical artifact, although
|
||||||
it does allow for simple storage (without processing) of JSON Unicode
|
it does allow for simple storage (without processing) of JSON Unicode
|
||||||
escapes in a non-UTF8 database encoding. In general, it is best to
|
escapes in a database encoding that does not support the represented
|
||||||
avoid mixing Unicode escapes in JSON with a non-UTF8 database encoding,
|
characters.
|
||||||
if possible.
|
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
|
|
||||||
@ -144,8 +143,8 @@
|
|||||||
<row>
|
<row>
|
||||||
<entry><type>string</type></entry>
|
<entry><type>string</type></entry>
|
||||||
<entry><type>text</type></entry>
|
<entry><type>text</type></entry>
|
||||||
<entry><literal>\u0000</literal> is disallowed, as are non-ASCII Unicode
|
<entry><literal>\u0000</literal> is disallowed, as are Unicode escapes
|
||||||
escapes if database encoding is not UTF8</entry>
|
representing characters not available in the database encoding</entry>
|
||||||
</row>
|
</row>
|
||||||
<row>
|
<row>
|
||||||
<entry><type>number</type></entry>
|
<entry><type>number</type></entry>
|
||||||
|
@ -189,6 +189,23 @@ UPDATE "my_table" SET "a" = 5;
|
|||||||
ampersands. The length limitation still applies.
|
ampersands. The length limitation still applies.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Quoting an identifier also makes it case-sensitive, whereas
|
||||||
|
unquoted names are always folded to lower case. For example, the
|
||||||
|
identifiers <literal>FOO</literal>, <literal>foo</literal>, and
|
||||||
|
<literal>"foo"</literal> are considered the same by
|
||||||
|
<productname>PostgreSQL</productname>, but
|
||||||
|
<literal>"Foo"</literal> and <literal>"FOO"</literal> are
|
||||||
|
different from these three and each other. (The folding of
|
||||||
|
unquoted names to lower case in <productname>PostgreSQL</productname> is
|
||||||
|
incompatible with the SQL standard, which says that unquoted names
|
||||||
|
should be folded to upper case. Thus, <literal>foo</literal>
|
||||||
|
should be equivalent to <literal>"FOO"</literal> not
|
||||||
|
<literal>"foo"</literal> according to the standard. If you want
|
||||||
|
to write portable applications you are advised to always quote a
|
||||||
|
particular name or never quote it.)
|
||||||
|
</para>
|
||||||
|
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>Unicode escape</primary>
|
<primary>Unicode escape</primary>
|
||||||
<secondary>in identifiers</secondary>
|
<secondary>in identifiers</secondary>
|
||||||
@ -230,7 +247,8 @@ U&"d!0061t!+000061" UESCAPE '!'
|
|||||||
The escape character can be any single character other than a
|
The escape character can be any single character other than a
|
||||||
hexadecimal digit, the plus sign, a single quote, a double quote,
|
hexadecimal digit, the plus sign, a single quote, a double quote,
|
||||||
or a whitespace character. Note that the escape character is
|
or a whitespace character. Note that the escape character is
|
||||||
written in single quotes, not double quotes.
|
written in single quotes, not double quotes,
|
||||||
|
after <literal>UESCAPE</literal>.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@ -239,32 +257,18 @@ U&"d!0061t!+000061" UESCAPE '!'
|
|||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The Unicode escape syntax works only when the server encoding is
|
Either the 4-digit or the 6-digit escape form can be used to
|
||||||
<literal>UTF8</literal>. When other server encodings are used, only code
|
|
||||||
points in the ASCII range (up to <literal>\007F</literal>) can be
|
|
||||||
specified. Both the 4-digit and the 6-digit form can be used to
|
|
||||||
specify UTF-16 surrogate pairs to compose characters with code
|
specify UTF-16 surrogate pairs to compose characters with code
|
||||||
points larger than U+FFFF, although the availability of the
|
points larger than U+FFFF, although the availability of the
|
||||||
6-digit form technically makes this unnecessary. (Surrogate
|
6-digit form technically makes this unnecessary. (Surrogate
|
||||||
pairs are not stored directly, but combined into a single
|
pairs are not stored directly, but are combined into a single
|
||||||
code point that is then encoded in UTF-8.)
|
code point.)
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
Quoting an identifier also makes it case-sensitive, whereas
|
If the server encoding is not UTF-8, the Unicode code point identified
|
||||||
unquoted names are always folded to lower case. For example, the
|
by one of these escape sequences is converted to the actual server
|
||||||
identifiers <literal>FOO</literal>, <literal>foo</literal>, and
|
encoding; an error is reported if that's not possible.
|
||||||
<literal>"foo"</literal> are considered the same by
|
|
||||||
<productname>PostgreSQL</productname>, but
|
|
||||||
<literal>"Foo"</literal> and <literal>"FOO"</literal> are
|
|
||||||
different from these three and each other. (The folding of
|
|
||||||
unquoted names to lower case in <productname>PostgreSQL</productname> is
|
|
||||||
incompatible with the SQL standard, which says that unquoted names
|
|
||||||
should be folded to upper case. Thus, <literal>foo</literal>
|
|
||||||
should be equivalent to <literal>"FOO"</literal> not
|
|
||||||
<literal>"foo"</literal> according to the standard. If you want
|
|
||||||
to write portable applications you are advised to always quote a
|
|
||||||
particular name or never quote it.)
|
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|
||||||
@ -427,25 +431,11 @@ SELECT 'foo' 'bar';
|
|||||||
<para>
|
<para>
|
||||||
It is your responsibility to ensure that the byte sequences you create,
|
It is your responsibility to ensure that the byte sequences you create,
|
||||||
especially when using the octal or hexadecimal escapes, compose
|
especially when using the octal or hexadecimal escapes, compose
|
||||||
valid characters in the server character set encoding. When the
|
valid characters in the server character set encoding.
|
||||||
server encoding is UTF-8, then the Unicode escapes or the
|
A useful alternative is to use Unicode escapes or the
|
||||||
alternative Unicode escape syntax, explained
|
alternative Unicode escape syntax, explained
|
||||||
in <xref linkend="sql-syntax-strings-uescape"/>, should be used
|
in <xref linkend="sql-syntax-strings-uescape"/>; then the server
|
||||||
instead. (The alternative would be doing the UTF-8 encoding by
|
will check that the character conversion is possible.
|
||||||
hand and writing out the bytes, which would be very cumbersome.)
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
The Unicode escape syntax works fully only when the server
|
|
||||||
encoding is <literal>UTF8</literal>. When other server encodings are
|
|
||||||
used, only code points in the ASCII range (up
|
|
||||||
to <literal>\u007F</literal>) can be specified. Both the 4-digit and
|
|
||||||
the 8-digit form can be used to specify UTF-16 surrogate pairs to
|
|
||||||
compose characters with code points larger than U+FFFF, although
|
|
||||||
the availability of the 8-digit form technically makes this
|
|
||||||
unnecessary. (When surrogate pairs are used when the server
|
|
||||||
encoding is <literal>UTF8</literal>, they are first combined into a
|
|
||||||
single code point that is then encoded in UTF-8.)
|
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<caution>
|
<caution>
|
||||||
@ -524,16 +514,23 @@ U&'d!0061t!+000061' UESCAPE '!'
|
|||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The Unicode escape syntax works only when the server encoding is
|
To include the escape character in the string literally, write
|
||||||
<literal>UTF8</literal>. When other server encodings are used, only
|
it twice.
|
||||||
code points in the ASCII range (up to <literal>\007F</literal>)
|
</para>
|
||||||
can be specified. Both the 4-digit and the 6-digit form can be
|
|
||||||
used to specify UTF-16 surrogate pairs to compose characters with
|
<para>
|
||||||
code points larger than U+FFFF, although the availability of the
|
Either the 4-digit or the 6-digit escape form can be used to
|
||||||
6-digit form technically makes this unnecessary. (When surrogate
|
specify UTF-16 surrogate pairs to compose characters with code
|
||||||
pairs are used when the server encoding is <literal>UTF8</literal>, they
|
points larger than U+FFFF, although the availability of the
|
||||||
are first combined into a single code point that is then encoded
|
6-digit form technically makes this unnecessary. (Surrogate
|
||||||
in UTF-8.)
|
pairs are not stored directly, but are combined into a single
|
||||||
|
code point.)
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
If the server encoding is not UTF-8, the Unicode code point identified
|
||||||
|
by one of these escape sequences is converted to the actual server
|
||||||
|
encoding; an error is reported if that's not possible.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@ -546,11 +543,6 @@ U&'d!0061t!+000061' UESCAPE '!'
|
|||||||
parameter is set to off, this syntax will be rejected with an
|
parameter is set to off, this syntax will be rejected with an
|
||||||
error message.
|
error message.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
|
||||||
To include the escape character in the string literally, write it
|
|
||||||
twice.
|
|
||||||
</para>
|
|
||||||
</sect3>
|
</sect3>
|
||||||
|
|
||||||
<sect3 id="sql-syntax-dollar-quoting">
|
<sect3 id="sql-syntax-dollar-quoting">
|
||||||
|
@ -292,22 +292,14 @@ hexval(unsigned char c)
|
|||||||
return 0; /* not reached */
|
return 0; /* not reached */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* is Unicode code point acceptable in database's encoding? */
|
/* is Unicode code point acceptable? */
|
||||||
static void
|
static void
|
||||||
check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
|
check_unicode_value(pg_wchar c)
|
||||||
{
|
{
|
||||||
/* See also addunicode() in scan.l */
|
if (!is_valid_unicode_codepoint(c))
|
||||||
if (c == 0 || c > 0x10FFFF)
|
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||||
errmsg("invalid Unicode escape value"),
|
errmsg("invalid Unicode escape value")));
|
||||||
scanner_errposition(pos, yyscanner)));
|
|
||||||
|
|
||||||
if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
||||||
errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
|
|
||||||
scanner_errposition(pos, yyscanner)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
|
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
|
||||||
@ -338,20 +330,39 @@ str_udeescape(const char *str, char escape,
|
|||||||
const char *in;
|
const char *in;
|
||||||
char *new,
|
char *new,
|
||||||
*out;
|
*out;
|
||||||
|
size_t new_len;
|
||||||
pg_wchar pair_first = 0;
|
pg_wchar pair_first = 0;
|
||||||
|
ScannerCallbackState scbstate;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This relies on the subtle assumption that a UTF-8 expansion cannot be
|
* Guesstimate that result will be no longer than input, but allow enough
|
||||||
* longer than its escaped representation.
|
* padding for Unicode conversion.
|
||||||
*/
|
*/
|
||||||
new = palloc(strlen(str) + 1);
|
new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
|
||||||
|
new = palloc(new_len);
|
||||||
|
|
||||||
in = str;
|
in = str;
|
||||||
out = new;
|
out = new;
|
||||||
while (*in)
|
while (*in)
|
||||||
{
|
{
|
||||||
|
/* Enlarge string if needed */
|
||||||
|
size_t out_dist = out - new;
|
||||||
|
|
||||||
|
if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
|
||||||
|
{
|
||||||
|
new_len *= 2;
|
||||||
|
new = repalloc(new, new_len);
|
||||||
|
out = new + out_dist;
|
||||||
|
}
|
||||||
|
|
||||||
if (in[0] == escape)
|
if (in[0] == escape)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Any errors reported while processing this escape sequence will
|
||||||
|
* have an error cursor pointing at the escape.
|
||||||
|
*/
|
||||||
|
setup_scanner_errposition_callback(&scbstate, yyscanner,
|
||||||
|
in - str + position + 3); /* 3 for U&" */
|
||||||
if (in[1] == escape)
|
if (in[1] == escape)
|
||||||
{
|
{
|
||||||
if (pair_first)
|
if (pair_first)
|
||||||
@ -370,9 +381,7 @@ str_udeescape(const char *str, char escape,
|
|||||||
(hexval(in[2]) << 8) +
|
(hexval(in[2]) << 8) +
|
||||||
(hexval(in[3]) << 4) +
|
(hexval(in[3]) << 4) +
|
||||||
hexval(in[4]);
|
hexval(in[4]);
|
||||||
check_unicode_value(unicode,
|
check_unicode_value(unicode);
|
||||||
in - str + position + 3, /* 3 for U&" */
|
|
||||||
yyscanner);
|
|
||||||
if (pair_first)
|
if (pair_first)
|
||||||
{
|
{
|
||||||
if (is_utf16_surrogate_second(unicode))
|
if (is_utf16_surrogate_second(unicode))
|
||||||
@ -390,8 +399,8 @@ str_udeescape(const char *str, char escape,
|
|||||||
pair_first = unicode;
|
pair_first = unicode;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
unicode_to_utf8(unicode, (unsigned char *) out);
|
pg_unicode_to_server(unicode, (unsigned char *) out);
|
||||||
out += pg_mblen(out);
|
out += strlen(out);
|
||||||
}
|
}
|
||||||
in += 5;
|
in += 5;
|
||||||
}
|
}
|
||||||
@ -411,9 +420,7 @@ str_udeescape(const char *str, char escape,
|
|||||||
(hexval(in[5]) << 8) +
|
(hexval(in[5]) << 8) +
|
||||||
(hexval(in[6]) << 4) +
|
(hexval(in[6]) << 4) +
|
||||||
hexval(in[7]);
|
hexval(in[7]);
|
||||||
check_unicode_value(unicode,
|
check_unicode_value(unicode);
|
||||||
in - str + position + 3, /* 3 for U&" */
|
|
||||||
yyscanner);
|
|
||||||
if (pair_first)
|
if (pair_first)
|
||||||
{
|
{
|
||||||
if (is_utf16_surrogate_second(unicode))
|
if (is_utf16_surrogate_second(unicode))
|
||||||
@ -431,17 +438,18 @@ str_udeescape(const char *str, char escape,
|
|||||||
pair_first = unicode;
|
pair_first = unicode;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
unicode_to_utf8(unicode, (unsigned char *) out);
|
pg_unicode_to_server(unicode, (unsigned char *) out);
|
||||||
out += pg_mblen(out);
|
out += strlen(out);
|
||||||
}
|
}
|
||||||
in += 8;
|
in += 8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||||
errmsg("invalid Unicode escape value"),
|
errmsg("invalid Unicode escape"),
|
||||||
scanner_errposition(in - str + position + 3, /* 3 for U&" */
|
errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
|
||||||
yyscanner)));
|
|
||||||
|
cancel_scanner_errposition_callback(&scbstate);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -457,15 +465,13 @@ str_udeescape(const char *str, char escape,
|
|||||||
goto invalid_pair;
|
goto invalid_pair;
|
||||||
|
|
||||||
*out = '\0';
|
*out = '\0';
|
||||||
|
|
||||||
/*
|
|
||||||
* We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
|
|
||||||
* codes; but it's probably not worth the trouble, since this isn't likely
|
|
||||||
* to be a performance-critical path.
|
|
||||||
*/
|
|
||||||
pg_verifymbstr(new, out - new, false);
|
|
||||||
return new;
|
return new;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We might get here with the error callback active, or not. Call
|
||||||
|
* scanner_errposition to make sure an error cursor appears; if the
|
||||||
|
* callback is active, this is duplicative but harmless.
|
||||||
|
*/
|
||||||
invalid_pair:
|
invalid_pair:
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||||
|
@ -106,6 +106,18 @@ const uint16 ScanKeywordTokens[] = {
|
|||||||
*/
|
*/
|
||||||
#define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
|
#define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sometimes, we do want yylloc to point into the middle of a token; this is
|
||||||
|
* useful for instance to throw an error about an escape sequence within a
|
||||||
|
* string literal. But if we find no error there, we want to revert yylloc
|
||||||
|
* to the token start, so that that's the location reported to the parser.
|
||||||
|
* Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
|
||||||
|
* (Currently the implied "stack" is just one location, but someday we might
|
||||||
|
* need to nest these.)
|
||||||
|
*/
|
||||||
|
#define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
|
||||||
|
#define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
|
||||||
|
|
||||||
#define startlit() ( yyextra->literallen = 0 )
|
#define startlit() ( yyextra->literallen = 0 )
|
||||||
static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
|
static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
|
||||||
static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
|
static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
|
||||||
@ -605,8 +617,18 @@ other .
|
|||||||
<xe>{xeunicode} {
|
<xe>{xeunicode} {
|
||||||
pg_wchar c = strtoul(yytext + 2, NULL, 16);
|
pg_wchar c = strtoul(yytext + 2, NULL, 16);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For consistency with other productions, issue any
|
||||||
|
* escape warning with cursor pointing to start of string.
|
||||||
|
* We might want to change that, someday.
|
||||||
|
*/
|
||||||
check_escape_warning(yyscanner);
|
check_escape_warning(yyscanner);
|
||||||
|
|
||||||
|
/* Remember start of overall string token ... */
|
||||||
|
PUSH_YYLLOC();
|
||||||
|
/* ... and set the error cursor to point at this esc seq */
|
||||||
|
SET_YYLLOC();
|
||||||
|
|
||||||
if (is_utf16_surrogate_first(c))
|
if (is_utf16_surrogate_first(c))
|
||||||
{
|
{
|
||||||
yyextra->utf16_first_part = c;
|
yyextra->utf16_first_part = c;
|
||||||
@ -616,10 +638,18 @@ other .
|
|||||||
yyerror("invalid Unicode surrogate pair");
|
yyerror("invalid Unicode surrogate pair");
|
||||||
else
|
else
|
||||||
addunicode(c, yyscanner);
|
addunicode(c, yyscanner);
|
||||||
|
|
||||||
|
/* Restore yylloc to be start of string token */
|
||||||
|
POP_YYLLOC();
|
||||||
}
|
}
|
||||||
<xeu>{xeunicode} {
|
<xeu>{xeunicode} {
|
||||||
pg_wchar c = strtoul(yytext + 2, NULL, 16);
|
pg_wchar c = strtoul(yytext + 2, NULL, 16);
|
||||||
|
|
||||||
|
/* Remember start of overall string token ... */
|
||||||
|
PUSH_YYLLOC();
|
||||||
|
/* ... and set the error cursor to point at this esc seq */
|
||||||
|
SET_YYLLOC();
|
||||||
|
|
||||||
if (!is_utf16_surrogate_second(c))
|
if (!is_utf16_surrogate_second(c))
|
||||||
yyerror("invalid Unicode surrogate pair");
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
|
||||||
@ -627,12 +657,21 @@ other .
|
|||||||
|
|
||||||
addunicode(c, yyscanner);
|
addunicode(c, yyscanner);
|
||||||
|
|
||||||
|
/* Restore yylloc to be start of string token */
|
||||||
|
POP_YYLLOC();
|
||||||
|
|
||||||
BEGIN(xe);
|
BEGIN(xe);
|
||||||
}
|
}
|
||||||
<xeu>. { yyerror("invalid Unicode surrogate pair"); }
|
<xeu>. |
|
||||||
<xeu>\n { yyerror("invalid Unicode surrogate pair"); }
|
<xeu>\n |
|
||||||
<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
|
<xeu><<EOF>> {
|
||||||
|
/* Set the error cursor to point at missing esc seq */
|
||||||
|
SET_YYLLOC();
|
||||||
|
yyerror("invalid Unicode surrogate pair");
|
||||||
|
}
|
||||||
<xe,xeu>{xeunicodefail} {
|
<xe,xeu>{xeunicodefail} {
|
||||||
|
/* Set the error cursor to point at malformed esc seq */
|
||||||
|
SET_YYLLOC();
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
|
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
|
||||||
errmsg("invalid Unicode escape"),
|
errmsg("invalid Unicode escape"),
|
||||||
@ -1029,12 +1068,13 @@ other .
|
|||||||
* scanner_errposition
|
* scanner_errposition
|
||||||
* Report a lexer or grammar error cursor position, if possible.
|
* Report a lexer or grammar error cursor position, if possible.
|
||||||
*
|
*
|
||||||
* This is expected to be used within an ereport() call. The return value
|
* This is expected to be used within an ereport() call, or via an error
|
||||||
|
* callback such as setup_scanner_errposition_callback(). The return value
|
||||||
* is a dummy (always 0, in fact).
|
* is a dummy (always 0, in fact).
|
||||||
*
|
*
|
||||||
* Note that this can only be used for messages emitted during raw parsing
|
* Note that this can only be used for messages emitted during raw parsing
|
||||||
* (essentially, scan.l and gram.y), since it requires the yyscanner struct
|
* (essentially, scan.l, parser.c, and gram.y), since it requires the
|
||||||
* to still be available.
|
* yyscanner struct to still be available.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
scanner_errposition(int location, core_yyscan_t yyscanner)
|
scanner_errposition(int location, core_yyscan_t yyscanner)
|
||||||
@ -1050,6 +1090,62 @@ scanner_errposition(int location, core_yyscan_t yyscanner)
|
|||||||
return errposition(pos);
|
return errposition(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Error context callback for inserting scanner error location.
|
||||||
|
*
|
||||||
|
* Note that this will be called for *any* error occurring while the
|
||||||
|
* callback is installed. We avoid inserting an irrelevant error location
|
||||||
|
* if the error is a query cancel --- are there any other important cases?
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
scb_error_callback(void *arg)
|
||||||
|
{
|
||||||
|
ScannerCallbackState *scbstate = (ScannerCallbackState *) arg;
|
||||||
|
|
||||||
|
if (geterrcode() != ERRCODE_QUERY_CANCELED)
|
||||||
|
(void) scanner_errposition(scbstate->location, scbstate->yyscanner);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* setup_scanner_errposition_callback
|
||||||
|
* Arrange for non-scanner errors to report an error position
|
||||||
|
*
|
||||||
|
* Sometimes the scanner calls functions that aren't part of the scanner
|
||||||
|
* subsystem and can't reasonably be passed the yyscanner pointer; yet
|
||||||
|
* we would like any errors thrown in those functions to be tagged with an
|
||||||
|
* error location. Use this function to set up an error context stack
|
||||||
|
* entry that will accomplish that. Usage pattern:
|
||||||
|
*
|
||||||
|
* declare a local variable "ScannerCallbackState scbstate"
|
||||||
|
* ...
|
||||||
|
* setup_scanner_errposition_callback(&scbstate, yyscanner, location);
|
||||||
|
* call function that might throw error;
|
||||||
|
* cancel_scanner_errposition_callback(&scbstate);
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
|
||||||
|
core_yyscan_t yyscanner,
|
||||||
|
int location)
|
||||||
|
{
|
||||||
|
/* Setup error traceback support for ereport() */
|
||||||
|
scbstate->yyscanner = yyscanner;
|
||||||
|
scbstate->location = location;
|
||||||
|
scbstate->errcallback.callback = scb_error_callback;
|
||||||
|
scbstate->errcallback.arg = (void *) scbstate;
|
||||||
|
scbstate->errcallback.previous = error_context_stack;
|
||||||
|
error_context_stack = &scbstate->errcallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Cancel a previously-set-up errposition callback.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
|
||||||
|
{
|
||||||
|
/* Pop the error context stack */
|
||||||
|
error_context_stack = scbstate->errcallback.previous;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* scanner_yyerror
|
* scanner_yyerror
|
||||||
* Report a lexer or grammar error.
|
* Report a lexer or grammar error.
|
||||||
@ -1226,19 +1322,20 @@ process_integer_literal(const char *token, YYSTYPE *lval)
|
|||||||
static void
|
static void
|
||||||
addunicode(pg_wchar c, core_yyscan_t yyscanner)
|
addunicode(pg_wchar c, core_yyscan_t yyscanner)
|
||||||
{
|
{
|
||||||
char buf[8];
|
ScannerCallbackState scbstate;
|
||||||
|
char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||||
|
|
||||||
/* See also check_unicode_value() in parser.c */
|
if (!is_valid_unicode_codepoint(c))
|
||||||
if (c == 0 || c > 0x10FFFF)
|
|
||||||
yyerror("invalid Unicode escape value");
|
yyerror("invalid Unicode escape value");
|
||||||
if (c > 0x7F)
|
|
||||||
{
|
/*
|
||||||
if (GetDatabaseEncoding() != PG_UTF8)
|
* We expect that pg_unicode_to_server() will complain about any
|
||||||
yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
|
* unconvertible code point, so we don't have to set saw_non_ascii.
|
||||||
yyextra->saw_non_ascii = true;
|
*/
|
||||||
}
|
setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
|
||||||
unicode_to_utf8(c, (unsigned char *) buf);
|
pg_unicode_to_server(c, (unsigned char *) buf);
|
||||||
addlit(buf, pg_mblen(buf), yyscanner);
|
cancel_scanner_errposition_callback(&scbstate);
|
||||||
|
addlit(buf, strlen(buf), yyscanner);
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned char
|
static unsigned char
|
||||||
|
@ -486,13 +486,6 @@ hexval(char c)
|
|||||||
static void
|
static void
|
||||||
addUnicodeChar(int ch)
|
addUnicodeChar(int ch)
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
* For UTF8, replace the escape sequence by the actual
|
|
||||||
* utf8 character in lex->strval. Do this also for other
|
|
||||||
* encodings if the escape designates an ASCII character,
|
|
||||||
* otherwise raise an error.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (ch == 0)
|
if (ch == 0)
|
||||||
{
|
{
|
||||||
/* We can't allow this, since our TEXT type doesn't */
|
/* We can't allow this, since our TEXT type doesn't */
|
||||||
@ -501,40 +494,20 @@ addUnicodeChar(int ch)
|
|||||||
errmsg("unsupported Unicode escape sequence"),
|
errmsg("unsupported Unicode escape sequence"),
|
||||||
errdetail("\\u0000 cannot be converted to text.")));
|
errdetail("\\u0000 cannot be converted to text.")));
|
||||||
}
|
}
|
||||||
else if (GetDatabaseEncoding() == PG_UTF8)
|
|
||||||
{
|
|
||||||
char utf8str[5];
|
|
||||||
int utf8len;
|
|
||||||
|
|
||||||
unicode_to_utf8(ch, (unsigned char *) utf8str);
|
|
||||||
utf8len = pg_utf_mblen((unsigned char *) utf8str);
|
|
||||||
addstring(false, utf8str, utf8len);
|
|
||||||
}
|
|
||||||
else if (ch <= 0x007f)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* This is the only way to designate things like a
|
|
||||||
* form feed character in JSON, so it's useful in all
|
|
||||||
* encodings.
|
|
||||||
*/
|
|
||||||
addchar(false, (char) ch);
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ereport(ERROR,
|
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||||
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
|
|
||||||
errmsg("invalid input syntax for type %s", "jsonpath"),
|
pg_unicode_to_server(ch, (unsigned char *) cbuf);
|
||||||
errdetail("Unicode escape values cannot be used for code "
|
addstring(false, cbuf, strlen(cbuf));
|
||||||
"point values above 007F when the server encoding "
|
|
||||||
"is not UTF8.")));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Add unicode character and process its hi surrogate */
|
/* Add Unicode character, processing any surrogate pairs */
|
||||||
static void
|
static void
|
||||||
addUnicode(int ch, int *hi_surrogate)
|
addUnicode(int ch, int *hi_surrogate)
|
||||||
{
|
{
|
||||||
if (ch >= 0xd800 && ch <= 0xdbff)
|
if (is_utf16_surrogate_first(ch))
|
||||||
{
|
{
|
||||||
if (*hi_surrogate != -1)
|
if (*hi_surrogate != -1)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
@ -542,10 +515,10 @@ addUnicode(int ch, int *hi_surrogate)
|
|||||||
errmsg("invalid input syntax for type %s", "jsonpath"),
|
errmsg("invalid input syntax for type %s", "jsonpath"),
|
||||||
errdetail("Unicode high surrogate must not follow "
|
errdetail("Unicode high surrogate must not follow "
|
||||||
"a high surrogate.")));
|
"a high surrogate.")));
|
||||||
*hi_surrogate = (ch & 0x3ff) << 10;
|
*hi_surrogate = ch;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if (ch >= 0xdc00 && ch <= 0xdfff)
|
else if (is_utf16_surrogate_second(ch))
|
||||||
{
|
{
|
||||||
if (*hi_surrogate == -1)
|
if (*hi_surrogate == -1)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
@ -553,7 +526,7 @@ addUnicode(int ch, int *hi_surrogate)
|
|||||||
errmsg("invalid input syntax for type %s", "jsonpath"),
|
errmsg("invalid input syntax for type %s", "jsonpath"),
|
||||||
errdetail("Unicode low surrogate must follow a high "
|
errdetail("Unicode low surrogate must follow a high "
|
||||||
"surrogate.")));
|
"surrogate.")));
|
||||||
ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
|
ch = surrogate_pair_to_codepoint(*hi_surrogate, ch);
|
||||||
*hi_surrogate = -1;
|
*hi_surrogate = -1;
|
||||||
}
|
}
|
||||||
else if (*hi_surrogate != -1)
|
else if (*hi_surrogate != -1)
|
||||||
|
@ -2085,26 +2085,6 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Map a Unicode codepoint into the current server encoding.
|
|
||||||
*/
|
|
||||||
static char *
|
|
||||||
unicode_to_sqlchar(pg_wchar c)
|
|
||||||
{
|
|
||||||
char utf8string[8]; /* need room for trailing zero */
|
|
||||||
char *result;
|
|
||||||
|
|
||||||
memset(utf8string, 0, sizeof(utf8string));
|
|
||||||
unicode_to_utf8(c, (unsigned char *) utf8string);
|
|
||||||
|
|
||||||
result = pg_any_to_server(utf8string, strlen(utf8string), PG_UTF8);
|
|
||||||
/* if pg_any_to_server didn't strdup, we must */
|
|
||||||
if (result == utf8string)
|
|
||||||
result = pstrdup(result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Map XML name to SQL identifier; see SQL/XML:2008 section 9.3.
|
* Map XML name to SQL identifier; see SQL/XML:2008 section 9.3.
|
||||||
*/
|
*/
|
||||||
@ -2125,10 +2105,12 @@ map_xml_name_to_sql_identifier(const char *name)
|
|||||||
&& isxdigit((unsigned char) *(p + 5))
|
&& isxdigit((unsigned char) *(p + 5))
|
||||||
&& *(p + 6) == '_')
|
&& *(p + 6) == '_')
|
||||||
{
|
{
|
||||||
|
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||||
unsigned int u;
|
unsigned int u;
|
||||||
|
|
||||||
sscanf(p + 2, "%X", &u);
|
sscanf(p + 2, "%X", &u);
|
||||||
appendStringInfoString(&buf, unicode_to_sqlchar(u));
|
pg_unicode_to_server(u, (unsigned char *) cbuf);
|
||||||
|
appendStringInfoString(&buf, cbuf);
|
||||||
p += 6;
|
p += 6;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -67,6 +67,13 @@ static List *ConvProcList = NIL; /* List of ConvProcInfo */
|
|||||||
static FmgrInfo *ToServerConvProc = NULL;
|
static FmgrInfo *ToServerConvProc = NULL;
|
||||||
static FmgrInfo *ToClientConvProc = NULL;
|
static FmgrInfo *ToClientConvProc = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This variable stores the conversion function to convert from UTF-8
|
||||||
|
* to the server encoding. It's NULL if the server encoding *is* UTF-8,
|
||||||
|
* or if we lack a conversion function for this.
|
||||||
|
*/
|
||||||
|
static FmgrInfo *Utf8ToServerConvProc = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These variables track the currently-selected encodings.
|
* These variables track the currently-selected encodings.
|
||||||
*/
|
*/
|
||||||
@ -273,6 +280,8 @@ SetClientEncoding(int encoding)
|
|||||||
void
|
void
|
||||||
InitializeClientEncoding(void)
|
InitializeClientEncoding(void)
|
||||||
{
|
{
|
||||||
|
int current_server_encoding;
|
||||||
|
|
||||||
Assert(!backend_startup_complete);
|
Assert(!backend_startup_complete);
|
||||||
backend_startup_complete = true;
|
backend_startup_complete = true;
|
||||||
|
|
||||||
@ -289,6 +298,35 @@ InitializeClientEncoding(void)
|
|||||||
pg_enc2name_tbl[pending_client_encoding].name,
|
pg_enc2name_tbl[pending_client_encoding].name,
|
||||||
GetDatabaseEncodingName())));
|
GetDatabaseEncodingName())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Also look up the UTF8-to-server conversion function if needed. Since
|
||||||
|
* the server encoding is fixed within any one backend process, we don't
|
||||||
|
* have to do this more than once.
|
||||||
|
*/
|
||||||
|
current_server_encoding = GetDatabaseEncoding();
|
||||||
|
if (current_server_encoding != PG_UTF8 &&
|
||||||
|
current_server_encoding != PG_SQL_ASCII)
|
||||||
|
{
|
||||||
|
Oid utf8_to_server_proc;
|
||||||
|
|
||||||
|
Assert(IsTransactionState());
|
||||||
|
utf8_to_server_proc =
|
||||||
|
FindDefaultConversionProc(PG_UTF8,
|
||||||
|
current_server_encoding);
|
||||||
|
/* If there's no such conversion, just leave the pointer as NULL */
|
||||||
|
if (OidIsValid(utf8_to_server_proc))
|
||||||
|
{
|
||||||
|
FmgrInfo *finfo;
|
||||||
|
|
||||||
|
finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
|
||||||
|
sizeof(FmgrInfo));
|
||||||
|
fmgr_info_cxt(utf8_to_server_proc, finfo,
|
||||||
|
TopMemoryContext);
|
||||||
|
/* Set Utf8ToServerConvProc only after data is fully valid */
|
||||||
|
Utf8ToServerConvProc = finfo;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -752,6 +790,73 @@ perform_default_encoding_conversion(const char *src, int len,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Convert a single Unicode code point into a string in the server encoding.
|
||||||
|
*
|
||||||
|
* The code point given by "c" is converted and stored at *s, which must
|
||||||
|
* have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
|
||||||
|
* The output will have a trailing '\0'. Throws error if the conversion
|
||||||
|
* cannot be performed.
|
||||||
|
*
|
||||||
|
* Note that this relies on having previously looked up any required
|
||||||
|
* conversion function. That's partly for speed but mostly because the parser
|
||||||
|
* may call this outside any transaction, or in an aborted transaction.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pg_unicode_to_server(pg_wchar c, unsigned char *s)
|
||||||
|
{
|
||||||
|
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
|
||||||
|
int c_as_utf8_len;
|
||||||
|
int server_encoding;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Complain if invalid Unicode code point. The choice of errcode here is
|
||||||
|
* debatable, but really our caller should have checked this anyway.
|
||||||
|
*/
|
||||||
|
if (!is_valid_unicode_codepoint(c))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||||
|
errmsg("invalid Unicode code point")));
|
||||||
|
|
||||||
|
/* Otherwise, if it's in ASCII range, conversion is trivial */
|
||||||
|
if (c <= 0x7F)
|
||||||
|
{
|
||||||
|
s[0] = (unsigned char) c;
|
||||||
|
s[1] = '\0';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the server encoding is UTF-8, we just need to reformat the code */
|
||||||
|
server_encoding = GetDatabaseEncoding();
|
||||||
|
if (server_encoding == PG_UTF8)
|
||||||
|
{
|
||||||
|
unicode_to_utf8(c, s);
|
||||||
|
s[pg_utf_mblen(s)] = '\0';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For all other cases, we must have a conversion function available */
|
||||||
|
if (Utf8ToServerConvProc == NULL)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||||
|
errmsg("conversion between %s and %s is not supported",
|
||||||
|
pg_enc2name_tbl[PG_UTF8].name,
|
||||||
|
GetDatabaseEncodingName())));
|
||||||
|
|
||||||
|
/* Construct UTF-8 source string */
|
||||||
|
unicode_to_utf8(c, c_as_utf8);
|
||||||
|
c_as_utf8_len = pg_utf_mblen(c_as_utf8);
|
||||||
|
c_as_utf8[c_as_utf8_len] = '\0';
|
||||||
|
|
||||||
|
/* Convert, or throw error if we can't */
|
||||||
|
FunctionCall5(Utf8ToServerConvProc,
|
||||||
|
Int32GetDatum(PG_UTF8),
|
||||||
|
Int32GetDatum(server_encoding),
|
||||||
|
CStringGetDatum(c_as_utf8),
|
||||||
|
CStringGetDatum(s),
|
||||||
|
Int32GetDatum(c_as_utf8_len));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* convert a multibyte string to a wchar */
|
/* convert a multibyte string to a wchar */
|
||||||
int
|
int
|
||||||
|
@ -744,21 +744,21 @@ json_lex_string(JsonLexContext *lex)
|
|||||||
}
|
}
|
||||||
if (lex->strval != NULL)
|
if (lex->strval != NULL)
|
||||||
{
|
{
|
||||||
char utf8str[5];
|
/*
|
||||||
int utf8len;
|
* Combine surrogate pairs.
|
||||||
|
*/
|
||||||
if (ch >= 0xd800 && ch <= 0xdbff)
|
if (is_utf16_surrogate_first(ch))
|
||||||
{
|
{
|
||||||
if (hi_surrogate != -1)
|
if (hi_surrogate != -1)
|
||||||
return JSON_UNICODE_HIGH_SURROGATE;
|
return JSON_UNICODE_HIGH_SURROGATE;
|
||||||
hi_surrogate = (ch & 0x3ff) << 10;
|
hi_surrogate = ch;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else if (ch >= 0xdc00 && ch <= 0xdfff)
|
else if (is_utf16_surrogate_second(ch))
|
||||||
{
|
{
|
||||||
if (hi_surrogate == -1)
|
if (hi_surrogate == -1)
|
||||||
return JSON_UNICODE_LOW_SURROGATE;
|
return JSON_UNICODE_LOW_SURROGATE;
|
||||||
ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
|
ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
|
||||||
hi_surrogate = -1;
|
hi_surrogate = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -766,35 +766,52 @@ json_lex_string(JsonLexContext *lex)
|
|||||||
return JSON_UNICODE_LOW_SURROGATE;
|
return JSON_UNICODE_LOW_SURROGATE;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For UTF8, replace the escape sequence by the actual
|
* Reject invalid cases. We can't have a value above
|
||||||
* utf8 character in lex->strval. Do this also for other
|
* 0xFFFF here (since we only accepted 4 hex digits
|
||||||
* encodings if the escape designates an ASCII character,
|
* above), so no need to test for out-of-range chars.
|
||||||
* otherwise raise an error.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (ch == 0)
|
if (ch == 0)
|
||||||
{
|
{
|
||||||
/* We can't allow this, since our TEXT type doesn't */
|
/* We can't allow this, since our TEXT type doesn't */
|
||||||
return JSON_UNICODE_CODE_POINT_ZERO;
|
return JSON_UNICODE_CODE_POINT_ZERO;
|
||||||
}
|
}
|
||||||
else if (lex->input_encoding == PG_UTF8)
|
|
||||||
|
/*
|
||||||
|
* Add the represented character to lex->strval. In the
|
||||||
|
* backend, we can let pg_unicode_to_server() handle any
|
||||||
|
* required character set conversion; in frontend, we can
|
||||||
|
* only deal with trivial conversions.
|
||||||
|
*
|
||||||
|
* Note: pg_unicode_to_server() will throw an error for a
|
||||||
|
* conversion failure, rather than returning a failure
|
||||||
|
* indication. That seems OK.
|
||||||
|
*/
|
||||||
|
#ifndef FRONTEND
|
||||||
{
|
{
|
||||||
|
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
|
||||||
|
|
||||||
|
pg_unicode_to_server(ch, (unsigned char *) cbuf);
|
||||||
|
appendStringInfoString(lex->strval, cbuf);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (lex->input_encoding == PG_UTF8)
|
||||||
|
{
|
||||||
|
/* OK, we can map the code point to UTF8 easily */
|
||||||
|
char utf8str[5];
|
||||||
|
int utf8len;
|
||||||
|
|
||||||
unicode_to_utf8(ch, (unsigned char *) utf8str);
|
unicode_to_utf8(ch, (unsigned char *) utf8str);
|
||||||
utf8len = pg_utf_mblen((unsigned char *) utf8str);
|
utf8len = pg_utf_mblen((unsigned char *) utf8str);
|
||||||
appendBinaryStringInfo(lex->strval, utf8str, utf8len);
|
appendBinaryStringInfo(lex->strval, utf8str, utf8len);
|
||||||
}
|
}
|
||||||
else if (ch <= 0x007f)
|
else if (ch <= 0x007f)
|
||||||
{
|
{
|
||||||
/*
|
/* The ASCII range is the same in all encodings */
|
||||||
* This is the only way to designate things like a
|
|
||||||
* form feed character in JSON, so it's useful in all
|
|
||||||
* encodings.
|
|
||||||
*/
|
|
||||||
appendStringInfoChar(lex->strval, (char) ch);
|
appendStringInfoChar(lex->strval, (char) ch);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return JSON_UNICODE_HIGH_ESCAPE;
|
return JSON_UNICODE_HIGH_ESCAPE;
|
||||||
|
#endif /* FRONTEND */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (lex->strval != NULL)
|
else if (lex->strval != NULL)
|
||||||
@ -1083,7 +1100,8 @@ json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
|
|||||||
case JSON_UNICODE_ESCAPE_FORMAT:
|
case JSON_UNICODE_ESCAPE_FORMAT:
|
||||||
return _("\"\\u\" must be followed by four hexadecimal digits.");
|
return _("\"\\u\" must be followed by four hexadecimal digits.");
|
||||||
case JSON_UNICODE_HIGH_ESCAPE:
|
case JSON_UNICODE_HIGH_ESCAPE:
|
||||||
return _("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.");
|
/* note: this case is only reachable in frontend not backend */
|
||||||
|
return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
|
||||||
case JSON_UNICODE_HIGH_SURROGATE:
|
case JSON_UNICODE_HIGH_SURROGATE:
|
||||||
return _("Unicode high surrogate must not follow a high surrogate.");
|
return _("Unicode high surrogate must not follow a high surrogate.");
|
||||||
case JSON_UNICODE_LOW_SURROGATE:
|
case JSON_UNICODE_LOW_SURROGATE:
|
||||||
|
@ -315,6 +315,15 @@ typedef enum pg_enc
|
|||||||
*/
|
*/
|
||||||
#define MAX_CONVERSION_GROWTH 4
|
#define MAX_CONVERSION_GROWTH 4
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Maximum byte length of the string equivalent to any one Unicode code point,
|
||||||
|
* in any backend encoding. The current value assumes that a 4-byte UTF-8
|
||||||
|
* character might expand by MAX_CONVERSION_GROWTH, which is a huge
|
||||||
|
* overestimate. But in current usage we don't allocate large multiples of
|
||||||
|
* this, so there's little point in being stingy.
|
||||||
|
*/
|
||||||
|
#define MAX_UNICODE_EQUIVALENT_STRING 16
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Table for mapping an encoding number to official encoding name and
|
* Table for mapping an encoding number to official encoding name and
|
||||||
* possibly other subsidiary data. Be careful to check encoding number
|
* possibly other subsidiary data. Be careful to check encoding number
|
||||||
@ -505,6 +514,12 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
|
|||||||
/*
|
/*
|
||||||
* Some handy functions for Unicode-specific tests.
|
* Some handy functions for Unicode-specific tests.
|
||||||
*/
|
*/
|
||||||
|
static inline bool
|
||||||
|
is_valid_unicode_codepoint(pg_wchar c)
|
||||||
|
{
|
||||||
|
return (c > 0 && c <= 0x10FFFF);
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool
|
static inline bool
|
||||||
is_utf16_surrogate_first(pg_wchar c)
|
is_utf16_surrogate_first(pg_wchar c)
|
||||||
{
|
{
|
||||||
@ -603,6 +618,8 @@ extern char *pg_server_to_client(const char *s, int len);
|
|||||||
extern char *pg_any_to_server(const char *s, int len, int encoding);
|
extern char *pg_any_to_server(const char *s, int len, int encoding);
|
||||||
extern char *pg_server_to_any(const char *s, int len, int encoding);
|
extern char *pg_server_to_any(const char *s, int len, int encoding);
|
||||||
|
|
||||||
|
extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
|
||||||
|
|
||||||
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
|
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
|
||||||
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
|
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
|
||||||
|
|
||||||
|
@ -99,9 +99,13 @@ typedef struct core_yy_extra_type
|
|||||||
int literallen; /* actual current string length */
|
int literallen; /* actual current string length */
|
||||||
int literalalloc; /* current allocated buffer size */
|
int literalalloc; /* current allocated buffer size */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Random assorted scanner state.
|
||||||
|
*/
|
||||||
int state_before_str_stop; /* start cond. before end quote */
|
int state_before_str_stop; /* start cond. before end quote */
|
||||||
int xcdepth; /* depth of nesting in slash-star comments */
|
int xcdepth; /* depth of nesting in slash-star comments */
|
||||||
char *dolqstart; /* current $foo$ quote start string */
|
char *dolqstart; /* current $foo$ quote start string */
|
||||||
|
YYLTYPE save_yylloc; /* one-element stack for PUSH_YYLLOC() */
|
||||||
|
|
||||||
/* first part of UTF16 surrogate pair for Unicode escapes */
|
/* first part of UTF16 surrogate pair for Unicode escapes */
|
||||||
int32 utf16_first_part;
|
int32 utf16_first_part;
|
||||||
@ -116,6 +120,14 @@ typedef struct core_yy_extra_type
|
|||||||
*/
|
*/
|
||||||
typedef void *core_yyscan_t;
|
typedef void *core_yyscan_t;
|
||||||
|
|
||||||
|
/* Support for scanner_errposition_callback function */
|
||||||
|
typedef struct ScannerCallbackState
|
||||||
|
{
|
||||||
|
core_yyscan_t yyscanner;
|
||||||
|
int location;
|
||||||
|
ErrorContextCallback errcallback;
|
||||||
|
} ScannerCallbackState;
|
||||||
|
|
||||||
|
|
||||||
/* Constant data exported from parser/scan.l */
|
/* Constant data exported from parser/scan.l */
|
||||||
extern PGDLLIMPORT const uint16 ScanKeywordTokens[];
|
extern PGDLLIMPORT const uint16 ScanKeywordTokens[];
|
||||||
@ -129,6 +141,10 @@ extern void scanner_finish(core_yyscan_t yyscanner);
|
|||||||
extern int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp,
|
extern int core_yylex(core_YYSTYPE *lvalp, YYLTYPE *llocp,
|
||||||
core_yyscan_t yyscanner);
|
core_yyscan_t yyscanner);
|
||||||
extern int scanner_errposition(int location, core_yyscan_t yyscanner);
|
extern int scanner_errposition(int location, core_yyscan_t yyscanner);
|
||||||
|
extern void setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
|
||||||
|
core_yyscan_t yyscanner,
|
||||||
|
int location);
|
||||||
|
extern void cancel_scanner_errposition_callback(ScannerCallbackState *scbstate);
|
||||||
extern void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn();
|
extern void scanner_yyerror(const char *message, core_yyscan_t yyscanner) pg_attribute_noreturn();
|
||||||
|
|
||||||
#endif /* SCANNER_H */
|
#endif /* SCANNER_H */
|
||||||
|
@ -1,4 +1,19 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for json and jsonb
|
-- encoding-sensitive tests for json and jsonb
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (json_encoding.out)
|
||||||
|
-- and for SQL_ASCII (json_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
getdatabaseencoding
|
||||||
|
---------------------
|
||||||
|
UTF8
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- first json
|
-- first json
|
||||||
-- basic unicode input
|
-- basic unicode input
|
||||||
SELECT '"\u"'::json; -- ERROR, incomplete escape
|
SELECT '"\u"'::json; -- ERROR, incomplete escape
|
||||||
|
@ -1,4 +1,19 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for json and jsonb
|
-- encoding-sensitive tests for json and jsonb
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (json_encoding.out)
|
||||||
|
-- and for SQL_ASCII (json_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
getdatabaseencoding
|
||||||
|
---------------------
|
||||||
|
SQL_ASCII
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- first json
|
-- first json
|
||||||
-- basic unicode input
|
-- basic unicode input
|
||||||
SELECT '"\u"'::json; -- ERROR, incomplete escape
|
SELECT '"\u"'::json; -- ERROR, incomplete escape
|
||||||
@ -33,9 +48,7 @@ SELECT '"\uaBcD"'::json; -- OK, uppercase and lower case both OK
|
|||||||
|
|
||||||
-- handling of unicode surrogate pairs
|
-- handling of unicode surrogate pairs
|
||||||
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
|
select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: { "a":...
|
|
||||||
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
|
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
|
||||||
ERROR: invalid input syntax for type json
|
ERROR: invalid input syntax for type json
|
||||||
DETAIL: Unicode high surrogate must not follow a high surrogate.
|
DETAIL: Unicode high surrogate must not follow a high surrogate.
|
||||||
@ -84,9 +97,7 @@ select json '{ "a": "null \\u0000 escape" }' as not_an_escape;
|
|||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
|
select json '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: { "a":...
|
|
||||||
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
|
select json '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
|
||||||
correct_everywhere
|
correct_everywhere
|
||||||
--------------------
|
--------------------
|
||||||
@ -144,18 +155,14 @@ CONTEXT: JSON data, line 1: ...
|
|||||||
-- use octet_length here so we don't get an odd unicode char in the
|
-- use octet_length here so we don't get an odd unicode char in the
|
||||||
-- output
|
-- output
|
||||||
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
|
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
|
LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: ...
|
|
||||||
-- handling of unicode surrogate pairs
|
-- handling of unicode surrogate pairs
|
||||||
SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
|
SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc3...
|
LINE 1: SELECT octet_length((jsonb '{ "a": "\ud83d\ude04\ud83d\udc3...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: { "a":...
|
|
||||||
SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
|
SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
|
||||||
ERROR: invalid input syntax for type json
|
ERROR: invalid input syntax for type json
|
||||||
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
|
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
|
||||||
@ -182,11 +189,9 @@ DETAIL: Unicode low surrogate must follow a high surrogate.
|
|||||||
CONTEXT: JSON data, line 1: { "a":...
|
CONTEXT: JSON data, line 1: { "a":...
|
||||||
-- handling of simple unicode escapes
|
-- handling of simple unicode escapes
|
||||||
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
|
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as corr...
|
LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as corr...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: { "a":...
|
|
||||||
SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere;
|
SELECT jsonb '{ "a": "dollar \u0024 character" }' as correct_everywhere;
|
||||||
correct_everywhere
|
correct_everywhere
|
||||||
-----------------------------
|
-----------------------------
|
||||||
@ -212,11 +217,9 @@ SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
|
|||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
|
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
|
||||||
ERROR: unsupported Unicode escape sequence
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a'...
|
LINE 1: SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' ->> 'a'...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
CONTEXT: JSON data, line 1: { "a":...
|
|
||||||
SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
|
SELECT jsonb '{ "a": "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
|
||||||
correct_everywhere
|
correct_everywhere
|
||||||
--------------------
|
--------------------
|
||||||
|
9
src/test/regress/expected/json_encoding_2.out
Normal file
9
src/test/regress/expected/json_encoding_2.out
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
--
|
||||||
|
-- encoding-sensitive tests for json and jsonb
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (json_encoding.out)
|
||||||
|
-- and for SQL_ASCII (json_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
@ -1,4 +1,19 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for jsonpath
|
-- encoding-sensitive tests for jsonpath
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (jsonpath_encoding.out)
|
||||||
|
-- and for SQL_ASCII (jsonpath_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
getdatabaseencoding
|
||||||
|
---------------------
|
||||||
|
UTF8
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- checks for double-quoted values
|
-- checks for double-quoted values
|
||||||
-- basic unicode input
|
-- basic unicode input
|
||||||
SELECT '"\u"'::jsonpath; -- ERROR, incomplete escape
|
SELECT '"\u"'::jsonpath; -- ERROR, incomplete escape
|
||||||
|
@ -1,4 +1,19 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for jsonpath
|
-- encoding-sensitive tests for jsonpath
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (jsonpath_encoding.out)
|
||||||
|
-- and for SQL_ASCII (jsonpath_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
getdatabaseencoding
|
||||||
|
---------------------
|
||||||
|
SQL_ASCII
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- checks for double-quoted values
|
-- checks for double-quoted values
|
||||||
-- basic unicode input
|
-- basic unicode input
|
||||||
SELECT '"\u"'::jsonpath; -- ERROR, incomplete escape
|
SELECT '"\u"'::jsonpath; -- ERROR, incomplete escape
|
||||||
@ -19,16 +34,14 @@ LINE 1: SELECT '"\u0000"'::jsonpath;
|
|||||||
^
|
^
|
||||||
DETAIL: \u0000 cannot be converted to text.
|
DETAIL: \u0000 cannot be converted to text.
|
||||||
SELECT '"\uaBcD"'::jsonpath; -- OK, uppercase and lower case both OK
|
SELECT '"\uaBcD"'::jsonpath; -- OK, uppercase and lower case both OK
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT '"\uaBcD"'::jsonpath;
|
LINE 1: SELECT '"\uaBcD"'::jsonpath;
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
-- handling of unicode surrogate pairs
|
-- handling of unicode surrogate pairs
|
||||||
select '"\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_utf8;
|
select '"\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_utf8;
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: select '"\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_...
|
LINE 1: select '"\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
select '"\ud83d\ud83d"'::jsonpath; -- 2 high surrogates in a row
|
select '"\ud83d\ud83d"'::jsonpath; -- 2 high surrogates in a row
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: invalid input syntax for type jsonpath
|
||||||
LINE 1: select '"\ud83d\ud83d"'::jsonpath;
|
LINE 1: select '"\ud83d\ud83d"'::jsonpath;
|
||||||
@ -51,10 +64,9 @@ LINE 1: select '"\ude04X"'::jsonpath;
|
|||||||
DETAIL: Unicode low surrogate must follow a high surrogate.
|
DETAIL: Unicode low surrogate must follow a high surrogate.
|
||||||
--handling of simple unicode escapes
|
--handling of simple unicode escapes
|
||||||
select '"the Copyright \u00a9 sign"'::jsonpath as correct_in_utf8;
|
select '"the Copyright \u00a9 sign"'::jsonpath as correct_in_utf8;
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: select '"the Copyright \u00a9 sign"'::jsonpath as correct_in...
|
LINE 1: select '"the Copyright \u00a9 sign"'::jsonpath as correct_in...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
select '"dollar \u0024 character"'::jsonpath as correct_everywhere;
|
select '"dollar \u0024 character"'::jsonpath as correct_everywhere;
|
||||||
correct_everywhere
|
correct_everywhere
|
||||||
----------------------
|
----------------------
|
||||||
@ -98,16 +110,14 @@ LINE 1: SELECT '$."\u0000"'::jsonpath;
|
|||||||
^
|
^
|
||||||
DETAIL: \u0000 cannot be converted to text.
|
DETAIL: \u0000 cannot be converted to text.
|
||||||
SELECT '$."\uaBcD"'::jsonpath; -- OK, uppercase and lower case both OK
|
SELECT '$."\uaBcD"'::jsonpath; -- OK, uppercase and lower case both OK
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: SELECT '$."\uaBcD"'::jsonpath;
|
LINE 1: SELECT '$."\uaBcD"'::jsonpath;
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
-- handling of unicode surrogate pairs
|
-- handling of unicode surrogate pairs
|
||||||
select '$."\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_utf8;
|
select '$."\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_in_utf8;
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: select '$."\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_i...
|
LINE 1: select '$."\ud83d\ude04\ud83d\udc36"'::jsonpath as correct_i...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
select '$."\ud83d\ud83d"'::jsonpath; -- 2 high surrogates in a row
|
select '$."\ud83d\ud83d"'::jsonpath; -- 2 high surrogates in a row
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: invalid input syntax for type jsonpath
|
||||||
LINE 1: select '$."\ud83d\ud83d"'::jsonpath;
|
LINE 1: select '$."\ud83d\ud83d"'::jsonpath;
|
||||||
@ -130,10 +140,9 @@ LINE 1: select '$."\ude04X"'::jsonpath;
|
|||||||
DETAIL: Unicode low surrogate must follow a high surrogate.
|
DETAIL: Unicode low surrogate must follow a high surrogate.
|
||||||
--handling of simple unicode escapes
|
--handling of simple unicode escapes
|
||||||
select '$."the Copyright \u00a9 sign"'::jsonpath as correct_in_utf8;
|
select '$."the Copyright \u00a9 sign"'::jsonpath as correct_in_utf8;
|
||||||
ERROR: invalid input syntax for type jsonpath
|
ERROR: conversion between UTF8 and SQL_ASCII is not supported
|
||||||
LINE 1: select '$."the Copyright \u00a9 sign"'::jsonpath as correct_...
|
LINE 1: select '$."the Copyright \u00a9 sign"'::jsonpath as correct_...
|
||||||
^
|
^
|
||||||
DETAIL: Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
|
|
||||||
select '$."dollar \u0024 character"'::jsonpath as correct_everywhere;
|
select '$."dollar \u0024 character"'::jsonpath as correct_everywhere;
|
||||||
correct_everywhere
|
correct_everywhere
|
||||||
------------------------
|
------------------------
|
||||||
|
9
src/test/regress/expected/jsonpath_encoding_2.out
Normal file
9
src/test/regress/expected/jsonpath_encoding_2.out
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
--
|
||||||
|
-- encoding-sensitive tests for jsonpath
|
||||||
|
--
|
||||||
|
-- We provide expected-results files for UTF8 (jsonpath_encoding.out)
|
||||||
|
-- and for SQL_ASCII (jsonpath_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
@ -35,6 +35,12 @@ SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
|
|||||||
dat\+000061
|
dat\+000061
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT U&'a\\b' AS "a\b";
|
||||||
|
a\b
|
||||||
|
-----
|
||||||
|
a\b
|
||||||
|
(1 row)
|
||||||
|
|
||||||
SELECT U&' \' UESCAPE '!' AS "tricky";
|
SELECT U&' \' UESCAPE '!' AS "tricky";
|
||||||
tricky
|
tricky
|
||||||
--------
|
--------
|
||||||
@ -48,13 +54,15 @@ SELECT 'tricky' AS U&"\" UESCAPE '!';
|
|||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
SELECT U&'wrong: \061';
|
SELECT U&'wrong: \061';
|
||||||
ERROR: invalid Unicode escape value
|
ERROR: invalid Unicode escape
|
||||||
LINE 1: SELECT U&'wrong: \061';
|
LINE 1: SELECT U&'wrong: \061';
|
||||||
^
|
^
|
||||||
|
HINT: Unicode escapes must be \XXXX or \+XXXXXX.
|
||||||
SELECT U&'wrong: \+0061';
|
SELECT U&'wrong: \+0061';
|
||||||
ERROR: invalid Unicode escape value
|
ERROR: invalid Unicode escape
|
||||||
LINE 1: SELECT U&'wrong: \+0061';
|
LINE 1: SELECT U&'wrong: \+0061';
|
||||||
^
|
^
|
||||||
|
HINT: Unicode escapes must be \XXXX or \+XXXXXX.
|
||||||
SELECT U&'wrong: +0061' UESCAPE +;
|
SELECT U&'wrong: +0061' UESCAPE +;
|
||||||
ERROR: UESCAPE must be followed by a simple string literal at or near "+"
|
ERROR: UESCAPE must be followed by a simple string literal at or near "+"
|
||||||
LINE 1: SELECT U&'wrong: +0061' UESCAPE +;
|
LINE 1: SELECT U&'wrong: +0061' UESCAPE +;
|
||||||
@ -63,6 +71,77 @@ SELECT U&'wrong: +0061' UESCAPE '+';
|
|||||||
ERROR: invalid Unicode escape character at or near "'+'"
|
ERROR: invalid Unicode escape character at or near "'+'"
|
||||||
LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
|
LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
|
||||||
^
|
^
|
||||||
|
SELECT U&'wrong: \db99';
|
||||||
|
ERROR: invalid Unicode surrogate pair
|
||||||
|
LINE 1: SELECT U&'wrong: \db99';
|
||||||
|
^
|
||||||
|
SELECT U&'wrong: \db99xy';
|
||||||
|
ERROR: invalid Unicode surrogate pair
|
||||||
|
LINE 1: SELECT U&'wrong: \db99xy';
|
||||||
|
^
|
||||||
|
SELECT U&'wrong: \db99\\';
|
||||||
|
ERROR: invalid Unicode surrogate pair
|
||||||
|
LINE 1: SELECT U&'wrong: \db99\\';
|
||||||
|
^
|
||||||
|
SELECT U&'wrong: \db99\0061';
|
||||||
|
ERROR: invalid Unicode surrogate pair
|
||||||
|
LINE 1: SELECT U&'wrong: \db99\0061';
|
||||||
|
^
|
||||||
|
SELECT U&'wrong: \+00db99\+000061';
|
||||||
|
ERROR: invalid Unicode surrogate pair
|
||||||
|
LINE 1: SELECT U&'wrong: \+00db99\+000061';
|
||||||
|
^
|
||||||
|
SELECT U&'wrong: \+2FFFFF';
|
||||||
|
ERROR: invalid Unicode escape value
|
||||||
|
LINE 1: SELECT U&'wrong: \+2FFFFF';
|
||||||
|
^
|
||||||
|
-- while we're here, check the same cases in E-style literals
|
||||||
|
SELECT E'd\u0061t\U00000061' AS "data";
|
||||||
|
data
|
||||||
|
------
|
||||||
|
data
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT E'a\\b' AS "a\b";
|
||||||
|
a\b
|
||||||
|
-----
|
||||||
|
a\b
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT E'wrong: \u061';
|
||||||
|
ERROR: invalid Unicode escape
|
||||||
|
LINE 1: SELECT E'wrong: \u061';
|
||||||
|
^
|
||||||
|
HINT: Unicode escapes must be \uXXXX or \UXXXXXXXX.
|
||||||
|
SELECT E'wrong: \U0061';
|
||||||
|
ERROR: invalid Unicode escape
|
||||||
|
LINE 1: SELECT E'wrong: \U0061';
|
||||||
|
^
|
||||||
|
HINT: Unicode escapes must be \uXXXX or \UXXXXXXXX.
|
||||||
|
SELECT E'wrong: \udb99';
|
||||||
|
ERROR: invalid Unicode surrogate pair at or near "'"
|
||||||
|
LINE 1: SELECT E'wrong: \udb99';
|
||||||
|
^
|
||||||
|
SELECT E'wrong: \udb99xy';
|
||||||
|
ERROR: invalid Unicode surrogate pair at or near "x"
|
||||||
|
LINE 1: SELECT E'wrong: \udb99xy';
|
||||||
|
^
|
||||||
|
SELECT E'wrong: \udb99\\';
|
||||||
|
ERROR: invalid Unicode surrogate pair at or near "\"
|
||||||
|
LINE 1: SELECT E'wrong: \udb99\\';
|
||||||
|
^
|
||||||
|
SELECT E'wrong: \udb99\u0061';
|
||||||
|
ERROR: invalid Unicode surrogate pair at or near "\u0061"
|
||||||
|
LINE 1: SELECT E'wrong: \udb99\u0061';
|
||||||
|
^
|
||||||
|
SELECT E'wrong: \U0000db99\U00000061';
|
||||||
|
ERROR: invalid Unicode surrogate pair at or near "\U00000061"
|
||||||
|
LINE 1: SELECT E'wrong: \U0000db99\U00000061';
|
||||||
|
^
|
||||||
|
SELECT E'wrong: \U002FFFFF';
|
||||||
|
ERROR: invalid Unicode escape value at or near "\U002FFFFF"
|
||||||
|
LINE 1: SELECT E'wrong: \U002FFFFF';
|
||||||
|
^
|
||||||
SET standard_conforming_strings TO off;
|
SET standard_conforming_strings TO off;
|
||||||
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
||||||
ERROR: unsafe use of string constant with Unicode escapes
|
ERROR: unsafe use of string constant with Unicode escapes
|
||||||
|
@ -1,5 +1,16 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for json and jsonb
|
-- encoding-sensitive tests for json and jsonb
|
||||||
|
--
|
||||||
|
|
||||||
|
-- We provide expected-results files for UTF8 (json_encoding.out)
|
||||||
|
-- and for SQL_ASCII (json_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
|
||||||
-- first json
|
-- first json
|
||||||
|
|
||||||
|
@ -1,5 +1,16 @@
|
|||||||
|
--
|
||||||
-- encoding-sensitive tests for jsonpath
|
-- encoding-sensitive tests for jsonpath
|
||||||
|
--
|
||||||
|
|
||||||
|
-- We provide expected-results files for UTF8 (jsonpath_encoding.out)
|
||||||
|
-- and for SQL_ASCII (jsonpath_encoding_1.out). Skip otherwise.
|
||||||
|
SELECT getdatabaseencoding() NOT IN ('UTF8', 'SQL_ASCII')
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
|
||||||
|
SELECT getdatabaseencoding(); -- just to label the results files
|
||||||
|
|
||||||
-- checks for double-quoted values
|
-- checks for double-quoted values
|
||||||
|
|
||||||
|
@ -21,6 +21,7 @@ SET standard_conforming_strings TO on;
|
|||||||
|
|
||||||
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
||||||
SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
|
SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
|
||||||
|
SELECT U&'a\\b' AS "a\b";
|
||||||
|
|
||||||
SELECT U&' \' UESCAPE '!' AS "tricky";
|
SELECT U&' \' UESCAPE '!' AS "tricky";
|
||||||
SELECT 'tricky' AS U&"\" UESCAPE '!';
|
SELECT 'tricky' AS U&"\" UESCAPE '!';
|
||||||
@ -30,6 +31,25 @@ SELECT U&'wrong: \+0061';
|
|||||||
SELECT U&'wrong: +0061' UESCAPE +;
|
SELECT U&'wrong: +0061' UESCAPE +;
|
||||||
SELECT U&'wrong: +0061' UESCAPE '+';
|
SELECT U&'wrong: +0061' UESCAPE '+';
|
||||||
|
|
||||||
|
SELECT U&'wrong: \db99';
|
||||||
|
SELECT U&'wrong: \db99xy';
|
||||||
|
SELECT U&'wrong: \db99\\';
|
||||||
|
SELECT U&'wrong: \db99\0061';
|
||||||
|
SELECT U&'wrong: \+00db99\+000061';
|
||||||
|
SELECT U&'wrong: \+2FFFFF';
|
||||||
|
|
||||||
|
-- while we're here, check the same cases in E-style literals
|
||||||
|
SELECT E'd\u0061t\U00000061' AS "data";
|
||||||
|
SELECT E'a\\b' AS "a\b";
|
||||||
|
SELECT E'wrong: \u061';
|
||||||
|
SELECT E'wrong: \U0061';
|
||||||
|
SELECT E'wrong: \udb99';
|
||||||
|
SELECT E'wrong: \udb99xy';
|
||||||
|
SELECT E'wrong: \udb99\\';
|
||||||
|
SELECT E'wrong: \udb99\u0061';
|
||||||
|
SELECT E'wrong: \U0000db99\U00000061';
|
||||||
|
SELECT E'wrong: \U002FFFFF';
|
||||||
|
|
||||||
SET standard_conforming_strings TO off;
|
SET standard_conforming_strings TO off;
|
||||||
|
|
||||||
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
|
||||||
|
Loading…
x
Reference in New Issue
Block a user