diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/parser/gram.y | 10 | ||||
-rw-r--r-- | src/backend/parser/parser.c | 282 | ||||
-rw-r--r-- | src/backend/parser/scan.l | 414 |
3 files changed, 358 insertions, 348 deletions
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index ad5be902b07..3806687ae37 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -598,10 +598,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); * the set of keywords. PL/pgSQL depends on this so that it can share the * same lexer. If you add/change tokens here, fix PL/pgSQL to match! * + * UIDENT and USCONST are reduced to IDENT and SCONST in parser.c, so that + * they need no productions here; but we must assign token codes to them. + * * DOT_DOT is unused in the core SQL grammar, and so will always provoke * parse errors. It is needed by PL/pgSQL. */ -%token <str> IDENT FCONST SCONST BCONST XCONST Op +%token <str> IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op %token <ival> ICONST PARAM %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS @@ -691,8 +694,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P TYPES_P - UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED - UNTIL UPDATE USER USING + UESCAPE UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN + UNLISTEN UNLOGGED UNTIL UPDATE USER USING VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING VERBOSE VERSION_P VIEW VIEWS VOLATILE @@ -15374,6 +15377,7 @@ unreserved_keyword: | TRUSTED | TYPE_P | TYPES_P + | UESCAPE | UNBOUNDED | UNCOMMITTED | UNENCRYPTED diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index bc3f812da8e..1bf1144c4fd 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -21,8 +21,14 @@ #include "postgres.h" +#include "mb/pg_wchar.h" #include "parser/gramparse.h" #include "parser/parser.h" +#include "parser/scansup.h" + +static bool check_uescapechar(unsigned char escape); +static char *str_udeescape(const char *str, char escape, + int position, core_yyscan_t yyscanner); /* @@ -75,6 +81,10 @@ raw_parser(const char *str) * scanner backtrack, which would cost more performance than this filter * layer does. * + * We also use this filter to convert UIDENT and USCONST sequences into + * plain IDENT and SCONST tokens. While that could be handled by additional + * productions in the main grammar, it's more efficient to do it like this. + * * The filter also provides a convenient place to translate between * the core_YYSTYPE and YYSTYPE representations (which are really the * same thing anyway, but notationally they're different). @@ -104,7 +114,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) * If this token isn't one that requires lookahead, just return it. If it * does, determine the token length. (We could get that via strlen(), but * since we have such a small set of possibilities, hardwiring seems - * feasible and more efficient.) + * feasible and more efficient --- at least for the fixed-length cases.) */ switch (cur_token) { @@ -117,6 +127,10 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) case WITH: cur_token_length = 4; break; + case UIDENT: + case USCONST: + cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); + break; default: return cur_token; } @@ -190,7 +204,273 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) break; } break; + + case UIDENT: + case USCONST: + /* Look ahead for UESCAPE */ + if (next_token == UESCAPE) + { + /* Yup, so get third token, which had better be SCONST */ + const char *escstr; + + /* Again save and restore *llocp */ + cur_yylloc = *llocp; + + /* Un-truncate current token so errors point to third token */ + *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; + + /* Get third token */ + next_token = core_yylex(&(yyextra->lookahead_yylval), + llocp, yyscanner); + + /* If we throw error here, it will point to third token */ + if (next_token != SCONST) + scanner_yyerror("UESCAPE must be followed by a simple string literal", + yyscanner); + + escstr = yyextra->lookahead_yylval.str; + if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) + scanner_yyerror("invalid Unicode escape character", + yyscanner); + + /* Now restore *llocp; errors will point to first token */ + *llocp = cur_yylloc; + + /* Apply Unicode conversion */ + lvalp->core_yystype.str = + str_udeescape(lvalp->core_yystype.str, + escstr[0], + *llocp, + yyscanner); + + /* + * We don't need to revert the un-truncation of UESCAPE. What + * we do want to do is clear have_lookahead, thereby consuming + * all three tokens. + */ + yyextra->have_lookahead = false; + } + else + { + /* No UESCAPE, so convert using default escape character */ + lvalp->core_yystype.str = + str_udeescape(lvalp->core_yystype.str, + '\\', + *llocp, + yyscanner); + } + + if (cur_token == UIDENT) + { + /* It's an identifier, so truncate as appropriate */ + truncate_identifier(lvalp->core_yystype.str, + strlen(lvalp->core_yystype.str), + true); + cur_token = IDENT; + } + else if (cur_token == USCONST) + { + cur_token = SCONST; + } + break; } return cur_token; } + +/* convert hex digit (caller should have verified that) to value */ +static unsigned int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +/* is Unicode code point acceptable in database's encoding? */ +static void +check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner) +{ + /* See also addunicode() in scan.l */ + if (c == 0 || c > 0x10FFFF) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"), + scanner_errposition(pos, yyscanner))); + + if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"), + scanner_errposition(pos, yyscanner))); +} + +/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ +static bool +check_uescapechar(unsigned char escape) +{ + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + return false; + else + return true; +} + +/* + * Process Unicode escapes in "str", producing a palloc'd plain string + * + * escape: the escape character to use + * position: start position of U&'' or U&"" string token + * yyscanner: context information needed for error reports + */ +static char * +str_udeescape(const char *str, char escape, + int position, core_yyscan_t yyscanner) +{ + const char *in; + char *new, + *out; + pg_wchar pair_first = 0; + + /* + * This relies on the subtle assumption that a UTF-8 expansion cannot be + * longer than its escaped representation. + */ + new = palloc(strlen(str) + 1); + + in = str; + out = new; + while (*in) + { + if (in[0] == escape) + { + if (in[1] == escape) + { + if (pair_first) + goto invalid_pair; + *out++ = escape; + in += 2; + } + else if (isxdigit((unsigned char) in[1]) && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4])) + { + pg_wchar unicode; + + unicode = (hexval(in[1]) << 12) + + (hexval(in[2]) << 8) + + (hexval(in[3]) << 4) + + hexval(in[4]); + check_unicode_value(unicode, + in - str + position + 3, /* 3 for U&" */ + yyscanner); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 5; + } + else if (in[1] == '+' && + isxdigit((unsigned char) in[2]) && + isxdigit((unsigned char) in[3]) && + isxdigit((unsigned char) in[4]) && + isxdigit((unsigned char) in[5]) && + isxdigit((unsigned char) in[6]) && + isxdigit((unsigned char) in[7])) + { + pg_wchar unicode; + + unicode = (hexval(in[2]) << 20) + + (hexval(in[3]) << 16) + + (hexval(in[4]) << 12) + + (hexval(in[5]) << 8) + + (hexval(in[6]) << 4) + + hexval(in[7]); + check_unicode_value(unicode, + in - str + position + 3, /* 3 for U&" */ + yyscanner); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } + in += 8; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape value"), + scanner_errposition(in - str + position + 3, /* 3 for U&" */ + yyscanner))); + } + else + { + if (pair_first) + goto invalid_pair; + + *out++ = *in++; + } + } + + /* unfinished surrogate pair? */ + if (pair_first) + goto invalid_pair; + + *out = '\0'; + + /* + * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII + * codes; but it's probably not worth the trouble, since this isn't likely + * to be a performance-critical path. + */ + pg_verifymbstr(new, out - new, false); + return new; + +invalid_pair: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode surrogate pair"), + scanner_errposition(in - str + position + 3, /* 3 for U&" */ + yyscanner))); + return NULL; /* keep compiler quiet */ +} diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index e25e12e4614..84c73914a85 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -13,8 +13,8 @@ * in the sense that there is always a rule that can match the input * consumed so far (the rule action may internally throw back some input * with yyless(), however). As explained in the flex manual, this makes - * for a useful speed increase --- about a third faster than a plain -CF - * lexer, in simple testing. The extra complexity is mostly in the rules + * for a useful speed increase --- several percent faster when measuring + * raw parsing (Flex + Bison). The extra complexity is mostly in the rules * for handling float numbers and continued string literals. If you change * the lexical rules, verify that you haven't broken the no-backtrack * property by running flex with the "-b" option and checking that the @@ -110,14 +110,9 @@ const uint16 ScanKeywordTokens[] = { static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner); static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner); static char *litbufdup(core_yyscan_t yyscanner); -static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner); static int process_integer_literal(const char *token, YYSTYPE *lval); -static bool is_utf16_surrogate_first(pg_wchar c); -static bool is_utf16_surrogate_second(pg_wchar c); -static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); static void addunicode(pg_wchar c, yyscan_t yyscanner); -static bool check_uescapechar(unsigned char escape); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -168,12 +163,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * <xd> delimited identifiers (double-quoted identifiers) * <xh> hexadecimal numeric string * <xq> standard quoted strings + * <xqs> quote stop (detect continued strings) * <xe> extended quoted strings (support backslash escape sequences) * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes - * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow * <xus> quoted string with Unicode escapes - * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow * <xeu> Unicode surrogate pair in extended quoted string * * Remember to add an <<EOF>> case whenever you add a new exclusive state! @@ -185,12 +179,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xd %x xh %x xq +%x xqs %x xe %x xdolq %x xui -%x xuiend %x xus -%x xusend %x xeu /* @@ -231,19 +224,18 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +quote ' +/* If we see {quote} then {quotecontinue}, the quoted string continues */ +quotecontinue {whitespace_with_newline}{quote} + /* - * To ensure that {quotecontinue} can be scanned without having to back up - * if the full pattern isn't matched, we include trailing whitespace in - * {quotestop}. This matches all cases where {quotecontinue} fails to match, - * except for {quote} followed by whitespace and just one "-" (not two, - * which would start a {comment}). To cover that we have {quotefail}. - * The actions for {quotestop} and {quotefail} must throw back characters - * beyond the quote proper. + * {quotecontinuefail} is needed to avoid lexer backup when we fail to match + * {quotecontinue}. It might seem that this could just be {whitespace}*, + * but if there's a dash after {whitespace_with_newline}, it must be consumed + * to see if there's another dash --- which would start a {comment} and thus + * allow continuation of the {quotecontinue} token. */ -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" +quotecontinuefail {whitespace}*"-"? /* Bit string * It is tempting to scan the string for only those characters @@ -304,21 +296,12 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ -/* Unicode escapes */ -uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} -/* error rule to avoid backup */ -uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU] - /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} -/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */ -xustop1 {uescapefail}? -xustop2 {uescape} - /* error rule to avoid backup */ xufailed [uU]& @@ -476,21 +459,10 @@ other . startlit(); addlitchar('b', yyscanner); } -<xb>{quotestop} | -<xb>{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return BCONST; - } <xh>{xhinside} | <xb>{xbinside} { addlit(yytext, yyleng, yyscanner); } -<xh>{quotecontinue} | -<xb>{quotecontinue} { - /* ignore */ - } <xb><<EOF>> { yyerror("unterminated bit string literal"); } {xhstart} { @@ -505,13 +477,6 @@ other . startlit(); addlitchar('x', yyscanner); } -<xh>{quotestop} | -<xh>{quotefail} { - yyless(1); - BEGIN(INITIAL); - yylval->str = litbufdup(yyscanner); - return XCONST; - } <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); } {xnstart} { @@ -568,53 +533,66 @@ other . BEGIN(xus); startlit(); } -<xq,xe>{quotestop} | -<xq,xe>{quotefail} { - yyless(1); - BEGIN(INITIAL); + +<xb,xh,xq,xe,xus>{quote} { /* - * check that the data remains valid if it might have been - * made invalid by unescaping any chars. + * When we are scanning a quoted string and see an end + * quote, we must look ahead for a possible continuation. + * If we don't see one, we know the end quote was in fact + * the end of the string. To reduce the lexer table size, + * we use a single "xqs" state to do the lookahead for all + * types of strings. */ - if (yyextra->saw_non_ascii) - pg_verifymbstr(yyextra->literalbuf, - yyextra->literallen, - false); - yylval->str = litbufdup(yyscanner); - return SCONST; - } -<xus>{quotestop} | -<xus>{quotefail} { - /* throw back all but the quote */ - yyless(1); - /* xusend state looks for possible UESCAPE */ - BEGIN(xusend); + yyextra->state_before_str_stop = YYSTATE; + BEGIN(xqs); } -<xusend>{whitespace} { - /* stay in xusend state over whitespace */ +<xqs>{quotecontinue} { + /* + * Found a quote continuation, so return to the in-quote + * state and continue scanning the literal. Nothing is + * added to the literal's contents. + */ + BEGIN(yyextra->state_before_str_stop); } -<xusend><<EOF>> | -<xusend>{other} | -<xusend>{xustop1} { - /* no UESCAPE after the quote, throw back everything */ +<xqs>{quotecontinuefail} | +<xqs>{other} | +<xqs><<EOF>> { + /* + * Failed to see a quote continuation. Throw back + * everything after the end quote, and handle the string + * according to the state we were in previously. + */ yyless(0); BEGIN(INITIAL); - yylval->str = litbuf_udeescape('\\', yyscanner); - return SCONST; - } -<xusend>{xustop2} { - /* found UESCAPE after the end quote */ - BEGIN(INITIAL); - if (!check_uescapechar(yytext[yyleng - 2])) + + switch (yyextra->state_before_str_stop) { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); + case xb: + yylval->str = litbufdup(yyscanner); + return BCONST; + case xh: + yylval->str = litbufdup(yyscanner); + return XCONST; + case xq: + case xe: + /* + * Check that the data remains valid, if it might + * have been made invalid by unescaping any chars. + */ + if (yyextra->saw_non_ascii) + pg_verifymbstr(yyextra->literalbuf, + yyextra->literallen, + false); + yylval->str = litbufdup(yyscanner); + return SCONST; + case xus: + yylval->str = litbufdup(yyscanner); + return USCONST; + default: + yyerror("unhandled previous state in xqs"); } - yylval->str = litbuf_udeescape(yytext[yyleng - 2], - yyscanner); - return SCONST; } + <xq,xe,xus>{xqdouble} { addlitchar('\'', yyscanner); } @@ -693,9 +671,6 @@ other . if (c == '\0' || IS_HIGHBIT_SET(c)) yyextra->saw_non_ascii = true; } -<xq,xe,xus>{quotecontinue} { - /* ignore */ - } <xe>. { /* This is only needed for \ just before EOF */ addlitchar(yytext[0], yyscanner); @@ -769,53 +744,13 @@ other . yylval->str = ident; return IDENT; } -<xui>{dquote} { - yyless(1); - /* xuiend state looks for possible UESCAPE */ - BEGIN(xuiend); - } -<xuiend>{whitespace} { - /* stay in xuiend state over whitespace */ - } -<xuiend><<EOF>> | -<xuiend>{other} | -<xuiend>{xustop1} { - /* no UESCAPE after the quote, throw back everything */ - char *ident; - int identlen; - - yyless(0); - - BEGIN(INITIAL); - if (yyextra->literallen == 0) - yyerror("zero-length delimited identifier"); - ident = litbuf_udeescape('\\', yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; - } -<xuiend>{xustop2} { - /* found UESCAPE after the end quote */ - char *ident; - int identlen; - +<xui>{dquote} { BEGIN(INITIAL); if (yyextra->literallen == 0) yyerror("zero-length delimited identifier"); - if (!check_uescapechar(yytext[yyleng - 2])) - { - SET_YYLLOC(); - ADVANCE_YYLLOC(yyleng - 2); - yyerror("invalid Unicode escape character"); - } - ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); - identlen = strlen(ident); - if (identlen >= NAMEDATALEN) - truncate_identifier(ident, identlen, true); - yylval->str = ident; - return IDENT; + /* can't truncate till after we de-escape the ident */ + yylval->str = litbufdup(yyscanner); + return UIDENT; } <xd,xui>{xddouble} { addlitchar('"', yyscanner); @@ -1288,55 +1223,12 @@ process_integer_literal(const char *token, YYSTYPE *lval) return ICONST; } -static unsigned int -hexval(unsigned char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 0xA; - if (c >= 'A' && c <= 'F') - return c - 'A' + 0xA; - elog(ERROR, "invalid hexadecimal digit"); - return 0; /* not reached */ -} - -static void -check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner) -{ - if (GetDatabaseEncoding() == PG_UTF8) - return; - - if (c > 0x7F) - { - ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */ - yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); - } -} - -static bool -is_utf16_surrogate_first(pg_wchar c) -{ - return (c >= 0xD800 && c <= 0xDBFF); -} - -static bool -is_utf16_surrogate_second(pg_wchar c) -{ - return (c >= 0xDC00 && c <= 0xDFFF); -} - -static pg_wchar -surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) -{ - return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); -} - static void addunicode(pg_wchar c, core_yyscan_t yyscanner) { char buf[8]; + /* See also check_unicode_value() in parser.c */ if (c == 0 || c > 0x10FFFF) yyerror("invalid Unicode escape value"); if (c > 0x7F) @@ -1349,172 +1241,6 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner) addlit(buf, pg_mblen(buf), yyscanner); } -/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ -static bool -check_uescapechar(unsigned char escape) -{ - if (isxdigit(escape) - || escape == '+' - || escape == '\'' - || escape == '"' - || scanner_isspace(escape)) - { - return false; - } - else - return true; -} - -/* like litbufdup, but handle unicode escapes */ -static char * -litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) -{ - char *new; - char *litbuf, - *in, - *out; - pg_wchar pair_first = 0; - - /* Make literalbuf null-terminated to simplify the scanning loop */ - litbuf = yyextra->literalbuf; - litbuf[yyextra->literallen] = '\0'; - - /* - * This relies on the subtle assumption that a UTF-8 expansion cannot be - * longer than its escaped representation. - */ - new = palloc(yyextra->literallen + 1); - - in = litbuf; - out = new; - while (*in) - { - if (in[0] == escape) - { - if (in[1] == escape) - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = escape; - in += 2; - } - else if (isxdigit((unsigned char) in[1]) && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4])) - { - pg_wchar unicode; - - unicode = (hexval(in[1]) << 12) + - (hexval(in[2]) << 8) + - (hexval(in[3]) << 4) + - hexval(in[4]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 5; - } - else if (in[1] == '+' && - isxdigit((unsigned char) in[2]) && - isxdigit((unsigned char) in[3]) && - isxdigit((unsigned char) in[4]) && - isxdigit((unsigned char) in[5]) && - isxdigit((unsigned char) in[6]) && - isxdigit((unsigned char) in[7])) - { - pg_wchar unicode; - - unicode = (hexval(in[2]) << 20) + - (hexval(in[3]) << 16) + - (hexval(in[4]) << 12) + - (hexval(in[5]) << 8) + - (hexval(in[6]) << 4) + - hexval(in[7]); - check_unicode_value(unicode, in, yyscanner); - if (pair_first) - { - if (is_utf16_surrogate_second(unicode)) - { - unicode = surrogate_pair_to_codepoint(pair_first, unicode); - pair_first = 0; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - } - else if (is_utf16_surrogate_second(unicode)) - yyerror("invalid Unicode surrogate pair"); - - if (is_utf16_surrogate_first(unicode)) - pair_first = unicode; - else - { - unicode_to_utf8(unicode, (unsigned char *) out); - out += pg_mblen(out); - } - in += 8; - } - else - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode escape value"); - } - } - else - { - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - *out++ = *in++; - } - } - - /* unfinished surrogate pair? */ - if (pair_first) - { - ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ - yyerror("invalid Unicode surrogate pair"); - } - - *out = '\0'; - - /* - * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII - * codes; but it's probably not worth the trouble, since this isn't likely - * to be a performance-critical path. - */ - pg_verifymbstr(new, out - new, false); - return new; -} - static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner) { |