24
24
* Portions Copyright (c) 1994, Regents of the University of California
25
25
*
26
26
* IDENTIFICATION
27
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
27
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
28
28
*
29
29
*-------------------------------------------------------------------------
30
30
*/
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
1097
1097
}
1098
1098
}
1099
1099
1100
+ static bool
1101
+ is_utf16_surrogate_first (pg_wchar c)
1102
+ {
1103
+ return (c >= 0xD800 && c <= 0xDBFF );
1104
+ }
1105
+
1106
+ static bool
1107
+ is_utf16_surrogate_second (pg_wchar c)
1108
+ {
1109
+ return (c >= 0xDC00 && c <= 0xDFFF );
1110
+ }
1111
+
1112
+ static pg_wchar
1113
+ surrogate_pair_to_codepoint (pg_wchar first, pg_wchar second)
1114
+ {
1115
+ return ((first & 0x3FF ) << 10 ) + 0x10000 + (second & 0x3FF );
1116
+ }
1117
+
1100
1118
static char *
1101
1119
litbuf_udeescape (unsigned char escape, base_yyscan_t yyscanner)
1102
1120
{
1103
1121
char *new ;
1104
1122
char *litbuf, *in, *out;
1123
+ pg_wchar pair_first = 0 ;
1105
1124
1106
1125
if (isxdigit (escape)
1107
1126
|| escape == ' +'
@@ -1131,16 +1150,39 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
1131
1150
{
1132
1151
if (in[1 ] == escape)
1133
1152
{
1153
+ if (pair_first)
1154
+ {
1155
+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1156
+ yyerror (" invalid Unicode surrogate pair" );
1157
+ }
1134
1158
*out++ = escape;
1135
1159
in += 2 ;
1136
1160
}
1137
1161
else if (isxdigit (in[1 ]) && isxdigit (in[2 ]) && isxdigit (in[3 ]) && isxdigit (in[4 ]))
1138
1162
{
1139
1163
pg_wchar unicode = hexval (in[1 ]) * 16 *16 *16 + hexval (in[2 ]) * 16 *16 + hexval (in[3 ]) * 16 + hexval (in[4 ]);
1140
1164
check_unicode_value (unicode, in, yyscanner);
1141
- unicode_to_utf8 (unicode, (unsigned char *) out);
1165
+ if (pair_first)
1166
+ {
1167
+ if (is_utf16_surrogate_second (unicode))
1168
+ {
1169
+ unicode = surrogate_pair_to_codepoint (pair_first, unicode);
1170
+ pair_first = 0 ;
1171
+ }
1172
+ else
1173
+ {
1174
+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1175
+ yyerror (" invalid Unicode surrogate pair" );
1176
+ }
1177
+ }
1178
+ if (is_utf16_surrogate_first (unicode))
1179
+ pair_first = unicode;
1180
+ else
1181
+ {
1182
+ unicode_to_utf8 (unicode, (unsigned char *) out);
1183
+ out += pg_mblen (out);
1184
+ }
1142
1185
in += 5 ;
1143
- out += pg_mblen (out);
1144
1186
}
1145
1187
else if (in[1 ] == ' +'
1146
1188
&& isxdigit (in[2 ]) && isxdigit (in[3 ])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
1150
1192
pg_wchar unicode = hexval (in[2 ]) * 16 *16 *16 *16 *16 + hexval (in[3 ]) * 16 *16 *16 *16 + hexval (in[4 ]) * 16 *16 *16
1151
1193
+ hexval (in[5 ]) * 16 *16 + hexval (in[6 ]) * 16 + hexval (in[7 ]);
1152
1194
check_unicode_value (unicode, in, yyscanner);
1153
- unicode_to_utf8 (unicode, (unsigned char *) out);
1195
+ if (pair_first)
1196
+ {
1197
+ if (is_utf16_surrogate_second (unicode))
1198
+ {
1199
+ unicode = surrogate_pair_to_codepoint (pair_first, unicode);
1200
+ pair_first = 0 ;
1201
+ }
1202
+ else
1203
+ {
1204
+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1205
+ yyerror (" invalid Unicode surrogate pair" );
1206
+ }
1207
+ }
1208
+ if (is_utf16_surrogate_first (unicode))
1209
+ pair_first = unicode;
1210
+ else
1211
+ {
1212
+ unicode_to_utf8 (unicode, (unsigned char *) out);
1213
+ out += pg_mblen (out);
1214
+ }
1154
1215
in += 8 ;
1155
- out += pg_mblen (out);
1156
1216
}
1157
1217
else
1158
1218
{
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
1161
1221
}
1162
1222
}
1163
1223
else
1224
+ {
1225
+ if (pair_first)
1226
+ {
1227
+ ADVANCE_YYLLOC (in - litbuf + 3 ); /* 3 for U&" */
1228
+ yyerror (" invalid Unicode surrogate pair" );
1229
+ }
1164
1230
*out++ = *in++;
1231
+ }
1165
1232
}
1166
1233
1167
1234
*out = ' \0 ' ;
0 commit comments