Handle Unicode surrogate pairs correctly when processing JSON.

author Andrew Dunstan <andrew@dunslane.net>

Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)

committer Andrew Dunstan <andrew@dunslane.net>

Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)
author Andrew Dunstan <andrew@dunslane.net>
Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)
committer Andrew Dunstan <andrew@dunslane.net>
Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 2c02fd1e1b4e6e4b797d758462250fcea590b638..3adb36579eaceec5cab927665c4fbd5e0b376fad 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -10150,6 +10150,15 @@ table2-mapping
      </tgroup>
     </table>
  
+  <note>
+    <para>
+      The <type>json</type> functions and operators can impose stricter validity requirements
+      than the type's input functions. In particular, they check much more closely that any use
+      of Unicode surrogate pairs to designate characters outside the Unicode Basic Multilingual
+      Plane is correct.
+    </para>
+  </note>
+
    <note>
      <para>
        The <xref linkend="hstore"> extension has a cast from <type>hstore</type> to
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c

index aaf99bddf27d579646d77cf52b287f45dbb5d68b..d8046c5b54dbdfe1c30fc442cd4a8d5ccf821b6a 100644 (file)
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex)
  {
     char       *s;
     int         len;
+   int         hi_surrogate = -1;
  
     if (lex->strval != NULL)
         resetStringInfo(lex->strval);
@@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex)
                     int         utf8len;
                     char       *converted;
  
+                   if (ch >= 0xd800 && ch <= 0xdbff)
+                   {
+                       if (hi_surrogate != -1)
+                           ereport(ERROR,
+                              (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                               errmsg("invalid input syntax for type json"),
+                               errdetail("high order surrogate must not follow a high order surrogate."),
+                               report_json_context(lex)));
+                       hi_surrogate = (ch & 0x3ff) << 10;
+                       continue;
+                   }
+                   else if (ch >= 0xdc00 && ch <= 0xdfff)
+                   {
+                       if (hi_surrogate == -1)
+                           ereport(ERROR,
+                              (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                               errmsg("invalid input syntax for type json"),
+                               errdetail("low order surrogate must follow a high order surrogate."),
+                               report_json_context(lex)));
+                       ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
+                       hi_surrogate = -1;
+                   }
+
+                   if (hi_surrogate != -1)
+                       ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                                errmsg("invalid input syntax for type json"),
+                                errdetail("low order surrogate must follow a high order surrogate."),
+                                report_json_context(lex)));
+
                     unicode_to_utf8(ch, (unsigned char *) utf8str);
                     utf8len = pg_utf_mblen((unsigned char *) utf8str);
                     utf8str[utf8len] = '\0';
@@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex)
             }
             else if (lex->strval != NULL)
             {
+               if (hi_surrogate != -1)
+                   ereport(ERROR,
+                           (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                            errmsg("invalid input syntax for type json"),
+                            errdetail("low order surrogate must follow a high order surrogate."),
+                            report_json_context(lex)));
+
                 switch (*s)
                 {
                     case '"':
@@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex)
         }
         else if (lex->strval != NULL)
         {
+           if (hi_surrogate != -1)
+               ereport(ERROR,
+                       (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                        errmsg("invalid input syntax for type json"),
+                        errdetail("low order surrogate must follow a high order surrogate."),
+                        report_json_context(lex)));
+
             appendStringInfoChar(lex->strval, *s);
         }
  
     }
  
+   if (hi_surrogate != -1)
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                errmsg("invalid input syntax for type json"),
+       errdetail("low order surrogate must follow a high order surrogate."),
+                report_json_context(lex)));
+
     /* Hooray, we found the end of the string! */
     lex->prev_token_terminator = lex->token_terminator;
     lex->token_terminator = s + 1;
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out

index 1d7cf5ff2f302a59fbcd38254e6df0f743adf72f..293c7429627f549b5832c412b7df16d14bc32fde 100644 (file)
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -920,3 +920,26 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,3
  ERROR:  cannot call json_populate_recordset on a nested object
  select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
  ERROR:  cannot call json_populate_recordset on a nested object
+-- handling of unicode surrogate pairs
+select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
+          correct           
+----------------------------
+ "\ud83d\ude04\ud83d\udc36"
+(1 row)
+
+select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
+ERROR:  invalid input syntax for type json
+DETAIL:  high order surrogate must not follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql

index 8a136d7a2736ac6bea2c058e1452ad3457f7d961..5b6bc36517e305adbb22adf43dcbd1f4ca74b3ca 100644 (file)
--- a/src/test/regress/sql/json.sql
+++ b/src/test/regress/sql/json.sql
@@ -296,3 +296,11 @@ select * from json_populate_recordset(null::jpop,'[{"a":"blurfl","x":43.2},{"b":
  select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":"blurfl","x":43.2},{"b":3,"c":"2012-01-20 10:42:53"}]') q;
  select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
  select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
+
+-- handling of unicode surrogate pairs
+
+select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
+select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
+select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
+select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
+select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
author	Andrew Dunstan <andrew@dunslane.net>
	Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)
committer	Andrew Dunstan <andrew@dunslane.net>
	Sat, 8 Jun 2013 13:12:48 +0000 (09:12 -0400)
doc/src/sgml/func.sgml		patch | blob | blame | history
src/backend/utils/adt/json.c		patch | blob | blame | history
src/test/regress/expected/json.out		patch | blob | blame | history
src/test/regress/sql/json.sql		patch | blob | blame | history