@@ -140,6 +140,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp,
140
140
xmlChar * * version , xmlChar * * encoding , int * standalone );
141
141
static bool print_xml_decl (StringInfo buf , const xmlChar * version ,
142
142
pg_enc encoding , int standalone );
143
+ static bool xml_doctype_in_content (const xmlChar * str );
143
144
static xmlDocPtr xml_parse (text * data , XmlOptionType xmloption_arg ,
144
145
bool preserve_whitespace , int encoding );
145
146
static text * xml_xmlnodetoxmltype (xmlNodePtr cur , PgXmlErrorContext * xmlerrcxt );
@@ -1212,8 +1213,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
1212
1213
if (xmlStrncmp (p , (xmlChar * ) "<?xml" , 5 ) != 0 )
1213
1214
goto finished ;
1214
1215
1215
- /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */
1216
- utf8len = strlen ((const char * ) (p + 5 ));
1216
+ /*
1217
+ * If next char is a name char, it's a PI like <?xml-stylesheet ...?>
1218
+ * rather than an XMLDecl, so we have done what we came to do and found no
1219
+ * XMLDecl.
1220
+ *
1221
+ * We need an input length value for xmlGetUTF8Char, but there's no need
1222
+ * to count the whole document size, so use strnlen not strlen.
1223
+ */
1224
+ utf8len = strnlen ((const char * ) (p + 5 ), MAX_MULTIBYTE_CHAR_LEN );
1217
1225
utf8char = xmlGetUTF8Char (p + 5 , & utf8len );
1218
1226
if (PG_XMLISNAMECHAR (utf8char ))
1219
1227
goto finished ;
@@ -1384,6 +1392,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version,
1384
1392
return false;
1385
1393
}
1386
1394
1395
+ /*
1396
+ * Test whether an input that is to be parsed as CONTENT contains a DTD.
1397
+ *
1398
+ * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not
1399
+ * satisfied by a document with a DTD, which is a bit of a wart, as it means
1400
+ * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and
1401
+ * later fix that, by redefining content with reference to the "more
1402
+ * permissive" Document Node of the XQuery/XPath Data Model, such that any
1403
+ * DOCUMENT value is indeed also a CONTENT value. That definition is more
1404
+ * useful, as CONTENT becomes usable for parsing input of unknown form (think
1405
+ * pg_restore).
1406
+ *
1407
+ * As used below in parse_xml when parsing for CONTENT, libxml does not give
1408
+ * us the 2006+ behavior, but only the 2003; it will choke if the input has
1409
+ * a DTD. But we can provide the 2006+ definition of CONTENT easily enough,
1410
+ * by detecting this case first and simply doing the parse as DOCUMENT.
1411
+ *
1412
+ * A DTD can be found arbitrarily far in, but that would be a contrived case;
1413
+ * it will ordinarily start within a few dozen characters. The only things
1414
+ * that can precede it are an XMLDecl (here, the caller will have called
1415
+ * parse_xml_decl already), whitespace, comments, and processing instructions.
1416
+ * This function need only return true if it sees a valid sequence of such
1417
+ * things leading to <!DOCTYPE. It can simply return false in any other
1418
+ * cases, including malformed input; that will mean the input gets parsed as
1419
+ * CONTENT as originally planned, with libxml reporting any errors.
1420
+ *
1421
+ * This is only to be called from xml_parse, when pg_xml_init has already
1422
+ * been called. The input is already in UTF8 encoding.
1423
+ */
1424
+ static bool
1425
+ xml_doctype_in_content (const xmlChar * str )
1426
+ {
1427
+ const xmlChar * p = str ;
1428
+
1429
+ for (;;)
1430
+ {
1431
+ const xmlChar * e ;
1432
+
1433
+ SKIP_XML_SPACE (p );
1434
+ if (* p != '<' )
1435
+ return false;
1436
+ p ++ ;
1437
+
1438
+ if (* p == '!' )
1439
+ {
1440
+ p ++ ;
1441
+
1442
+ /* if we see <!DOCTYPE, we can return true */
1443
+ if (xmlStrncmp (p , (xmlChar * ) "DOCTYPE" , 7 ) == 0 )
1444
+ return true;
1445
+
1446
+ /* otherwise, if it's not a comment, fail */
1447
+ if (xmlStrncmp (p , (xmlChar * ) "--" , 2 ) != 0 )
1448
+ return false;
1449
+ /* find end of comment: find -- and a > must follow */
1450
+ p = xmlStrstr (p + 2 , (xmlChar * ) "--" );
1451
+ if (!p || p [2 ] != '>' )
1452
+ return false;
1453
+ /* advance over comment, and keep scanning */
1454
+ p += 3 ;
1455
+ continue ;
1456
+ }
1457
+
1458
+ /* otherwise, if it's not a PI <?target something?>, fail */
1459
+ if (* p != '?' )
1460
+ return false;
1461
+ p ++ ;
1462
+
1463
+ /* find end of PI (the string ?> is forbidden within a PI) */
1464
+ e = xmlStrstr (p , (xmlChar * ) "?>" );
1465
+ if (!e )
1466
+ return false;
1467
+
1468
+ /* we don't check PIs carefully, but do reject "xml" target */
1469
+ if (e - p >= 3 && xmlStrncasecmp (p , (xmlChar * ) "xml" , 3 ) == 0 )
1470
+ return false;
1471
+
1472
+ /* advance over PI, keep scanning */
1473
+ p = e + 2 ;
1474
+ }
1475
+ }
1476
+
1387
1477
1388
1478
/*
1389
1479
* Convert a C string to XML internal representation
@@ -1419,14 +1509,38 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1419
1509
/* Use a TRY block to ensure we clean up correctly */
1420
1510
PG_TRY ();
1421
1511
{
1512
+ bool parse_as_document = false;
1513
+ int res_code ;
1514
+ size_t count = 0 ;
1515
+ xmlChar * version = NULL ;
1516
+ int standalone = 0 ;
1517
+
1422
1518
xmlInitParser ();
1423
1519
1424
1520
ctxt = xmlNewParserCtxt ();
1425
1521
if (ctxt == NULL || xmlerrcxt -> err_occurred )
1426
1522
xml_ereport (xmlerrcxt , ERROR , ERRCODE_OUT_OF_MEMORY ,
1427
1523
"could not allocate parser context" );
1428
1524
1525
+ /* Decide whether to parse as document or content */
1429
1526
if (xmloption_arg == XMLOPTION_DOCUMENT )
1527
+ parse_as_document = true;
1528
+ else
1529
+ {
1530
+ /* Parse and skip over the XML declaration, if any */
1531
+ res_code = parse_xml_decl (utf8string ,
1532
+ & count , & version , NULL , & standalone );
1533
+ if (res_code != 0 )
1534
+ xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1535
+ "invalid XML content: invalid XML declaration" ,
1536
+ res_code );
1537
+
1538
+ /* Is there a DOCTYPE element? */
1539
+ if (xml_doctype_in_content (utf8string + count ))
1540
+ parse_as_document = true;
1541
+ }
1542
+
1543
+ if (parse_as_document )
1430
1544
{
1431
1545
/*
1432
1546
* Note, that here we try to apply DTD defaults
@@ -1441,23 +1555,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
1441
1555
XML_PARSE_NOENT | XML_PARSE_DTDATTR
1442
1556
| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS ));
1443
1557
if (doc == NULL || xmlerrcxt -> err_occurred )
1444
- xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1445
- "invalid XML document" );
1558
+ {
1559
+ /* Use original option to decide which error code to throw */
1560
+ if (xmloption_arg == XMLOPTION_DOCUMENT )
1561
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1562
+ "invalid XML document" );
1563
+ else
1564
+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_CONTENT ,
1565
+ "invalid XML content" );
1566
+ }
1446
1567
}
1447
1568
else
1448
1569
{
1449
- int res_code ;
1450
- size_t count ;
1451
- xmlChar * version ;
1452
- int standalone ;
1453
-
1454
- res_code = parse_xml_decl (utf8string ,
1455
- & count , & version , NULL , & standalone );
1456
- if (res_code != 0 )
1457
- xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1458
- "invalid XML content: invalid XML declaration" ,
1459
- res_code );
1460
-
1461
1570
doc = xmlNewDoc (version );
1462
1571
Assert (doc -> encoding == NULL );
1463
1572
doc -> encoding = xmlStrdup ((const xmlChar * ) "UTF-8" );
0 commit comments