Add TestDataUnicode/TestDataBytes split; move tree_construction over

gsnedders · gsnedders · commit d8aa4666ae02 · 2015-12-03T03:30:03.000Z
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
@@ -1,11 +1,20 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import collections
 import os
 import sys
 import codecs
 import glob
 import xml.sax.handler
 
+from six import text_type
+from six.moves import range
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
 base_path = os.path.split(__file__)[0]
 
 test_dir = os.path.join(base_path, 'testdata')
@@ -14,6 +23,7 @@
                                                 os.path.pardir)))
 
 from html5lib import treebuilders
+from html5lib.utils import py2_str_unicode
 del base_path
 
 # Build a dict of avaliable trees
@@ -47,6 +57,17 @@ def get_data_files(subdirectory, files='*.dat'):
     return sorted(glob.glob(os.path.join(test_dir, subdirectory, files)))
 
 
+def isSubsequence(l1, l2):
+    """checks if l2 is a subsequence of l1
+
+    True when every item of l2 occurs in l1 in the same relative order (not
+    necessarily adjacent); an empty l2 is a subsequence of anything.
+    """
+    # One shared iterator: each `in` test resumes where the previous one stopped.
+    remaining = iter(l1)
+    return all(item in remaining for item in l2)
+
+
 class DefaultDict(dict):
     def __init__(self, default, *args, **kwargs):
         self.default = default
@@ -65,6 +86,9 @@ def __init__(self, filename, newTestHeading="data", encoding="utf8"):
         self.encoding = encoding
         self.newTestHeading = newTestHeading
 
+    def __del__(self):
+        self.f.close()  # NOTE(review): assumes __init__ always set self.f - confirm
+
     def __iter__(self):
         data = DefaultDict(None)
         key = None
@@ -100,6 +124,120 @@ def normaliseOutput(self, data):
         return data
 
 
+def _getTestData(isUnicode):
+    """Return a reader class for text (isUnicode) or bytes test files."""
+    hash_mark, newline, empty = ("#", "\n", "") if isUnicode else (b"#", b"\n", b"")
+    class _TestData2(object):
+        def __init__(self, f, headings):
+            self.f = f
+            self.headings = headings
+
+        def __iter__(self):
+            newTestHeading = self.headings[0]
+            data = OrderedDict()
+            key = None
+            for line in self.f:
+                if line.startswith(hash_mark):
+                    heading = line[1:].strip()
+                    if data and heading == newTestHeading:
+                        self._normalize_newlines(data, last=False)
+                        yield (TestUnicode if isUnicode else TestBytes)(self.headings, data)
+                        data = OrderedDict()
+                    key = heading
+                    data[key] = empty
+                elif key is not None:
+                    data[key] += line
+            if data:
+                self._normalize_newlines(data, last=True)
+                yield (TestUnicode if isUnicode else TestBytes)(self.headings, data)
+
+        def _normalize_newlines(self, data, last):
+            for key, value in data.items():
+                if value:
+                    assert value.endswith(newline)  # not value[-1]: bytes index to int on py3
+                    data[key] = value[:-1]
+            if value and not last:  # final section also holds the blank separator line
+                assert value.endswith(newline * 2)
+                data[key] = value[:-2]
+
+    return _TestData2
+TestDataUnicode = _getTestData(True)
+TestDataBytes = _getTestData(False)
+
+
+class Test(object):
+    """One parsed test: maps each expected heading to its content (or None)."""
+
+    def __init__(self, headings, d):
+        if len(headings) != len(set(headings)):
+            raise ValueError("headings must not contain duplicates")
+        if not isSubsequence(headings, list(d.keys())):
+            raise ValueError("test headings must be a subsequence of expected headings, got %s, expected %s" % (list(d.keys()), headings))
+        if len(d) == len(headings):
+            self._d = d
+        else:
+            # Headings the test omits are stored as None, in heading order.
+            self._d = OrderedDict((heading, d.get(heading)) for heading in headings)
+
+    def __getitem__(self, k):
+        return self._d[k]
+
+    def __len__(self):
+        return len(self._d)
+
+    def __iter__(self):
+        return iter(self._d)
+
+    def __contains__(self, k):
+        return k in self._d
+
+    def keys(self):
+        return self._d.keys()
+
+    def items(self):
+        return self._d.items()
+
+    def values(self):
+        return self._d.values()
+
+    def get(self, k, d=None):
+        return self._d.get(k, d)
+
+    def __eq__(self, o):
+        if not isinstance(o, Test):
+            return NotImplemented
+        return self._d == o._d
+
+    def __ne__(self, o):
+        return not self == o
+
+    def __hash__(self):
+        # An items view (py3) or list (py2) is unhashable; a tuple of pairs is not.
+        return hash(tuple(self._d.items()))
+
+
+@py2_str_unicode
+class TestBytes(Test):
+    """A test whose headings and section contents are bytes."""
+
+    def __bytes__(self):
+        # Headings with no section in the source hold None (see Test.__init__)
+        # and are left out rather than breaking the join.
+        parts = [b"#" + h + b"\n" + c for h, c in self.items() if c is not None]
+        return b"\n".join(parts + [b""])
+
+
+@py2_str_unicode
+class TestUnicode(Test):
+    """A test whose headings and section contents are text."""
+
+    def __str__(self):  # py2_str_unicode only remaps __str__/__bytes__
+        # Headings with no section in the source hold None and are left out.
+        parts = ["#" + h + "\n" + c for h, c in self.items() if c is not None]
+        return "\n".join(parts + [""])
+    __unicode__ = __str__  # the original method name stays callable
+
+
 def convert(stripChars):
     def convertData(data):
         """convert the output of str(document) to the format used in the testcases"""
diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py
@@ -1,32 +1,34 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import codecs
 import warnings
 import re
 
 import pytest
 
-from .support import TestData, convert, convertExpected, treeTypes
+from .support import TestDataUnicode, convert, convertExpected, treeTypes
 from html5lib import html5parser, constants
 
 
 class TreeConstructionFile(pytest.File):
     def collect(self):
-        tests = TestData(str(self.fspath), "data")
-        for i, test in enumerate(tests):
-            for treeName, treeClass in sorted(treeTypes.items()):
-                for namespaceHTMLElements in (True, False):
-                    if namespaceHTMLElements:
-                        nodeid = "%d::%s::namespaced" % (i, treeName)
-                    else:
-                        nodeid = "%d::%s::void-namespace" % (i, treeName)
-                    item = ParserTest(nodeid, self,
-                                      test, treeClass, namespaceHTMLElements)
-                    item.add_marker(getattr(pytest.mark, treeName))
-                    if namespaceHTMLElements:
-                        item.add_marker(pytest.mark.namespaced)
-                    if treeClass is None:
-                        item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
-                    yield item
+        with codecs.open(str(self.fspath), "rb", encoding="utf-8") as fp:  # decoded to text despite "rb"
+            tests = TestDataUnicode(fp, ParserTest.headings)  # NOTE(review): codecs may split lines on \x0c etc. - confirm
+            for i, test in enumerate(tests):
+                for treeName, treeClass in sorted(treeTypes.items()):
+                    for namespaceHTMLElements in (True, False):  # one item per tree type and namespace mode
+                        if namespaceHTMLElements:
+                            nodeid = "%d::%s::namespaced" % (i, treeName)
+                        else:
+                            nodeid = "%d::%s::void-namespace" % (i, treeName)
+                        item = ParserTest(nodeid, self,
+                                          test, treeClass, namespaceHTMLElements)
+                        item.add_marker(getattr(pytest.mark, treeName))
+                        if namespaceHTMLElements:
+                            item.add_marker(pytest.mark.namespaced)
+                        if treeClass is None:
+                            item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
+                        yield item
 
 
 def convertTreeDump(data):
diff --git a/html5lib/utils.py b/html5lib/utils.py
@@ -1,8 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
+from copy import copy
 from types import ModuleType
 
-from six import text_type
+import six
 
 try:
     import xml.etree.cElementTree as default_etree
@@ -12,7 +13,7 @@
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
            "surrogatePairToCodepoint", "moduleFactoryFactory",
-           "supports_lone_surrogates"]
+           "supports_lone_surrogates", "py2_str_unicode"]
 
 
 # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
@@ -23,10 +24,10 @@
 # escapes.
 try:
     _x = eval('"\\uD800"')
-    if not isinstance(_x, text_type):
+    if not isinstance(_x, six.text_type):
         # We need this with u"" because of http://bugs.jython.org/issue2039
         _x = eval('u"\\uD800"')
-        assert isinstance(_x, text_type)
+        assert isinstance(_x, six.text_type)
 except:
     supports_lone_surrogates = False
 else:
@@ -109,3 +110,18 @@ def moduleFactory(baseModule, *args, **kwargs):
             return mod
 
     return moduleFactory
+
+
+def py2_str_unicode(klass):
+    """Class decorator mapping Python 3 __str__/__bytes__ onto Python 2 __unicode__/__str__."""
+    if not six.PY2:
+        return klass
+    klass = copy(klass)
+    own = klass.__dict__
+    if "__str__" in own:
+        klass.__unicode__ = klass.__str__  # saved before __str__ is replaced below
+    if "__bytes__" in own:
+        klass.__str__ = klass.__bytes__
+    elif "__str__" in own:
+        klass.__str__ = lambda self: self.__unicode__().encode("utf-8")
+    return klass