Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d8aa466

Browse files
committed
Add TestDataUnicode/TestDataBytes split; move tree_construction over
1 parent 1830d67 commit d8aa466

File tree

3 files changed

+177
-21
lines changed

3 files changed

+177
-21
lines changed

html5lib/tests/support.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,20 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
import collections
34
import os
45
import sys
56
import codecs
67
import glob
78
import xml.sax.handler
89

10+
from six import text_type
11+
from six.moves import range
12+
13+
try:
14+
from collections import OrderedDict
15+
except ImportError:
16+
from ordereddict import OrderedDict
17+
918
base_path = os.path.split(__file__)[0]
1019

1120
test_dir = os.path.join(base_path, 'testdata')
@@ -14,6 +23,7 @@
1423
os.path.pardir)))
1524

1625
from html5lib import treebuilders
26+
from html5lib.utils import py2_str_unicode
1727
del base_path
1828

1929
# Build a dict of available trees
@@ -47,6 +57,17 @@ def get_data_files(subdirectory, files='*.dat'):
4757
return sorted(glob.glob(os.path.join(test_dir, subdirectory, files)))
4858

4959

60+
def isSubsequence(l1, l2):
    """Check whether ``l2`` is a subsequence of ``l1``.

    ``l2`` is a subsequence when all of its items occur in ``l1`` in the
    same relative order, not necessarily next to each other.

    :arg l1: the iterable to search in
    :arg l2: the iterable of items that must be found, in order

    :returns: ``True`` if ``l2`` is a subsequence of ``l1`` (an empty ``l2``
        is a subsequence of anything), otherwise ``False``
    """
    remaining = iter(l1)
    # ``remaining`` is shared between the ``any`` calls: each search resumes
    # where the previous match stopped, which is what enforces the ordering.
    return all(any(candidate == wanted for candidate in remaining)
               for wanted in l2)
69+
70+
5071
class DefaultDict(dict):
5172
def __init__(self, default, *args, **kwargs):
5273
self.default = default
@@ -65,6 +86,9 @@ def __init__(self, filename, newTestHeading="data", encoding="utf8"):
6586
self.encoding = encoding
6687
self.newTestHeading = newTestHeading
6788

89+
def __del__(self):
    """Close the underlying file object when the reader is collected."""
    # ``__del__`` also runs for instances whose ``__init__`` raised before
    # ``self.f`` was assigned, so the attribute may be missing.
    f = getattr(self, "f", None)
    if f is not None:
        f.close()
91+
6892
def __iter__(self):
6993
data = DefaultDict(None)
7094
key = None
@@ -100,6 +124,120 @@ def normaliseOutput(self, data):
100124
return data
101125

102126

127+
def _getTestData(isUnicode):
    """Build a reader class for test files made of ``#heading`` sections.

    :arg isUnicode: ``True`` to build a reader for files yielding text lines
        (tests are yielded as ``TestUnicode``); ``False`` for files yielding
        byte lines (tests are yielded as ``TestBytes``)

    :returns: a class taking ``(f, headings)``, where ``f`` is an iterable of
        lines and ``headings`` the expected section headings; the first
        heading marks the start of a new test. Iterating an instance yields
        one test object per test found in ``f``.
    """
    # Choose the literals once so every comparison below uses the same
    # string type as the lines read from ``f``.
    sectionMarker = "#" if isUnicode else b"#"
    newline = "\n" if isUnicode else b"\n"
    empty = "" if isUnicode else b""

    class _TestData2(object):
        def __init__(self, f, headings):
            self.f = f
            self.headings = headings

        def __iter__(self):
            newTestHeading = self.headings[0]

            data = OrderedDict()
            key = None
            for line in self.f:
                if line.startswith(sectionMarker):
                    heading = line[1:].strip()
                    if data and heading == newTestHeading:
                        self._normalize_newlines(data, last=False)
                        yield self._makeTest(data)
                        data = OrderedDict()
                    key = heading
                    data[key] = empty
                elif key is not None:
                    # Lines before the first heading are ignored
                    data[key] += line
            if data:
                self._normalize_newlines(data, last=True)
                yield self._makeTest(data)

        def _makeTest(self, data):
            """Wrap the collected sections in the matching test class."""
            # Looked up at call time: the test classes are defined further
            # down the module than the readers built from this factory.
            if isUnicode:
                return TestUnicode(self.headings, data)
            return TestBytes(self.headings, data)

        def _normalize_newlines(self, data, last):
            """Strip the line terminators that are not part of the content.

            :arg data: mapping of heading to raw section text, modified in
                place
            :arg last: ``False`` when another test follows this one; the
                final section is then expected to end with one blank
                separator line, which is removed as well

            :raises AssertionError: if a section does not end as expected
            """
            if not data:
                return
            if not last:
                finalKey = list(data)[-1]
                assert data[finalKey].endswith(newline), \
                    "expected a blank line before the next test"
                data[finalKey] = data[finalKey][:-1]
            for key, value in list(data.items()):
                if value:
                    # ``endswith`` rather than ``value[-1] ==``: indexing
                    # bytes yields an int on Python 3.
                    assert value.endswith(newline), \
                        "section does not end with a newline"
                    data[key] = value[:-1]

    return _TestData2


TestDataUnicode = _getTestData(True)
TestDataBytes = _getTestData(False)
166+
167+
168+
class Test(object):
    """Mapping-like view of one test: section heading -> section content.

    :arg headings: the expected headings, in order and without duplicates
    :arg d: mapping of the headings actually present in the test to their
        content; its keys must be a subsequence of ``headings``

    Expected headings missing from ``d`` are exposed with the value
    ``None``. When ``d`` already has every heading it is stored as-is (not
    copied).

    :raises ValueError: if ``headings`` contains duplicates or the keys of
        ``d`` are not a subsequence of ``headings``
    """

    def __init__(self, headings, d):
        if len(headings) != len(set(headings)):
            raise ValueError("headings must not contain duplicates")
        if not isSubsequence(headings, list(d.keys())):
            raise ValueError("test headings must be a subsequence of expected headings, got %s, expected %s" % (list(d.keys()), headings))
        if len(d) == len(headings):
            self._d = d
        else:
            e = OrderedDict()
            for heading in headings:
                e[heading] = d.get(heading)
            self._d = e

    def __getitem__(self, k):
        return self._d[k]

    def __len__(self):
        return len(self._d)

    def __iter__(self):
        return iter(self._d)

    def __contains__(self, k):
        return k in self._d

    def keys(self):
        return self._d.keys()

    def items(self):
        return self._d.items()

    def values(self):
        return self._d.values()

    def get(self, k, d=None):
        return self._d.get(k, d)

    def __eq__(self, o):
        # Let Python try the reflected comparison (and finally fall back to
        # identity) instead of failing on objects without ``_d``.
        if not isinstance(o, Test):
            return NotImplemented
        return self._d == o._d

    def __ne__(self, o):
        if not isinstance(o, Test):
            return NotImplemented
        return self._d != o._d

    def __hash__(self):
        # ``items()`` itself is unhashable (a list on Python 2, a view on
        # Python 3). A frozenset keeps the hash equal for any two mappings
        # that compare equal, whatever their iteration order.
        return hash(frozenset(self._d.items()))
217+
218+
219+
@py2_str_unicode
class TestBytes(Test):
    """A test whose headings and section contents are byte strings."""

    def __bytes__(self):
        """Serialise the test back into ``#heading`` sections.

        Every section is written as its heading line, its content and one
        empty line, joined with newlines.
        """
        r = []
        for heading, content in self.items():
            if content is None:
                # ``Test`` stores None for expected headings that the test
                # does not contain; there is nothing to write for those and
                # joining None would raise TypeError.
                continue
            r.append(b"#" + heading)
            r.append(content)
            r.append(b"")
        return b"\n".join(r)
228+
229+
230+
@py2_str_unicode
class TestUnicode(Test):
    """A test whose headings and section contents are text strings."""

    def __str__(self):
        """Serialise the test back into ``#heading`` sections.

        Every section is written as its heading line, its content and one
        empty line, joined with newlines.
        """
        r = []
        for heading, content in self.items():
            if content is None:
                # ``Test`` stores None for expected headings that the test
                # does not contain; there is nothing to write for those and
                # joining None would raise TypeError.
                continue
            r.append("#" + heading)
            r.append(content)
            r.append("")
        return "\n".join(r)

    # ``py2_str_unicode`` only looks for ``__str__``/``__bytes__`` on the
    # class, so the text form has to be defined as ``__str__``; the old
    # method name is kept as an alias for code calling it directly.
    __unicode__ = __str__
239+
240+
103241
def convert(stripChars):
104242
def convertData(data):
105243
"""convert the output of str(document) to the format used in the testcases"""

html5lib/tests/tree_construction.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,34 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
import codecs
34
import warnings
45
import re
56

67
import pytest
78

8-
from .support import TestData, convert, convertExpected, treeTypes
9+
from .support import TestDataUnicode, convert, convertExpected, treeTypes
910
from html5lib import html5parser, constants
1011

1112

1213
class TreeConstructionFile(pytest.File):
1314
def collect(self):
14-
tests = TestData(str(self.fspath), "data")
15-
for i, test in enumerate(tests):
16-
for treeName, treeClass in sorted(treeTypes.items()):
17-
for namespaceHTMLElements in (True, False):
18-
if namespaceHTMLElements:
19-
nodeid = "%d::%s::namespaced" % (i, treeName)
20-
else:
21-
nodeid = "%d::%s::void-namespace" % (i, treeName)
22-
item = ParserTest(nodeid, self,
23-
test, treeClass, namespaceHTMLElements)
24-
item.add_marker(getattr(pytest.mark, treeName))
25-
if namespaceHTMLElements:
26-
item.add_marker(pytest.mark.namespaced)
27-
if treeClass is None:
28-
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
29-
yield item
15+
with codecs.open(str(self.fspath), "rb", encoding="utf-8") as fp:
16+
tests = TestDataUnicode(fp, ParserTest.headings)
17+
for i, test in enumerate(tests):
18+
for treeName, treeClass in sorted(treeTypes.items()):
19+
for namespaceHTMLElements in (True, False):
20+
if namespaceHTMLElements:
21+
nodeid = "%d::%s::namespaced" % (i, treeName)
22+
else:
23+
nodeid = "%d::%s::void-namespace" % (i, treeName)
24+
item = ParserTest(nodeid, self,
25+
test, treeClass, namespaceHTMLElements)
26+
item.add_marker(getattr(pytest.mark, treeName))
27+
if namespaceHTMLElements:
28+
item.add_marker(pytest.mark.namespaced)
29+
if treeClass is None:
30+
item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
31+
yield item
3032

3133

3234
def convertTreeDump(data):

html5lib/utils.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
from copy import copy
34
from types import ModuleType
45

5-
from six import text_type
6+
import six
67

78
try:
89
import xml.etree.cElementTree as default_etree
@@ -12,7 +13,7 @@
1213

1314
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
1415
"surrogatePairToCodepoint", "moduleFactoryFactory",
15-
"supports_lone_surrogates"]
16+
"supports_lone_surrogates", "py2_str_unicode"]
1617

1718

1819
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
@@ -23,10 +24,10 @@
2324
# escapes.
2425
try:
2526
_x = eval('"\\uD800"')
26-
if not isinstance(_x, text_type):
27+
if not isinstance(_x, six.text_type):
2728
# We need this with u"" because of http://bugs.jython.org/issue2039
2829
_x = eval('u"\\uD800"')
29-
assert isinstance(_x, text_type)
30+
assert isinstance(_x, six.text_type)
3031
except:
3132
supports_lone_surrogates = False
3233
else:
@@ -109,3 +110,18 @@ def moduleFactory(baseModule, *args, **kwargs):
109110
return mod
110111

111112
return moduleFactory
113+
114+
115+
def py2_str_unicode(klass):
    """Class decorator: sorts out __str__/__unicode__ from __bytes__/__str__

    The class is written with Python 3 method names. When running on
    Python 2 the methods defined directly on ``klass`` are remapped:

    * ``__str__`` is made available as ``__unicode__``;
    * ``__bytes__``, when defined, becomes ``__str__``; otherwise a
      ``__str__`` returning the UTF-8 encoded ``__unicode__`` is installed.

    On any other Python version ``klass`` is returned untouched.
    """
    if not six.PY2:
        return klass

    # NOTE(review): the ``copy`` module documents that classes are returned
    # unchanged, so the attributes below appear to be set on the decorated
    # class itself -- confirm before relying on a separate copy.
    klass = copy(klass)

    # Only methods defined on the class itself count, not inherited ones
    ownStr = "__str__" in klass.__dict__
    ownBytes = "__bytes__" in klass.__dict__

    # This has to come first so we don't lose __str__
    if ownStr:
        klass.__unicode__ = klass.__str__

    # Then find the right thing for __str__
    if ownBytes:
        klass.__str__ = klass.__bytes__
    elif ownStr:
        klass.__str__ = lambda self: self.__unicode__().encode("utf-8")

    return klass

0 commit comments

Comments
 (0)