From a5e861a5041c2c18893a1f916b5bd446b2a5bc06 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 31 Dec 2013 10:22:51 +0100 Subject: [PATCH 1/2] WIP More general fix for #127 with addinfourl See #134. --- html5lib/inputstream.py | 25 ++++++++++++--------- html5lib/tests/test_stream.py | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..339005e9 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -from six.moves import http_client import codecs import re @@ -119,22 +118,24 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): - if isinstance(source, http_client.HTTPResponse): - # Work around Python bug #20007: read(0) closes the connection. + if hasattr(source, "read"): + # Do no use .read(0) because of Python bug #20007 # http://bugs.python.org/issue20007 - isUnicode = False - elif hasattr(source, "read"): - isUnicode = isinstance(source.read(0), text_type) + firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize) + print(firstChunk) + isUnicode = isinstance(firstChunk, text_type) else: isUnicode = isinstance(source, text_type) + firstChunk = "" if isUnicode else b"" if isUnicode: if encoding is not None: raise TypeError("Cannot explicitly set an encoding with a unicode string") - return HTMLUnicodeInputStream(source) + return HTMLUnicodeInputStream(source, firstChunk) else: - return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) + return HTMLBinaryInputStream( + source, firstChunk, encoding, parseMeta, chardet) class HTMLUnicodeInputStream(object): @@ -147,7 +148,7 @@ class HTMLUnicodeInputStream(object): _defaultChunkSize = 10240 - def __init__(self, source): + def __init__(self, source, firstChunk=""): """Initialises the HTMLInputStream. 
HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -163,6 +164,7 @@ def __init__(self, source): parseMeta - Look for a <meta> element containing encoding information """ + # XXX do something with firstChunk # Craziness if len("\U0010FFFF") == 1: @@ -378,7 +380,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): """ - def __init__(self, source, encoding=None, parseMeta=True, chardet=True): + def __init__(self, source, firstChunk=b"", encoding=None, + parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -394,6 +397,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): parseMeta - Look for a <meta> element containing encoding information """ + # XXX do something with firstChunk + # Raw Stream - for unicode objects this will encode to utf-8 and set # self.charEncoding as appropriate self.rawStream = self.openStream(source) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 2a876c1d..b769c6e1 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,3 +1,5 @@ +# coding: utf8 + from __future__ import absolute_import, division, unicode_literals from . 
import support # flake8: noqa @@ -6,6 +8,7 @@ from io import BytesIO from six.moves import http_client +from six.moves.urllib.response import addinfourl from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) @@ -156,6 +159,25 @@ def test_position2(self): self.assertEqual(stream.char(), "d") self.assertEqual(stream.position(), (2, 1)) + def test_non_seekable_stream(self): + class Stream(object): + def __init__(self, data): + self.data = data + + def read(self, n=None): + if n is None: + data = self.data + self.data = b'' + return data + else: + data = self.data[:n] + self.data = self.data[n:] + return data + + # Fails when firstChunk is ignored + stream = HTMLInputStream(Stream(b"Test")) + self.assertEqual(stream.charsUntil(" "), "Test") + def test_python_issue_20007(self): """ Make sure we have a work-around for Python bug #20007 @@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) self.assertEqual(stream.charsUntil(" "), "Text") + def test_python_issue_20007_addinfourl(self): + """ + Same as above, but the source is not necessarily an instance + of HTTPResponse. + """ + class FakeSocket(object): + def makefile(self, _mode, _bufsize=None): + return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") + + source = http_client.HTTPResponse(FakeSocket()) + source.begin() + try: + source = addinfourl(source, None, None) + except AttributeError: + # Fails on Python 2.x where HTTPResponse does not have .readline() + # Apparently, addinfourl is only used with HTTPResponse on 3.x + pass + else: + stream = HTMLInputStream(source) + self.assertEqual(stream.charsUntil(" "), "Text") def buildTestSuite(): return unittest.defaultTestLoader.loadTestsFromName(__name__) From b4861c60184bf73cc40774888fd01cbb5b3ddcef Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 31 Dec 2013 17:09:27 +0100 Subject: [PATCH 2/2] Address some review comments. 
--- html5lib/inputstream.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 339005e9..674fabe2 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -119,14 +119,13 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): if hasattr(source, "read"): - # Do no use .read(0) because of Python bug #20007 + # Do not use .read(0) because of Python bug #20007 # http://bugs.python.org/issue20007 firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize) - print(firstChunk) isUnicode = isinstance(firstChunk, text_type) else: isUnicode = isinstance(source, text_type) - firstChunk = "" if isUnicode else b"" + firstChunk = None if isUnicode: if encoding is not None: @@ -134,8 +133,8 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): return HTMLUnicodeInputStream(source, firstChunk) else: - return HTMLBinaryInputStream( - source, firstChunk, encoding, parseMeta, chardet) + return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta, + chardet) class HTMLUnicodeInputStream(object):