diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 9e03b931..674fabe2 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -1,6 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
-from six.moves import http_client
import codecs
import re
@@ -119,22 +118,23 @@ def _readFromBuffer(self, bytes):
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
- if isinstance(source, http_client.HTTPResponse):
- # Work around Python bug #20007: read(0) closes the connection.
+ if hasattr(source, "read"):
+ # Do not use .read(0) because of Python bug #20007
# http://bugs.python.org/issue20007
- isUnicode = False
- elif hasattr(source, "read"):
- isUnicode = isinstance(source.read(0), text_type)
+ firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize)
+ isUnicode = isinstance(firstChunk, text_type)
else:
isUnicode = isinstance(source, text_type)
+ firstChunk = None
if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
- return HTMLUnicodeInputStream(source)
+ return HTMLUnicodeInputStream(source, firstChunk)
else:
- return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
+ return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta,
+ chardet)
class HTMLUnicodeInputStream(object):
@@ -147,7 +147,7 @@ class HTMLUnicodeInputStream(object):
_defaultChunkSize = 10240
- def __init__(self, source):
+ def __init__(self, source, firstChunk=""):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -163,6 +163,7 @@ def __init__(self, source):
parseMeta - Look for a <meta> element containing encoding information
"""
+ # XXX do something with firstChunk
# Craziness
if len("\U0010FFFF") == 1:
@@ -378,7 +379,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""
- def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+ def __init__(self, source, firstChunk=b"", encoding=None,
+ parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -394,6 +396,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
parseMeta - Look for a <meta> element containing encoding information
"""
+ # XXX do something with firstChunk
+
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 2a876c1d..b769c6e1 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -1,3 +1,5 @@
+# coding: utf8
+
from __future__ import absolute_import, division, unicode_literals
from . import support # flake8: noqa
@@ -6,6 +8,7 @@
from io import BytesIO
from six.moves import http_client
+from six.moves.urllib.response import addinfourl
from html5lib.inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
@@ -156,6 +159,25 @@ def test_position2(self):
self.assertEqual(stream.char(), "d")
self.assertEqual(stream.position(), (2, 1))
+ def test_non_seekable_stream(self):
+ class Stream(object):
+ def __init__(self, data):
+ self.data = data
+
+ def read(self, n=None):
+ if n is None:
+ data = self.data
+ self.data = b''
+ return data
+ else:
+ data = self.data[:n]
+ self.data = self.data[n:]
+ return data
+
+ # Fails when firstChunk is ignored
+ stream = HTMLInputStream(Stream(b"Test"))
+ self.assertEqual(stream.charsUntil(" "), "Test")
+
def test_python_issue_20007(self):
"""
Make sure we have a work-around for Python bug #20007
@@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None):
stream = HTMLInputStream(source)
self.assertEqual(stream.charsUntil(" "), "Text")
+ def test_python_issue_20007_addinfourl(self):
+ """
+ Same as above, but the source is not necessarily an instance
+ of HTTPResponse.
+ """
+ class FakeSocket(object):
+ def makefile(self, _mode, _bufsize=None):
+ return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
+
+ source = http_client.HTTPResponse(FakeSocket())
+ source.begin()
+ try:
+ source = addinfourl(source, None, None)
+ except AttributeError:
+ # Fails on Python 2.x where HTTPResponse does not have .readline()
+ # Apparently, addinfourl is only used with HTTPResponse on 3.x
+ pass
+ else:
+ stream = HTMLInputStream(source)
+ self.assertEqual(stream.charsUntil(" "), "Text")
def buildTestSuite():
return unittest.defaultTestLoader.loadTestsFromName(__name__)