html5lib · SimonSapin · Dec 31, 2013 · Dec 31, 2013
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -1,6 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
-from six.moves import http_client
 
 import codecs
 import re
@@ -119,22 +118,23 @@ def _readFromBuffer(self, bytes):
 
 
 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
-    if isinstance(source, http_client.HTTPResponse):
-        # Work around Python bug #20007: read(0) closes the connection.
+    if hasattr(source, "read"):
+        # Do not use .read(0) because of Python bug #20007
         # http://bugs.python.org/issue20007
-        isUnicode = False
-    elif hasattr(source, "read"):
-        isUnicode = isinstance(source.read(0), text_type)
+        firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize)
+        isUnicode = isinstance(firstChunk, text_type)
     else:
         isUnicode = isinstance(source, text_type)
+        firstChunk = None
 
     if isUnicode:
         if encoding is not None:
             raise TypeError("Cannot explicitly set an encoding with a unicode string")
 
-        return HTMLUnicodeInputStream(source)
+        return HTMLUnicodeInputStream(source, firstChunk)
     else:
-        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
+        return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta,
+                                     chardet)
 
 
 class HTMLUnicodeInputStream(object):
@@ -147,7 +147,7 @@ class HTMLUnicodeInputStream(object):
 
     _defaultChunkSize = 10240
 
-    def __init__(self, source):
+    def __init__(self, source, firstChunk=""):
         """Initialises the HTMLInputStream.
 
         HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -163,6 +163,7 @@ def __init__(self, source):
         parseMeta - Look for a <meta> element containing encoding information
 
         """
+        # XXX firstChunk (already read from source) is still ignored here
 
         # Craziness
         if len("\U0010FFFF") == 1:
@@ -378,7 +379,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
 
     """
 
-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+    def __init__(self, source, firstChunk=b"", encoding=None,
+                 parseMeta=True, chardet=True):
         """Initialises the HTMLInputStream.
 
         HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -394,6 +396,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         parseMeta - Look for a <meta> element containing encoding information
 
         """
+        # XXX firstChunk (already read from source) is still ignored here
+
         # Raw Stream - for unicode objects this will encode to utf-8 and set
         #              self.charEncoding as appropriate
         self.rawStream = self.openStream(source)

diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
@@ -1,3 +1,5 @@
+# coding: utf8
+
 from __future__ import absolute_import, division, unicode_literals
 
 from . import support  # flake8: noqa
@@ -6,6 +8,7 @@
 from io import BytesIO
 
 from six.moves import http_client
+from six.moves.urllib.response import addinfourl
 
 from html5lib.inputstream import (BufferedStream, HTMLInputStream,
                                   HTMLUnicodeInputStream, HTMLBinaryInputStream)
@@ -156,6 +159,25 @@ def test_position2(self):
         self.assertEqual(stream.char(), "d")
         self.assertEqual(stream.position(), (2, 1))
 
+    def test_non_seekable_stream(self):
+        class Stream(object):
+            def __init__(self, data):
+                self.data = data
+
+            def read(self, n=None):
+                if n is None:
+                    data = self.data
+                    self.data = b''
+                    return data
+                else:
+                    data = self.data[:n]
+                    self.data = self.data[n:]
+                    return data
+
+        # Fails when firstChunk is ignored
+        stream = HTMLInputStream(Stream(b"Test"))
+        self.assertEqual(stream.charsUntil(" "), "Test")
+
     def test_python_issue_20007(self):
         """
         Make sure we have a work-around for Python bug #20007
@@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None):
         stream = HTMLInputStream(source)
         self.assertEqual(stream.charsUntil(" "), "Text")
 
+    def test_python_issue_20007_addinfourl(self):
+        """
+        Same as above, but the source is not necessarily an instance
+        of HTTPResponse.
+        """
+        class FakeSocket(object):
+            def makefile(self, _mode, _bufsize=None):
+                return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
+
+        source = http_client.HTTPResponse(FakeSocket())
+        source.begin()
+        try:
+            source = addinfourl(source, None, None)
+        except AttributeError:
+            # Fails on Python 2.x where HTTPResponse does not have .readline()
+            # Apparently, addinfourl is only used with HTTPResponse on 3.x
+            pass
+        else:
+            stream = HTMLInputStream(source)
+            self.assertEqual(stream.charsUntil(" "), "Text")
 
 def buildTestSuite():
     return unittest.defaultTestLoader.loadTestsFromName(__name__)