Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

WIP More general fix for #127 with addinfourl #136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions html5lib/inputstream.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six.moves import http_client

import codecs
import re
Expand Down Expand Up @@ -119,22 +118,23 @@ def _readFromBuffer(self, bytes):


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
if isinstance(source, http_client.HTTPResponse):
# Work around Python bug #20007: read(0) closes the connection.
if hasattr(source, "read"):
# Do not use .read(0) because of Python bug #20007
# http://bugs.python.org/issue20007
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize)
isUnicode = isinstance(firstChunk, text_type)
else:
isUnicode = isinstance(source, text_type)
firstChunk = None

if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")

return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, firstChunk)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta,
chardet)


class HTMLUnicodeInputStream(object):
Expand All @@ -147,7 +147,7 @@ class HTMLUnicodeInputStream(object):

_defaultChunkSize = 10240

def __init__(self, source):
def __init__(self, source, firstChunk=""):
"""Initialises the HTMLInputStream.

HTMLInputStream(source, [encoding]) -> Normalized stream from source
Expand All @@ -163,6 +163,7 @@ def __init__(self, source):
parseMeta - Look for a <meta> element containing encoding information

"""
# XXX do something with firstChunk

# Craziness
if len("\U0010FFFF") == 1:
Expand Down Expand Up @@ -378,7 +379,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

"""

def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, firstChunk=b"", encoding=None,
parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.

HTMLInputStream(source, [encoding]) -> Normalized stream from source
Expand All @@ -394,6 +396,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
parseMeta - Look for a <meta> element containing encoding information

"""
# XXX do something with firstChunk

# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
Expand Down
42 changes: 42 additions & 0 deletions html5lib/tests/test_stream.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# coding: utf8

from __future__ import absolute_import, division, unicode_literals

from . import support # flake8: noqa
Expand All @@ -6,6 +8,7 @@
from io import BytesIO

from six.moves import http_client
from six.moves.urllib.response import addinfourl

from html5lib.inputstream import (BufferedStream, HTMLInputStream,
HTMLUnicodeInputStream, HTMLBinaryInputStream)
Expand Down Expand Up @@ -156,6 +159,25 @@ def test_position2(self):
self.assertEqual(stream.char(), "d")
self.assertEqual(stream.position(), (2, 1))

def test_non_seekable_stream(self):
class Stream(object):
def __init__(self, data):
self.data = data

def read(self, n=None):
if n is None:
data = self.data
self.data = b''
return data
else:
data = self.data[:n]
self.data = self.data[n:]
return data

# Fails when firstChunk is ignored
stream = HTMLInputStream(Stream(b"Test"))
self.assertEqual(stream.charsUntil(" "), "Test")

def test_python_issue_20007(self):
"""
Make sure we have a work-around for Python bug #20007
Expand All @@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None):
stream = HTMLInputStream(source)
self.assertEqual(stream.charsUntil(" "), "Text")

def test_python_issue_20007_addinfourl(self):
"""
Same as above, but the source is not necessarily an instance
of HTTPResponse.
"""
class FakeSocket(object):
def makefile(self, _mode, _bufsize=None):
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")

source = http_client.HTTPResponse(FakeSocket())
source.begin()
try:
source = addinfourl(source, None, None)
except AttributeError:
# Fails on Python 2.x where HTTPResponse does not have .readline()
# Apparently, addinfourl is only used with HTTPResponse on 3.x
pass
else:
stream = HTMLInputStream(source)
self.assertEqual(stream.charsUntil(" "), "Text")

def buildTestSuite():
return unittest.defaultTestLoader.loadTestsFromName(__name__)
Expand Down