From 1be9532f79fd7744be0945c4ab42d2f5b41e4e73 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:23:01 -0500 Subject: [PATCH 001/147] Added iframe seamless boolean attribute --- html5lib/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..659f2b5e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -535,6 +535,7 @@ "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), "output": frozenset(("disabled", "readonly")), + "iframe": frozenset(("seamless",)), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It From 4dfe3cd9f97ce51c53463d633308f4a3fe6ad9e6 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:04 -0500 Subject: [PATCH 002/147] Update CHANGES.rst --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1431b3c9..89e48f94 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,7 @@ Change Log Released on XXX, 2014 -* XXX +* Fix #XXX: added the seamless attribute for iframes. 0.999 From 7fd79e31e083ab75305b3e837ea9aa8c9b4675ff Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:28 -0500 Subject: [PATCH 003/147] Update AUTHORS.rst --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..787c3b94 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Ritwik Gupta From 983a9355ea66a8c1626a42fd0682b48e246685bd Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 15 Jul 2016 02:24:33 +0100 Subject: [PATCH 004/147] And back to dev. 
--- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 8ee9b53e..f3cd9455 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -22,4 +22,4 @@ "getTreeWalker", "serialize"] # this has to be at the top level, see how setup.py parses this -__version__ = "0.999999999" +__version__ = "0.9999999999-dev" From 6cd93c82f5c0a09bc0a3ccab214d87537c5e60c2 Mon Sep 17 00:00:00 2001 From: John Vandenberg Date: Fri, 22 Jul 2016 09:11:31 +0700 Subject: [PATCH 005/147] Monkeypatch pkg_resources to always use _markerlib Prior to setuptools 20.10.0 there is patchy support for environment markers, and setup.py fails while parsing them. html5lib requires at least setuptools 18.5 for its environment markers. However, @gsnedders developed a way to monkey patch pkg_resources so that it always uses _markerlib, which allows all environment markers to be used for any version of setuptools. Some patching of _markerlib is also required so that it works on Python 3 also. On removing the dependency for setuptools>=18.5, pip partially fails on Python 2.6 with an error `Double requirement given: ordereddict` and does not install the requirements.txt. 
Fixed by removing ordereddict from requirements-test.txt --- requirements-install.sh | 3 --- requirements-test.txt | 1 - requirements.txt | 1 - setup.py | 53 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/requirements-install.sh b/requirements-install.sh index 9b28888a..cd693444 100755 --- a/requirements-install.sh +++ b/requirements-install.sh @@ -5,9 +5,6 @@ if [[ $USE_OPTIONAL != "true" && $USE_OPTIONAL != "false" ]]; then exit 1 fi -# Make sure we're running setuptools >= 18.5 -pip install -U pip setuptools>=18.5 - pip install -U -r requirements-test.txt if [[ $USE_OPTIONAL == "true" ]]; then diff --git a/requirements-test.txt b/requirements-test.txt index e24223ef..e1ad307d 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,4 +4,3 @@ flake8 pytest pytest-expect>=1.1,<2.0 mock -ordereddict ; python_version < '2.7' diff --git a/requirements.txt b/requirements.txt index 92c09036..745993b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ six webencodings ordereddict ; python_version < '2.7' -setuptools>=18.5 diff --git a/setup.py b/setup.py index 7c419e2c..00fee241 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,54 @@ from setuptools import setup, find_packages, __version__ as setuptools_version from pkg_resources import parse_version -if parse_version(setuptools_version) < parse_version("18.5"): - print("html5lib requires setuptools version 18.5 or above; " - "please upgrade before installing (you have %s)" % setuptools_version) - sys.exit(1) +import pkg_resources + +try: + import _markerlib.markers +except ImportError: + _markerlib = None + + +# _markerlib.default_environment() obtains its data from _VARS +# and wraps it in another dict, but _markerlib_evaluate writes +# to the dict while it is iterating the keys, causing an error +# on Python 3 only. 
+# Replace _markerlib.default_environment to return a custom dict +# that has all the necessary markers, and ignores any writes. + +class Python3MarkerDict(dict): + + def __setitem__(self, key, value): + pass + + def pop(self, i=-1): + return self[i] + + +if _markerlib and sys.version_info[0] == 3: + env = _markerlib.markers._VARS + for key in list(env.keys()): + new_key = key.replace('.', '_') + if new_key != key: + env[new_key] = env[key] + + _markerlib.markers._VARS = Python3MarkerDict(env) + + def default_environment(): + return _markerlib.markers._VARS + + _markerlib.default_environment = default_environment + +# Avoid the very buggy pkg_resources.parser, which doesn't consistently +# recognise the markers needed by this setup.py +# Change this to setuptools 20.10.0 to support all markers. +if pkg_resources: + if parse_version(setuptools_version) < parse_version('18.5'): + MarkerEvaluation = pkg_resources.MarkerEvaluation + + del pkg_resources.parser + pkg_resources.evaluate_marker = MarkerEvaluation._markerlib_evaluate + MarkerEvaluation.evaluate_marker = MarkerEvaluation._markerlib_evaluate classifiers = [ 'Development Status :: 5 - Production/Stable', @@ -60,7 +104,6 @@ install_requires=[ 'six', 'webencodings', - 'setuptools>=18.5' ], extras_require={ # A empty extra that only has a conditional marker will be From a3022dcea691780d300547bbf68b4dd921995d1c Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 26 Jul 2016 14:19:01 +0100 Subject: [PATCH 006/147] Require flake8 to be < 3.0 for Python 2.6 support (#291) --- requirements-test.txt | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index e1ad307d..40df78d4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,6 @@ -r requirements.txt -flake8 +flake8<3.0 pytest pytest-expect>=1.1,<2.0 mock diff --git a/tox.ini b/tox.ini index efaea775..da64de71 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,7 @@ 
envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional} [testenv] deps = - flake8 + flake8<3.0 pytest pytest-expect>=1.1,<2.0 mock From ea0fafdbff732b1272140b696d6948054ed1d6d2 Mon Sep 17 00:00:00 2001 From: John Vandenberg Date: Fri, 16 Sep 2016 04:34:00 +0700 Subject: [PATCH 007/147] Remove redundant submodule update (#302) Travis CI automatically updates submodules during initialisation. --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 94bb87e7..6f6be0f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,9 +17,6 @@ env: - USE_OPTIONAL=true - USE_OPTIONAL=false -before_install: - - git submodule update --init --recursive - install: - bash requirements-install.sh From ff6111cd82191a2eb963d6d662c6da8fa2e7ddde Mon Sep 17 00:00:00 2001 From: Eric Amorde Date: Thu, 27 Oct 2016 14:33:26 -0700 Subject: [PATCH 008/147] Declare explicit dependency on Six 1.9 (#301) --- .travis.yml | 1 + requirements-install.sh | 4 ++++ requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6f6be0f1..09ef5985 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ cache: env: - USE_OPTIONAL=true - USE_OPTIONAL=false + - SIX_VERSION=1.9 USE_OPTIONAL=true install: - bash requirements-install.sh diff --git a/requirements-install.sh b/requirements-install.sh index cd693444..0be226a6 100755 --- a/requirements-install.sh +++ b/requirements-install.sh @@ -11,6 +11,10 @@ if [[ $USE_OPTIONAL == "true" ]]; then pip install -U -r requirements-optional.txt fi +if [[ $SIX_VERSION != "false" ]]; then + pip install six==$SIX_VERSION +fi + if [[ $CI == "true" ]]; then pip install -U codecov fi diff --git a/requirements.txt b/requirements.txt index 745993b9..3884556f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -six +six>=1.9 webencodings ordereddict ; python_version < '2.7' diff --git a/setup.py b/setup.py index 00fee241..6f0fc17e 100644 --- 
a/setup.py +++ b/setup.py @@ -102,7 +102,7 @@ def default_environment(): maintainer_email='james@hoppipolla.co.uk', packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), install_requires=[ - 'six', + 'six>=1.9', 'webencodings', ], extras_require={ From 1a28d721091a2c433c6e8471d14cbb75afd70d1c Mon Sep 17 00:00:00 2001 From: Jon Dufresne Date: Thu, 27 Oct 2016 15:01:15 -0700 Subject: [PATCH 009/147] Fix typo in docs. (#300) Statement was repeated twice on a single line. --- doc/movingparts.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 36539785..80ee2ad1 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -136,7 +136,7 @@ To use a filter, simply wrap it around a stream: >>> dom = html5lib.parse("

") - -HTMLTokenizer -~~~~~~~~~~~~~ - -This is the default tokenizer, the heart of html5lib. The implementation -can be found in `html5lib/tokenizer.py -`_. - -HTMLSanitizer -~~~~~~~~~~~~~ - -This is a tokenizer that removes unsafe markup and CSS styles from the -input. Elements that are known to be safe are passed through and the -rest is converted to visible text. The default configuration of the -sanitizer follows the `WHATWG Sanitization Rules -`_. - -The implementation can be found in `html5lib/sanitizer.py -`_. From 85540983f6285c82f2a1c4a8d756ae58d0c1e713 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:40:10 -0700 Subject: [PATCH 020/147] Fix Sphinx title underline warnings --- doc/html5lib.rst | 2 +- doc/html5lib.treewalkers.rst | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/html5lib.rst b/doc/html5lib.rst index 22af7728..44e34573 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -24,7 +24,7 @@ html5lib Package :show-inheritance: :mod:`serializer` Module ----------------------- +------------------------ .. automodule:: html5lib.serializer :members: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 46501258..085d8a98 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -10,7 +10,7 @@ treewalkers Package :show-inheritance: :mod:`base` Module -------------------- +------------------ .. automodule:: html5lib.treewalkers.base :members: @@ -34,7 +34,7 @@ treewalkers Package :show-inheritance: :mod:`etree_lxml` Module ------------------------ +------------------------ .. automodule:: html5lib.treewalkers.etree_lxml :members: @@ -43,9 +43,9 @@ treewalkers Package :mod:`genshi` Module --------------------------- +-------------------- .. 
automodule:: html5lib.treewalkers.genshi :members: :undoc-members: - :show-inheritance: \ No newline at end of file + :show-inheritance: From c8fca0ecc7c704995947601e03da0c34a85ecdf5 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:50:52 -0700 Subject: [PATCH 021/147] Open in binary mode for Python 3 --- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 745b9342..b1970d29 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -7,7 +7,7 @@ Example usage:: import html5lib - with open("my_document.html") as f: + with open("my_document.html", "rb") as f: tree = html5lib.parse(f) For convenience, this module re-exports the following names: From 637826ffa72ca982dff6ae7204e4afcc35f3e29e Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 11:51:16 -0700 Subject: [PATCH 022/147] Update and expand "moving parts" doc --- doc/movingparts.rst | 65 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 3eeff4f2..1f3086cb 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -4,22 +4,25 @@ The moving parts html5lib consists of a number of components, which are responsible for handling its features. +Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document. +Several tree representations are supported, as are translations to other formats via *tree adapters*. +The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes. +The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization. Tree builders ------------- The parser reads HTML by tokenizing the content and building a tree that -the user can later access. 
There are three main types of trees that -html5lib can build: +the user can later access. html5lib can build three types of trees: -* ``etree`` - this is the default; builds a tree based on ``xml.etree``, +* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`, which can be found in the standard library. Whenever possible, the accelerated ``ElementTree`` implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x) is used. -* ``dom`` - builds a tree based on ``xml.dom.minidom``. +* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`. -* ``lxml.etree`` - uses lxml's implementation of the ``ElementTree`` +* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree`` API. The performance gains are relatively small compared to using the accelerated ``ElementTree`` module. @@ -31,21 +34,15 @@ You can specify the builder by name when using the shorthand API: with open("mydocument.html", "rb") as f: lxml_etree_document = html5lib.parse(f, treebuilder="lxml") -When instantiating a parser object, you have to pass a tree builder -class in the ``tree`` keyword attribute: +To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function. -.. code-block:: python - - import html5lib - parser = html5lib.HTMLParser(tree=SomeTreeBuilder) - document = parser.parse("

Hello World!") - -To get a builder class by name, use the ``getTreeBuilder`` function: +When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword argument: .. code-block:: python import html5lib - parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + TreeBuilder = html5lib.getTreeBuilder("dom") + parser = html5lib.HTMLParser(tree=TreeBuilder) minidom_document = parser.parse("

Hello World!") The implementation of builders can be found in `html5lib/treebuilders/ @@ -55,17 +52,16 @@ The implementation of builders can be found in `html5lib/treebuilders/ Tree walkers ------------ -Once a tree is ready, you can work on it either manually, or using -a tree walker, which provides a streaming view of the tree. html5lib -provides walkers for all three supported types of trees (``etree``, -``dom`` and ``lxml``). +In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it. +html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams `_. The implementation of walkers can be found in `html5lib/treewalkers/ `_. -Walkers make consuming HTML easier. html5lib uses them to provide you -with has a couple of handy tools. +html5lib provides a few tools for consuming token streams: +* :class:`~html5lib.serializer.HTMLSerializer`, to generate a stream of bytes; and +* filters, to manipulate the token stream. HTMLSerializer ~~~~~~~~~~~~~~ @@ -90,15 +86,14 @@ The serializer lets you write HTML back as a stream of bytes. '>' 'Witam wszystkich' -You can customize the serializer behaviour in a variety of ways, consult -the :class:`~html5lib.serializer.htmlserializer.HTMLSerializer` -documentation. +You can customize the serializer behaviour in a variety of ways. Consult +the :class:`~html5lib.serializer.HTMLSerializer` documentation. Filters ~~~~~~~ -You can alter the stream content with filters provided by html5lib: +html5lib provides several filters: * :class:`alphabeticalattributes.Filter ` sorts attributes on @@ -110,11 +105,11 @@ You can alter the stream content with filters provided by html5lib: the document * :class:`lint.Filter ` raises - ``LintError`` exceptions on invalid tag and attribute names, invalid + :exc:`AssertionError` exceptions on invalid tag and attribute names, invalid PCDATA, etc. 
* :class:`optionaltags.Filter ` - removes tags from the stream which are not necessary to produce valid + removes tags from the token stream which are not necessary to produce valid HTML * :class:`sanitizer.Filter ` removes @@ -125,9 +120,9 @@ You can alter the stream content with filters provided by html5lib: * :class:`whitespace.Filter ` collapses all whitespace characters to single spaces unless they're in - ``

`` or ``textarea`` tags.
+  ``
`` or ``