From 3ecbda615f7dbb73a5aa7c3af151b78b6ef3ec4b Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 1 Mar 2023 14:52:49 +0200 Subject: [PATCH 01/20] Bump Flake8 to fix CI for Python 3.7 (#554) --- html5lib/_inputstream.py | 2 +- html5lib/serializer.py | 4 ++-- html5lib/tests/test_serializer.py | 2 +- html5lib/treebuilders/etree.py | 4 ++-- html5lib/treewalkers/etree.py | 2 +- requirements-test.txt | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py index 0207dd21..a93b5a4e 100644 --- a/html5lib/_inputstream.py +++ b/html5lib/_inputstream.py @@ -324,7 +324,7 @@ def charsUntil(self, characters, opposite=False): except KeyError: if __debug__: for c in characters: - assert(ord(c) < 128) + assert ord(c) < 128 regex = "".join(["\\x%02x" % ord(c) for c in characters]) if not opposite: regex = "^%s" % regex diff --git a/html5lib/serializer.py b/html5lib/serializer.py index c66df683..a171ac1c 100644 --- a/html5lib/serializer.py +++ b/html5lib/serializer.py @@ -222,14 +222,14 @@ def __init__(self, **kwargs): self.strict = False def encode(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "htmlentityreplace") else: return string def encodeStrict(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "strict") else: diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index bce62459..a2be0be5 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -74,7 +74,7 @@ def _convertAttrib(self, attribs): attrs = {} for attrib in attribs: name = (attrib["namespace"], attrib["name"]) - assert(name not in attrs) + assert name not in attrs attrs[name] = attrib["value"] return attrs diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 
086bed4e..0b745081 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -108,7 +108,7 @@ def removeChild(self, node): node.parent = None def insertText(self, data, insertBefore=None): - if not(len(self._element)): + if not len(self._element): if not self._element.text: self._element.text = "" self._element.text += data @@ -201,7 +201,7 @@ def testSerializer(element): rv = [] def serializeElement(element, indent=0): - if not(hasattr(element, "tag")): + if not hasattr(element, "tag"): element = element.getroot() if element.tag == "": if element.get("publicId") or element.get("systemId"): diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 44653372..411a1d45 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -37,7 +37,7 @@ def getNodeDetails(self, node): else: node = elt - if not(hasattr(node, "tag")): + if not hasattr(node, "tag"): node = node.getroot() if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"): diff --git a/requirements-test.txt b/requirements-test.txt index 57f8f617..8c0ca7c7 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,7 +1,7 @@ -r requirements.txt tox>=3.15.1,<4 -flake8>=3.8.1,<3.9 +flake8>=3.8.1,<6 pytest>=4.6.10,<5 ; python_version < '3' pytest>=5.4.2,<7 ; python_version >= '3' coverage>=5.1,<6 From 10b7eb83b7d957033b8350eb354bf0e37c8c1746 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 1 Mar 2023 14:53:31 +0200 Subject: [PATCH 02/20] Add support for Python 3.9 and 3.10 and 3.11 (#542) --- .appveyor.yml | 6 +++++- .github/workflows/python-tox.yml | 9 ++++++--- .travis.yml | 3 ++- CHANGES.rst | 2 +- README.rst | 2 +- setup.py | 3 +++ tox.ini | 4 ++-- 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index a1a3e347..8af60b9b 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -# To activate, change the Appveyor settings to use `.appveyor.yml`. 
+image: Visual Studio 2019 environment: global: PATH: "C:\\Python27\\Scripts\\;%PATH%" @@ -13,6 +13,10 @@ environment: - TOXENV: py37-optional - TOXENV: py38-base - TOXENV: py38-optional + - TOXENV: py39-base + - TOXENV: py39-optional + - TOXENV: py310-base + - TOXENV: py310-optional install: - git submodule update --init --recursive diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index ec5cf636..78f7fdc3 100644 --- a/.github/workflows/python-tox.yml +++ b/.github/workflows/python-tox.yml @@ -6,15 +6,18 @@ jobs: if: github.event.push || github.event.pull_request.head.repo.full_name != github.repository runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - python: [2.7, 3.5, 3.6, 3.7, 3.8, pypy-2.7, pypy3] + python: ["2.7", "3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy-2.7", "pypy-3.8"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: "requirements*.txt" - run: pip install tox - run: tox -e py - if: ${{ always() }} diff --git a/.travis.yml b/.travis.yml index d2d9e30e..780df9a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,12 +2,13 @@ language: python python: - "pypy3" - "pypy" + - "3.10" + - "3.9" - "3.8" - "3.7" - "3.6" - "3.5" - "2.7" - - "3.9-dev" cache: pip diff --git a/CHANGES.rst b/CHANGES.rst index 0f6314aa..3ed63a96 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -95,7 +95,7 @@ Released on July 14, 2016 tested, doesn't entirely work, and as far as I can tell is completely unused by anyone.** -* Move testsuite to ``py.test``. +* Move testsuite to ``pytest``. 
* **Fix #124: move to webencodings for decoding the input byte stream; this makes html5lib compliant with the Encoding Standard, and diff --git a/README.rst b/README.rst index d367905d..072861ab 100644 --- a/README.rst +++ b/README.rst @@ -128,7 +128,7 @@ Tests ----- Unit tests require the ``pytest`` and ``mock`` libraries and can be -run using the ``py.test`` command in the root directory. +run using the ``pytest`` command in the root directory. Test data are contained in a separate `html5lib-tests `_ repository and included diff --git a/setup.py b/setup.py index f84c1284..b4c11811 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,9 @@ def default_environment(): 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Software Development :: Libraries :: Python Modules', diff --git a/tox.ini b/tox.ini index 16b8cf41..42790f48 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py{27,35,36,37,38,py,py3}-{base,six19,optional} +envlist = py{27,35,36,37,38,39,310,311,py,py3}-{base,six19,optional} [testenv] deps = @@ -12,7 +12,7 @@ passenv = COVERAGE_RUN_OPTIONS commands = six19: pip install six==1.9 - {env:PYTEST_COMMAND:{envbindir}/py.test} {posargs} + {env:PYTEST_COMMAND:{envbindir}/pytest} {posargs} flake8 {toxinidir} [testenv:doc] From d1dfa20ecd28d09a01b027fc24675132a8196ed3 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 1 Mar 2023 16:49:07 +0200 Subject: [PATCH 03/20] Remove Travis CI in favour of GitHub Actions (#545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Łukasz Langa --- .github/workflows/python-tox.yml | 2 +- .travis.yml | 26 
-------------------------- CONTRIBUTING.rst | 2 +- README.rst | 5 ++--- 4 files changed, 4 insertions(+), 31 deletions(-) delete mode 100644 .travis.yml diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index 78f7fdc3..0e3e46db 100644 --- a/.github/workflows/python-tox.yml +++ b/.github/workflows/python-tox.yml @@ -8,7 +8,7 @@ jobs: strategy: fail-fast: false matrix: - python: ["2.7", "3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy-2.7", "pypy-3.8"] + python: ["2.7", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy-2.7", "pypy-3.8"] steps: - uses: actions/checkout@v3 with: diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 780df9a2..00000000 --- a/.travis.yml +++ /dev/null @@ -1,26 +0,0 @@ -language: python -python: - - "pypy3" - - "pypy" - - "3.10" - - "3.9" - - "3.8" - - "3.7" - - "3.6" - - "3.5" - - "2.7" - -cache: pip - -env: - global: - - TOXENV=base,optional,six19-optional - -install: - - pip install tox - -script: - - tox - -after_script: - - python debug-info.py diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 8c5e1985..dba35216 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -16,7 +16,7 @@ documentation. Some useful information: - We keep the master branch passing all tests at all times on all supported versions. -`Travis CI `_ is run +`GitHub Actions `_ is run against all pull requests and should enforce all of the above. We use `Opera Critic `_ as an external diff --git a/README.rst b/README.rst index 072861ab..39ae0afc 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,8 @@ html5lib ======== -.. image:: https://travis-ci.org/html5lib/html5lib-python.svg?branch=master - :target: https://travis-ci.org/html5lib/html5lib-python - +.. image:: https://github.com/html5lib/html5lib-python/actions/workflows/python-tox.yml/badge.svg + :target: https://github.com/html5lib/html5lib-python/actions/workflows/python-tox.yml html5lib is a pure-python library for parsing HTML. 
It is designed to conform to the WHATWG HTML specification, as is implemented by all major From 6815a541f414cbd30e80f78841e4a72113c71910 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 1 Mar 2023 16:50:06 +0200 Subject: [PATCH 04/20] Fix pytest warnings (#544) --- html5lib/tests/test_sanitizer.py | 7 ++++++- pytest.ini | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index f3faeb80..6ad43a3a 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -1,9 +1,14 @@ from __future__ import absolute_import, division, unicode_literals +import warnings + import pytest from html5lib import constants, parseFragment, serialize -from html5lib.filters import sanitizer + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + from html5lib.filters import sanitizer def sanitize_html(stream): diff --git a/pytest.ini b/pytest.ini index 8824977a..1f620d98 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] # Output fails, errors, xpass, and warnings; ignore doctest; make warnings errors -addopts = -rfEXw -p no:doctest --strict +addopts = -rfEXw -p no:doctest --strict-markers # Make xpass results be considered fail xfail_strict = true From 5decf471b22db9aacb212bc22bfe74e1e0f67c60 Mon Sep 17 00:00:00 2001 From: Drew Hintz Date: Wed, 1 Mar 2023 06:50:48 -0800 Subject: [PATCH 05/20] remove redundant list comprehension (#547) any() accepts a generator. This will cause it to run faster because the any() returns as soon as it finds a True value. 
--- html5lib/html5parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 74d829d9..9fb038b7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1002,8 +1002,8 @@ def processCharacters(self, token): self.tree.insertText(token["data"]) # This must be bad for performance if (self.parser.framesetOK and - any([char not in spaceCharacters - for char in token["data"]])): + any(char not in spaceCharacters + for char in token["data"])): self.parser.framesetOK = False def processSpaceCharactersNonPre(self, token): @@ -1850,7 +1850,7 @@ def __init__(self, *args, **kwargs): def flushCharacters(self): data = "".join([item["data"] for item in self.characterTokens]) - if any([item not in spaceCharacters for item in data]): + if any(item not in spaceCharacters for item in data): token = {"type": tokenTypes["Characters"], "data": data} self.parser.phases["inTable"].insertText(token) elif data: From 1cd57cb0a55890b5b114a945fe861a23b9ede309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philip=20J=C3=A4genstedt?= Date: Wed, 1 Mar 2023 16:26:55 +0100 Subject: [PATCH 06/20] Replace os.path.split with .dirname or .basename in two places (#508) --- html5lib/tests/support.py | 2 +- html5lib/tests/tokenizertotree.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 9cd5afbe..1bd0ccc1 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -8,7 +8,7 @@ import glob import xml.sax.handler -base_path = os.path.split(__file__)[0] +base_path = os.path.dirname(__file__) test_dir = os.path.join(base_path, 'testdata') sys.path.insert(0, os.path.abspath(os.path.join(base_path, diff --git a/html5lib/tests/tokenizertotree.py b/html5lib/tests/tokenizertotree.py index 8528e876..42463f32 100644 --- a/html5lib/tests/tokenizertotree.py +++ b/html5lib/tests/tokenizertotree.py @@ -29,7 +29,7 @@ def run_file(filename, 
out_path): except ValueError: sys.stderr.write("Failed to load %s\n" % filename) return - name = os.path.splitext(os.path.split(filename)[1])[0] + name = os.path.splitext(os.path.basename(filename))[0] output_file = open(os.path.join(out_path, "tokenizer_%s.dat" % name), "w") if 'tests' in tests_data: From 2f64f3bc408cc3b000e85f756963d57158cedecd Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Wed, 1 Mar 2023 18:17:27 +0200 Subject: [PATCH 07/20] README: Replace broken mailing list link with archive (#543) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Łukasz Langa --- README.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 39ae0afc..a17411ab 100644 --- a/README.rst +++ b/README.rst @@ -144,7 +144,9 @@ which can be found on PyPI. Questions? ---------- -There's a mailing list available for support on Google Groups, -`html5lib-discuss `_, -though you may get a quicker response asking on IRC in `#whatwg on -irc.freenode.net `_. +Check out `the docs https://html5lib.readthedocs.io/en/latest/`_. Still +need help? Go to our `GitHub Discussions +https://github.com/html5lib/html5lib-python/discussions`_. + +You can also browse the archives of the `html5lib-discuss mailing list +https://www.mail-archive.com/html5lib-discuss@googlegroups.com/`_. From 4f9f5bfe13d5389fdb8905349a6405c1c3b1e8b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Wed, 1 Mar 2023 17:48:03 +0100 Subject: [PATCH 08/20] Fix ReST link markup in README --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index a17411ab..6a623a43 100644 --- a/README.rst +++ b/README.rst @@ -144,9 +144,9 @@ which can be found on PyPI. Questions? ---------- -Check out `the docs https://html5lib.readthedocs.io/en/latest/`_. Still +Check out `the docs `_. Still need help? 
Go to our `GitHub Discussions -https://github.com/html5lib/html5lib-python/discussions`_. +`_. You can also browse the archives of the `html5lib-discuss mailing list -https://www.mail-archive.com/html5lib-discuss@googlegroups.com/`_. +`_. From 1b393775e62b0fed8ba7e713b09e1b2713f63b1c Mon Sep 17 00:00:00 2001 From: anonymous <29055749+idiomaticrefactoring@users.noreply.github.com> Date: Thu, 2 Mar 2023 01:05:27 +0800 Subject: [PATCH 09/20] Use for:else: where it makes sense (#539) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: zzj <29055749+zjzh@users.noreply.github.com> Co-authored-by: Łukasz Langa --- html5lib/_ihatexml.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py index 3ff803c1..d725eabd 100644 --- a/html5lib/_ihatexml.py +++ b/html5lib/_ihatexml.py @@ -104,18 +104,15 @@ def charStringToList(chars): charRanges = [item.strip() for item in chars.split(" | ")] rv = [] for item in charRanges: - foundMatch = False for regexp in (reChar, reCharRange): match = regexp.match(item) if match is not None: rv.append([hexToInt(item) for item in match.groups()]) if len(rv[-1]) == 1: rv[-1] = rv[-1] * 2 - foundMatch = True break - if not foundMatch: + else: assert len(item) == 1 - rv.append([ord(item)] * 2) rv = normaliseCharList(rv) return rv From c64a1115daa463c7c60b32104e07f37c26b8a097 Mon Sep 17 00:00:00 2001 From: theRealProHacker <77074862+theRealProHacker@users.noreply.github.com> Date: Wed, 1 Mar 2023 21:02:42 +0100 Subject: [PATCH 10/20] Restore ability to provide a tree builder name as string (#559) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Łukasz Langa --- html5lib/html5parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 9fb038b7..4c2d4c75 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ 
-115,6 +115,9 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa if tree is None: tree = treebuilders.getTreeBuilder("etree") + elif isinstance(tree, str): + tree = treebuilders.getTreeBuilder(tree) + self.tree = tree(namespaceHTMLElements) self.errors = [] From ff7fa3721e1072898e91531a52e6befefd9fbc55 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Wed, 1 Mar 2023 13:55:52 -0800 Subject: [PATCH 11/20] Sanitizer: Allow (#423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tests that the sanitizer allows
and and update the implementation to allow . Co-authored-by: Łukasz Langa --- CHANGES.rst | 8 +++++++- html5lib/filters/sanitizer.py | 1 + html5lib/tests/test_sanitizer.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3ed63a96..f28caa86 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,13 @@ Change Log ---------- +1.2 +~~~ + +Bug fixes: + +* The sanitizer now permits ```` tags. + 1.1 ~~~ @@ -22,7 +29,6 @@ Other changes: ``html5lib`` keeps working in future Python versions. (#403) * Drop optional ``datrie`` dependency. (#442) - 1.0.1 ~~~~~ diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index 70ef9066..684f2172 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -113,6 +113,7 @@ (namespaces['html'], 'strike'), (namespaces['html'], 'strong'), (namespaces['html'], 'sub'), + (namespaces['html'], 'summary'), (namespaces['html'], 'sup'), (namespaces['html'], 'table'), (namespaces['html'], 'tbody'), diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 6ad43a3a..9deed6f5 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -111,6 +111,18 @@ def param_sanitizer(): """foo""" % (protocol, rest_of_uri)) +def test_details_open_allowed(): + sanitized = sanitize_html("
.
") + expected = '
.
' + assert expected == sanitized + + +def test_details_summary_allowed(): + sanitized = sanitize_html("
.

...

") + expected = '
.

...

' + assert expected == sanitized + + @pytest.mark.parametrize("expected, input", (pytest.param(expected, input, id=id) for id, expected, input in param_sanitizer())) From 5c30ac35aa51f69c7048dc701d6f9210269b5abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Wed, 1 Mar 2023 23:06:02 +0100 Subject: [PATCH 12/20] Fix reference targets in Sphinx documentation (#562) Fixes #548 --- doc/conf.py | 10 +++++++++- doc/html5lib.filters.rst | 33 +++++++++++++++++++-------------- doc/html5lib.rst | 17 +++++++++++------ doc/html5lib.treeadapters.rst | 3 --- doc/html5lib.treebuilders.rst | 19 ++++++++----------- doc/html5lib.treewalkers.rst | 23 ++++++++++------------- doc/movingparts.rst | 6 +++--- html5lib/treebuilders/base.py | 1 + 8 files changed, 61 insertions(+), 51 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 22ebab4f..d5a1e863 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,7 +19,8 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode', + 'sphinx.ext.intersphinx'] # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] @@ -92,6 +93,13 @@ ] +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'lxml': ('https://lxml.de/apidoc/', None), + 'chardet': ('https://chardet.readthedocs.io/en/latest/', None), +} + + class CExtMock(object): """Required for autodoc on readthedocs.org where you cannot build C extensions.""" def __init__(self, *args, **kwargs): diff --git a/doc/html5lib.filters.rst b/doc/html5lib.filters.rst index d70e4552..89a5bc7b 100644 --- a/doc/html5lib.filters.rst +++ b/doc/html5lib.filters.rst @@ -1,56 +1,61 @@ filters Package =============== -:mod:`base` Module -------------------- +``base`` Module +---------------- .. automodule:: html5lib.filters.base :members: :show-inheritance: :special-members: __init__ -:mod:`alphabeticalattributes` Module ------------------------------------- +.. autoclass:: html5lib.filters.base.Filter + :members: + :show-inheritance: + :special-members: __init__ + +``alphabeticalattributes`` Module +--------------------------------- .. automodule:: html5lib.filters.alphabeticalattributes :members: :show-inheritance: :special-members: __init__ -:mod:`inject_meta_charset` Module ---------------------------------- +``inject_meta_charset`` Module +------------------------------ .. automodule:: html5lib.filters.inject_meta_charset :members: :show-inheritance: :special-members: __init__ -:mod:`lint` Module ------------------- +``lint`` Module +--------------- .. automodule:: html5lib.filters.lint :members: :show-inheritance: :special-members: __init__ -:mod:`optionaltags` Module --------------------------- +``optionaltags`` Module +----------------------- .. automodule:: html5lib.filters.optionaltags :members: :show-inheritance: :special-members: __init__ -:mod:`sanitizer` Module ------------------------ +``sanitizer`` Module +-------------------- .. 
automodule:: html5lib.filters.sanitizer :members: :show-inheritance: :special-members: __init__ -:mod:`whitespace` Module ------------------------- +``whitespace`` Module +--------------------- .. automodule:: html5lib.filters.whitespace :members: diff --git a/doc/html5lib.rst b/doc/html5lib.rst index d7c75c58..57d1db54 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -4,29 +4,34 @@ html5lib Package .. automodule:: html5lib :members: __version__ -:mod:`constants` Module ------------------------ +``constants`` Module +-------------------- .. automodule:: html5lib.constants :members: :show-inheritance: -:mod:`html5parser` Module -------------------------- +``html5parser`` Module +---------------------- .. automodule:: html5lib.html5parser :members: :show-inheritance: :special-members: __init__ -:mod:`serializer` Module ------------------------- +``serializer`` Module +--------------------- .. automodule:: html5lib.serializer :members: :show-inheritance: :special-members: __init__ +.. autoclass:: html5lib.serializer.HTMLSerializer + :members: + :show-inheritance: + :special-members: __init__ + Subpackages ----------- diff --git a/doc/html5lib.treeadapters.rst b/doc/html5lib.treeadapters.rst index 1d3a9fba..d20c1e95 100644 --- a/doc/html5lib.treeadapters.rst +++ b/doc/html5lib.treeadapters.rst @@ -1,9 +1,6 @@ treeadapters Package ==================== -:mod:`~html5lib.treeadapters` Package -------------------------------------- - .. automodule:: html5lib.treeadapters :members: :show-inheritance: diff --git a/doc/html5lib.treebuilders.rst b/doc/html5lib.treebuilders.rst index 1a051e50..507d319e 100644 --- a/doc/html5lib.treebuilders.rst +++ b/doc/html5lib.treebuilders.rst @@ -1,40 +1,37 @@ treebuilders Package ==================== -:mod:`treebuilders` Package ---------------------------- - .. automodule:: html5lib.treebuilders :members: :show-inheritance: :special-members: __init__ -:mod:`base` Module -------------------- +``base`` Module +--------------- .. 
automodule:: html5lib.treebuilders.base :members: :show-inheritance: :special-members: __init__ -:mod:`dom` Module ------------------ +``dom`` Module +-------------- .. automodule:: html5lib.treebuilders.dom :members: :show-inheritance: :special-members: __init__ -:mod:`etree` Module -------------------- +``etree`` Module +---------------- .. automodule:: html5lib.treebuilders.etree :members: :show-inheritance: :special-members: __init__ -:mod:`etree_lxml` Module ------------------------- +``etree_lxml`` Module +--------------------- .. automodule:: html5lib.treebuilders.etree_lxml :members: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 4afef476..53bd5c31 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -1,48 +1,45 @@ treewalkers Package =================== -:mod:`treewalkers` Package --------------------------- - .. automodule:: html5lib.treewalkers :members: :show-inheritance: :special-members: __init__ -:mod:`base` Module ------------------- +``base`` Module +--------------- .. automodule:: html5lib.treewalkers.base :members: :show-inheritance: :special-members: __init__ -:mod:`dom` Module ------------------ +``dom`` Module +-------------- .. automodule:: html5lib.treewalkers.dom :members: :show-inheritance: :special-members: __init__ -:mod:`etree` Module -------------------- +``etree`` Module +---------------- .. automodule:: html5lib.treewalkers.etree :members: :show-inheritance: :special-members: __init__ -:mod:`etree_lxml` Module ------------------------- +``etree_lxml`` Module +--------------------- .. automodule:: html5lib.treewalkers.etree_lxml :members: :show-inheritance: :special-members: __init__ -:mod:`genshi` Module --------------------- +``genshi`` Module +----------------- .. 
automodule:: html5lib.treewalkers.genshi :members: diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 6ba367a2..c7fe0247 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -15,9 +15,9 @@ Tree builders The parser reads HTML by tokenizing the content and building a tree that the user can later access. html5lib can build three types of trees: -* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`, - which can be found in the standard library. Whenever possible, the - accelerated ``ElementTree`` implementation (i.e. +* ``etree`` - this is the default; builds a tree based on + :mod:`xml.etree.ElementTree`, which can be found in the standard library. + Whenever possible, the accelerated ``ElementTree`` implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x) is used. * ``dom`` - builds a tree based on :mod:`xml.dom.minidom`. diff --git a/html5lib/treebuilders/base.py b/html5lib/treebuilders/base.py index e4a3d710..020d7e15 100644 --- a/html5lib/treebuilders/base.py +++ b/html5lib/treebuilders/base.py @@ -121,6 +121,7 @@ def hasContent(self): class ActiveFormattingElements(list): def append(self, node): + """Append node to the end of the list.""" equalCount = 0 if node != Marker: for element in self[::-1]: From 6ca02445937a6a7c930c262d1325ec15fdb0324a Mon Sep 17 00:00:00 2001 From: Tom Most Date: Thu, 2 Mar 2023 06:22:38 -0800 Subject: [PATCH 13/20] Support the element (#395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Łukasz Langa --- CHANGES.rst | 10 ++++++++++ html5lib/constants.py | 3 ++- html5lib/filters/sanitizer.py | 1 + html5lib/tests/test_sanitizer.py | 6 ++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index f28caa86..cf95ea0b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,14 @@ Change Log 1.2 ~~~ +Unreleased yet + +Features: + +* Add support for the ```` element in the sanitizer, `which indicates + a 
line break opportunity `_. + This element is allowed by default. (#395) (Thank you, Tom Most!) + Bug fixes: * The sanitizer now permits ```` tags. @@ -11,6 +19,8 @@ Bug fixes: 1.1 ~~~ +Released on June 23, 2020 + Breaking changes: * Drop support for Python 3.3. (#358) diff --git a/html5lib/constants.py b/html5lib/constants.py index fe3e237c..11184e0d 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -571,7 +571,8 @@ "col", "input", "source", - "track" + "track", + "wbr", ]) cdataElements = frozenset(['title', 'textarea']) diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index 684f2172..f7ac8d9b 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -129,6 +129,7 @@ (namespaces['html'], 'ul'), (namespaces['html'], 'var'), (namespaces['html'], 'video'), + (namespaces['html'], 'wbr'), (namespaces['mathml'], 'maction'), (namespaces['mathml'], 'math'), (namespaces['mathml'], 'merror'), diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 9deed6f5..a6cbd798 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -55,6 +55,12 @@ def test_data_uri_disallowed_type(): assert expected == sanitized +def test_wbr_allowed(): + sanitized = sanitize_html('') + expected = '' + assert expected == sanitized + + def param_sanitizer(): for ns, tag_name in sanitizer.allowed_elements: if ns != constants.namespaces["html"]: From 82047b0e8506886255c5cca5954c870f84e3adda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Thu, 2 Mar 2023 20:42:35 +0100 Subject: [PATCH 14/20] Sort void elements alphabetically, document ones that don't match current standard (#563) --- html5lib/constants.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index 11184e0d..e83bfb5d 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -557,24 +557,36 @@ ) 
voidElements = frozenset([ + "area", "base", - "command", - "event-source", - "link", - "meta", - "hr", "br", - "img", - "embed", - "param", - "area", "col", + "command", # removed ^1 + "embed", + "event-source", # renamed and later removed ^2 + "hr", + "img", "input", + "link", + "meta", + "param", # deprecated ^3 "source", "track", "wbr", ]) +# Removals and deprecations in the HTML 5 spec: +# ^1: command +# http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2012-December/038472.html +# https://github.com/whatwg/html/commit/9e2e25f4ae90969a7c64e0763c98548a35b50af8 +# ^2: event-source +# renamed to eventsource in 7/2008: +# https://github.com/whatwg/html/commit/d157945d0285b4463a04b57318da0c4b300a99e7 +# removed entirely in 2/2009: +# https://github.com/whatwg/html/commit/43cbdbfbb7eb74b0d65e0f4caab2020c0b2a16ff +# ^3: param +# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/param + cdataElements = frozenset(['title', 'textarea']) rcdataElements = frozenset([ From 01b7dea9a378d69ae66767183a42e602d7a468cd Mon Sep 17 00:00:00 2001 From: Tom Most Date: Fri, 3 Mar 2023 01:06:08 -0800 Subject: [PATCH 15/20] Add support for
<ol reversed>, related attributes (#396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Mark
<ol reversed> as a boolean attribute so it serializes properly in HTML. * Allow
<ol reversed> in the sanitizer. Closes #321. * Allow
<ol start> in the sanitizer. *
<ol type> was already allowed, but probably accidentally (type is an attribute allowed for other tags). I added a test to prevent it from regressing in case we add per-element attribute sanitization in the future. https://html.spec.whatwg.org/multipage/grouping-content.html#attr-ol-reversed Co-authored-by: Łukasz Langa --- CHANGES.rst | 7 ++++++- html5lib/constants.py | 1 + html5lib/filters/sanitizer.py | 2 ++ html5lib/tests/test_sanitizer.py | 18 ++++++++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index cf95ea0b..47dcda3a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -11,10 +11,15 @@ Features: * Add support for the ``<wbr>`` element in the sanitizer, `which indicates a line break opportunity `_. This element is allowed by default. (#395) (Thank you, Tom Most!) +* Add support for serializing the ``
              `` boolean attribute. (Thank + you, Tom Most!) (#396) +* The ``
                `` and ``
                  `` attributes are now permitted by the + sanitizer. (#321) (Thank you, Tom Most!) Bug fixes: -* The sanitizer now permits ```` tags. +* The sanitizer now permits ```` tags. It used to allow ``
                  `` + already. (#423) 1.1 ~~~ diff --git a/html5lib/constants.py b/html5lib/constants.py index e83bfb5d..2fa4146d 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -617,6 +617,7 @@ "button": frozenset(["disabled", "autofocus"]), "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), + "ol": frozenset(["reversed"]), "output": frozenset(["disabled", "readonly"]), "iframe": frozenset(["seamless"]), } diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index f7ac8d9b..81c85d44 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -365,6 +365,7 @@ (None, 'maxsize'), (None, 'minsize'), (None, 'other'), + (None, 'reversed'), (None, 'rowalign'), (None, 'rowalign'), (None, 'rowalign'), @@ -375,6 +376,7 @@ (None, 'scriptlevel'), (None, 'selection'), (None, 'separator'), + (None, 'start'), (None, 'stretchy'), (None, 'width'), (None, 'width'), diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index a6cbd798..499310b6 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -154,3 +154,21 @@ def test_uppercase_color_codes_in_style(): sanitized = sanitize_html("

                  ") expected = '

                  ' assert expected == sanitized + + +def test_ol_start_allowed(): + sanitized = sanitize_html("
                  1. .
                  ") + expected = '
                  1. .
                  ' + assert expected == sanitized + + +def test_ol_type_allowed(): + sanitized = sanitize_html("
                  1. .
                  ") + expected = '
                  1. .
                  ' + assert expected == sanitized + + +def test_ol_reversed_allowed(): + sanitized = sanitize_html("
                  1. .
                  ") + expected = '
                  1. .
                  ' + assert expected == sanitized From f0bb2a639d768c5bd6640c020aef86621a63b02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Fri, 3 Mar 2023 15:50:22 +0100 Subject: [PATCH 16/20] Test more on GitHub Actions (#564) The only environments left on AppVeyor are now 2.7, 3.5, and 3.6. The remaining Python versions are now also tested on Windows using GitHub Actions. Additionally, a new group of dependencies called "oldest" is now tested as well to ensure compatibility with the oldest pinned versions in requirements.txt and setup.py. AppVeyor is used for 3.5 and 3.6 because those versions aren't available on GitHub Actions at all. Python 2.7 crashes on Windows with a pip cache failure so it only runs on GHA with the "oldest" dependencies on Ubuntu. The other Python version running "oldest" dependencies is 3.7, also only running on Ubuntu. --- .appveyor.yml | 8 ----- .github/workflows/python-tox.yml | 51 +++++++++++++++++++++++++--- requirements-install.sh | 15 --------- requirements-oldest.txt | 29 ++++++++++++++++ requirements-optional.txt | 6 ++-- requirements-test.txt | 4 +-- setup.py | 12 +++---- tox.ini | 7 ++-- toxver.py | 57 ++++++++++++++++++++++++++++++++ 9 files changed, 147 insertions(+), 42 deletions(-) delete mode 100755 requirements-install.sh create mode 100644 requirements-oldest.txt create mode 100755 toxver.py diff --git a/.appveyor.yml b/.appveyor.yml index 8af60b9b..7661aa63 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -9,14 +9,6 @@ environment: - TOXENV: py35-optional - TOXENV: py36-base - TOXENV: py36-optional - - TOXENV: py37-base - - TOXENV: py37-optional - - TOXENV: py38-base - - TOXENV: py38-optional - - TOXENV: py39-base - - TOXENV: py39-optional - - TOXENV: py310-base - - TOXENV: py310-optional install: - git submodule update --init --recursive diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index 0e3e46db..cfcc42e6 100644 --- a/.github/workflows/python-tox.yml +++ 
b/.github/workflows/python-tox.yml @@ -4,21 +4,62 @@ jobs: # Prevent duplicate builds for 'internal' pull requests on existing commits # Credit: https://github.community/t/duplicate-checks-on-push-and-pull-request-simultaneous-event/18012 if: github.event.push || github.event.pull_request.head.repo.full_name != github.repository - runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python: ["2.7", "3.7", "3.8", "3.9", "3.10", "3.11", "pypy-2.7", "pypy-3.8"] + # 2.7, 3.5, and 3.6 run on Windows via AppVeyor + python: ["3.7", "3.8", "3.9", "3.10", "3.11"] + os: [ubuntu-latest, windows-latest] + deps: [base, optional] + include: + - python: "pypy-2.7" + os: ubuntu-latest + deps: base + - python: "pypy-3.8" + os: ubuntu-latest + deps: base + - python: "2.7" + os: ubuntu-latest + deps: oldest + - python: "3.7" + os: ubuntu-latest + deps: oldest + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 with: submodules: true - - uses: actions/setup-python@v4 + - if: ${{ matrix.deps == 'base' }} + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} cache: pip - cache-dependency-path: "requirements*.txt" + cache-dependency-path: | + requirements.txt + requirements-test.txt + - if: ${{ matrix.deps == 'optional' }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: | + requirements.txt + requirements-optional.txt + requirements-test.txt + - if: ${{ matrix.deps == 'oldest' }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: | + requirements-oldest.txt + - if: ${{ matrix.os == 'windows-latest' }} + name: Determine environment name for Tox (PowerShell) + run: python toxver.py ${{ matrix.python }} ${{ matrix.deps }} >> $env:GITHUB_ENV + - if: ${{ matrix.os == 'ubuntu-latest' }} + name: Determine environment name for Tox (Bash) + run: python toxver.py ${{ matrix.python }} ${{ matrix.deps }} >> $GITHUB_ENV - run: 
pip install tox - - run: tox -e py + - run: tox - if: ${{ always() }} run: python debug-info.py diff --git a/requirements-install.sh b/requirements-install.sh deleted file mode 100755 index b7a8d96d..00000000 --- a/requirements-install.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -ex - -if [[ $SIX_VERSION ]]; then - pip install six==$SIX_VERSION -fi - -pip install -r requirements-test.txt - -if [[ $USE_OPTIONAL == "true" ]]; then - pip install -r requirements-optional.txt -fi - -if [[ $CI == "true" ]]; then - pip install codecov -fi diff --git a/requirements-oldest.txt b/requirements-oldest.txt new file mode 100644 index 00000000..68d0f13d --- /dev/null +++ b/requirements-oldest.txt @@ -0,0 +1,29 @@ +# This allows us to install the actually oldest supported dependencies and test whether that works. + +# requirements.txt +six==1.9 +webencodings==0.5.1 + +# requirements-optional.txt +genshi==0.7.1 ; python_version < '3.8' +genshi==0.7.6 ; python_version >= '3.8' +chardet==2.2.1 +# this should be 3.4.0 but there are no Linux +# binary wheels for older releases +lxml==3.8.0 ; python_version < '3.7' +# minimums for 3.x are actually different: +# - 3.7 is actually 4.1.1 +# - 3.8 is actually 4.3.5 +# - 3.9-3.10 is actually 4.5.2 +# - 3.11 is actually 4.9.0 +lxml==4.9.0 ; python_version >= '3.7' + +# requirements-test.txt +flake8==3.9.2 ; python_version < '3.6' +flake8==5.0.4; python_version >= '3.6' +pytest==4.6.10 ; python_version < '3' +pytest==5.4.2 ; python_version >= '3' +coverage==5.1 +pytest-expect==1.1.0 +mock==3.0.5 ; python_version < '3.6' +mock==4.0.2 ; python_version >= '3.6' \ No newline at end of file diff --git a/requirements-optional.txt b/requirements-optional.txt index 2e78c952..2e112e95 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -2,12 +2,12 @@ # We support a Genshi treewalker that can be used to serialize Genshi # streams. 
-genshi +genshi>=0.7.1 # chardet can be used as a fallback in case we are unable to determine # the encoding of a document. -chardet>=2.2 +chardet>=2.2.1 # lxml is supported with its own treebuilder ("lxml") and otherwise # uses the standard ElementTree support -lxml ; platform_python_implementation == 'CPython' +lxml>=3.4.0 ; platform_python_implementation == 'CPython' diff --git a/requirements-test.txt b/requirements-test.txt index 8c0ca7c7..27866e59 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,7 +1,7 @@ -r requirements.txt -tox>=3.15.1,<4 -flake8>=3.8.1,<6 +flake8==3.9.2 ; python_version < '3.6' +flake8>=5.0.4; python_version >= '3.6' pytest>=4.6.10,<5 ; python_version < '3' pytest>=5.4.2,<7 ; python_version >= '3' coverage>=5.1,<6 diff --git a/setup.py b/setup.py index b4c11811..30ee0575 100644 --- a/setup.py +++ b/setup.py @@ -108,23 +108,23 @@ def default_environment(): packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), install_requires=[ 'six>=1.9', - 'webencodings', + 'webencodings>=0.5.1', ], python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*", extras_require={ # A conditional extra will only install these items when the extra is # requested and the condition matches. - "lxml:platform_python_implementation == 'CPython'": ["lxml"], + "lxml:platform_python_implementation == 'CPython'": ["lxml>=3.4.0"], # Standard extras, will be installed when the extra is requested. - "genshi": ["genshi"], - "chardet": ["chardet>=2.2"], + "genshi": ["genshi>=0.7.1"], + "chardet": ["chardet>=2.2.1"], # The all extra combines a standard extra which will be used anytime # the all extra is requested, and it extends it with a conditional # extra that will be installed whenever the condition matches and the # all extra is requested. 
- "all": ["genshi", "chardet>=2.2"], - "all:platform_python_implementation == 'CPython'": ["lxml"], + "all": ["genshi>=0.7.1", "chardet>=2.2.1"], + "all:platform_python_implementation == 'CPython'": ["lxml>=3.4.0"], }, ) diff --git a/tox.ini b/tox.ini index 42790f48..fb228e96 100644 --- a/tox.ini +++ b/tox.ini @@ -1,17 +1,18 @@ [tox] -envlist = py{27,35,36,37,38,39,310,311,py,py3}-{base,six19,optional} +envlist = py{27,35,36,37,38,39,310,311,py,py3}-{base,optional,oldest} [testenv] deps = + base: -r{toxinidir}/requirements-test.txt + optional: -r{toxinidir}/requirements-test.txt optional: -r{toxinidir}/requirements-optional.txt - -r{toxinidir}/requirements-test.txt + oldest: -r{toxinidir}/requirements-oldest.txt doc: Sphinx passenv = PYTEST_COMMAND # this is maintained so one can, e.g., PYTEST_COMMAND="coverage run -m pytest" COVERAGE_RUN_OPTIONS commands = - six19: pip install six==1.9 {env:PYTEST_COMMAND:{envbindir}/pytest} {posargs} flake8 {toxinidir} diff --git a/toxver.py b/toxver.py new file mode 100755 index 00000000..68eb71ec --- /dev/null +++ b/toxver.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +""" +usage: toxver.py [python-version] [deps] + +Returns a Tox environment name given a GHA matrix Python version and dependencies. +Many GHA configurations do this with inline Bash scripts but we want our solution +to be cross-platform and work on Windows workers, too. 
+ +Examples: + + $ toxver.py pypy-3.8 base + TOXENV=pypy3-base + + $ toxver.py 2.7 oldest + TOXENV=py27-oldest + + $ toxver.py ~3.12.0-0 optional + TOXENV=py312-optional + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys + + +def main(argv): + if len(argv) != 3: + print(__doc__.strip(), file=sys.stderr) + return 1 + + deps = argv[2] + + if argv[1].startswith("pypy-2"): + print("TOXENV=pypy-" + deps) + return 0 + + if argv[1].startswith("pypy-3"): + print("TOXENV=pypy3-" + deps) + return 0 + + if argv[1].startswith("~"): + ver = argv[1][1:5] + else: + ver = argv[1] + + ver = ver.replace(".", "") + print("TOXENV=py" + ver + "-" + deps) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) From 3e500bb6e4188ea087f5b743a720ed9f4d9216f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Fri, 3 Mar 2023 16:43:58 +0100 Subject: [PATCH 17/20] Allow min-height, max-height, min-width, max-width in the sanitizer (#566) Co-authored-by: Vladimir Kuvandjiev Co-authored-by: Vladimir Kuvandjiev --- html5lib/filters/sanitizer.py | 4 ++++ html5lib/tests/sanitizer-testdata/tests1.dat | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index 81c85d44..ea2c5dd3 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -598,6 +598,10 @@ 'height', 'letter-spacing', 'line-height', + 'max-height', + 'min-height', + 'max-width', + 'min-width', 'overflow', 'pause', 'pause-after', diff --git a/html5lib/tests/sanitizer-testdata/tests1.dat b/html5lib/tests/sanitizer-testdata/tests1.dat index 74e88336..2bfbc7df 100644 --- a/html5lib/tests/sanitizer-testdata/tests1.dat +++ b/html5lib/tests/sanitizer-testdata/tests1.dat @@ -133,8 +133,8 @@ { "name": "platypus", - "input": "never trust your upstream platypus", - "output": "never trust your 
upstream platypus" + "input": "never trust your upstream platypus", + "output": "never trust your upstream platypus" }, { From 4a87368b71090f1432df6302f178c4babfcec93f Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 10 Jan 2024 15:13:40 +0100 Subject: [PATCH 18/20] GitHub Actions: python: ["3.8", "3.9", "3.10", "3.11", "3.12"] (#575) --- .appveyor.yml | 30 +++++++++++++++++------------ .github/workflows/python-tox.yml | 19 ++++++++---------- html5lib/tests/tokenizer.py | 4 +++- html5lib/tests/tree_construction.py | 4 +++- requirements-test.txt | 3 ++- 5 files changed, 34 insertions(+), 26 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7661aa63..e6f7bf48 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,23 +1,29 @@ -image: Visual Studio 2019 +# appveyor.yml - https://www.appveyor.com/docs/lang/python +# https://www.appveyor.com/docs/windows-images-software/#visual-studio-2022 +--- +image: Visual Studio 2022 environment: - global: - PATH: "C:\\Python27\\Scripts\\;%PATH%" matrix: - - TOXENV: py27-base - - TOXENV: py27-optional - - TOXENV: py35-base - - TOXENV: py35-optional - - TOXENV: py36-base - - TOXENV: py36-optional + - PY_PYTHON: 2.7 + TOXENV: py27-base + - PY_PYTHON: 2.7 + TOXENV: py27-optional + - PY_PYTHON: 3.7 + TOXENV: py37-base + - PY_PYTHON: 3.7 + TOXENV: py37-optional install: - git submodule update --init --recursive - - python -m pip install tox + - py --list + - py -VV + - py -m pip install --upgrade pip + - py -m pip install tox build: off test_script: - - tox + - py -m tox after_test: - - python debug-info.py + - py debug-info.py diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml index cfcc42e6..5ed83175 100644 --- a/.github/workflows/python-tox.yml +++ b/.github/workflows/python-tox.yml @@ -7,30 +7,27 @@ jobs: strategy: fail-fast: false matrix: - # 2.7, 3.5, and 3.6 run on Windows via AppVeyor - python: ["3.7", "3.8", "3.9", "3.10", "3.11"] + # 2.7 and 3.7 run on Windows via AppVeyor + 
python: ["3.8", "3.9", "3.10", "3.11", "3.12"] os: [ubuntu-latest, windows-latest] deps: [base, optional] include: - python: "pypy-2.7" os: ubuntu-latest deps: base - - python: "pypy-3.8" + - python: "pypy-3.10" os: ubuntu-latest deps: base - - python: "2.7" - os: ubuntu-latest - deps: oldest - - python: "3.7" + - python: "3.8" os: ubuntu-latest deps: oldest runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - if: ${{ matrix.deps == 'base' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip @@ -38,7 +35,7 @@ jobs: requirements.txt requirements-test.txt - if: ${{ matrix.deps == 'optional' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip @@ -47,7 +44,7 @@ jobs: requirements-optional.txt requirements-test.txt - if: ${{ matrix.deps == 'oldest' }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} cache: pip diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index cc9897a4..b49d2e6e 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -246,7 +246,9 @@ def runtest(self): def repr_failure(self, excinfo): traceback = excinfo.traceback ntraceback = traceback.cut(path=__file__) - excinfo.traceback = ntraceback.filter() + pytest_ver = getattr(pytest, "version_tuple", ()) + filter_args = (excinfo,) if pytest_ver >= (7, 4, 0) else () + excinfo.traceback = ntraceback.filter(*filter_args) return excinfo.getrepr(funcargs=True, showlocals=False, diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py index fb0657bf..363b48c2 100644 --- a/html5lib/tests/tree_construction.py +++ b/html5lib/tests/tree_construction.py @@ -135,7 +135,9 @@ def runtest(self): def repr_failure(self, excinfo): traceback = excinfo.traceback ntraceback = 
traceback.cut(path=__file__) - excinfo.traceback = ntraceback.filter() + pytest_ver = getattr(pytest, "version_tuple", ()) + filter_args = (excinfo,) if pytest_ver >= (7, 4, 0) else () + excinfo.traceback = ntraceback.filter(*filter_args) return excinfo.getrepr(funcargs=True, showlocals=False, diff --git a/requirements-test.txt b/requirements-test.txt index 27866e59..39913ee4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,8 +3,9 @@ flake8==3.9.2 ; python_version < '3.6' flake8>=5.0.4; python_version >= '3.6' pytest>=4.6.10,<5 ; python_version < '3' -pytest>=5.4.2,<7 ; python_version >= '3' +pytest>=5.4.2,<8 ; python_version >= '3' coverage>=5.1,<6 pytest-expect>=1.1.0,<2 mock>=3.0.5,<4 ; python_version < '3.6' mock>=4.0.2,<5 ; python_version >= '3.6' +setuptools; python_version >= '3.12' From 82c2599585a6119e5afd26e58e754972c79f6734 Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Wed, 10 Jan 2024 13:31:09 -0500 Subject: [PATCH 19/20] tests: drop dependency on external mock module for newer python (#574) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #541 Co-authored-by: Łukasz Langa --- html5lib/tests/test_meta.py | 5 ++++- requirements-test.txt | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/html5lib/tests/test_meta.py b/html5lib/tests/test_meta.py index dd02dd7f..e02268aa 100644 --- a/html5lib/tests/test_meta.py +++ b/html5lib/tests/test_meta.py @@ -1,7 +1,10 @@ from __future__ import absolute_import, division, unicode_literals import six -from mock import Mock +try: + from unittest.mock import Mock +except ImportError: + from mock import Mock from . 
import support diff --git a/requirements-test.txt b/requirements-test.txt index 39913ee4..aca31f5e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,6 +6,5 @@ pytest>=4.6.10,<5 ; python_version < '3' pytest>=5.4.2,<8 ; python_version >= '3' coverage>=5.1,<6 pytest-expect>=1.1.0,<2 -mock>=3.0.5,<4 ; python_version < '3.6' -mock>=4.0.2,<5 ; python_version >= '3.6' +mock>=3.0.5,<4 ; python_version < '3.3' setuptools; python_version >= '3.12' From fd4f032bc090d44fb11a84b352dad7cbee0a4745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Langa?= Date: Wed, 21 Feb 2024 16:31:38 +0100 Subject: [PATCH 20/20] Constant phases (#567) * Get rid of getPhases This added a fair bit of complexity, and notable made the Phase classes dynamically generated. However, by doing this, we no longer include "process the token using the rules for" phases in the debug log. Co-authored-by: Sam Sneddon --- html5lib/_utils.py | 12 - html5lib/html5parser.py | 4349 ++++++++++++++++---------------- html5lib/tests/test_parser2.py | 1 - 3 files changed, 2172 insertions(+), 2190 deletions(-) diff --git a/html5lib/_utils.py b/html5lib/_utils.py index 9ea57942..7e23ee57 100644 --- a/html5lib/_utils.py +++ b/html5lib/_utils.py @@ -145,15 +145,3 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory - - -def memoize(func): - cache = {} - - def wrapped(*args, **kwargs): - key = (tuple(args), tuple(kwargs.items())) - if key not in cache: - cache[key] = func(*args, **kwargs) - return cache[key] - - return wrapped diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 4c2d4c75..b3c206d1 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,7 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys - -import types +from six import viewkeys from . import _inputstream from . 
import _tokenizer @@ -13,7 +11,7 @@ from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, - tokenTypes, tagTokenTypes, + tokenTypes, namespaces, htmlIntegrationPointElements, mathmlTextIntegrationPointElements, adjustForeignAttributes as adjustForeignAttributesMap, @@ -71,18 +69,6 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen return p.parseFragment(doc, container=container, **kwargs) -def method_decorator_metaclass(function): - class Decorated(type): - def __new__(meta, classname, bases, classDict): - for attributeName, attribute in classDict.items(): - if isinstance(attribute, types.FunctionType): - attribute = function(attribute) - - classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) - return Decorated - - class HTMLParser(object): """HTML parser @@ -112,6 +98,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa # Raise an exception on the first error encountered self.strict = strict + self.debug = debug if tree is None: tree = treebuilders.getTreeBuilder("etree") @@ -122,7 +109,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa self.errors = [] self.phases = {name: cls(self, self.tree) for name, cls in - getPhases(debug).items()} + _phases.items()} def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): @@ -204,6 +191,9 @@ def mainLoop(self): DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] + type_names = {value: key for key, value in tokenTypes.items()} + debug = self.debug + for token in self.tokenizer: prev_token = None new_token = token @@ -235,6 +225,17 @@ def mainLoop(self): else: phase = self.phases["inForeignContent"] + if debug: + info = {"type": type_names[type]} + if type in (StartTagToken, EndTagToken): + info["name"] = new_token['name'] + + 
self.log.append((self.tokenizer.state.__name__, + self.phase.__class__.__name__, + phase.__class__.__name__, + "process" + info["type"], + info)) + if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: @@ -396,2386 +397,2380 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -@_utils.memoize -def getPhases(debug): - def log(function): - """Logger that records which phase processes each token""" - type_names = {value: key for key, value in tokenTypes.items()} - - def wrapped(self, *args, **kwargs): - if function.__name__.startswith("process") and len(args) > 0: - token = args[0] - info = {"type": type_names[token['type']]} - if token['type'] in tagTokenTypes: - info["name"] = token['name'] - - self.parser.log.append((self.parser.tokenizer.state.__name__, - self.parser.phase.__class__.__name__, - self.__class__.__name__, - function.__name__, - info)) - return function(self, *args, **kwargs) - else: - return function(self, *args, **kwargs) - return wrapped - - def getMetaclass(use_metaclass, metaclass_func): - if use_metaclass: - return method_decorator_metaclass(metaclass_func) +class Phase(object): + """Base class for helper object that implements each phase of processing + """ + __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") + + def __init__(self, parser, tree): + self.parser = parser + self.tree = tree + self.__startTagCache = {} + self.__endTagCache = {} + + def processEOF(self): + raise NotImplementedError + + def processComment(self, token): + # For most phases the following is correct. Where it's not it will be + # overridden. 
+ self.tree.insertComment(token, self.tree.openElements[-1]) + + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) + + def processStartTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__startTagCache: + func = self.__startTagCache[name] else: - return type - - # pylint:disable=unused-argument - class Phase(with_metaclass(getMetaclass(debug, log))): - """Base class for helper object that implements each phase of processing - """ - __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") - - def __init__(self, parser, tree): - self.parser = parser - self.tree = tree - self.__startTagCache = {} - self.__endTagCache = {} - - def processEOF(self): - raise NotImplementedError - - def processComment(self, token): - # For most phases the following is correct. Where it's not it will be - # overridden. 
- self.tree.insertComment(token, self.tree.openElements[-1]) - - def processDoctype(self, token): - self.parser.parseError("unexpected-doctype") - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processSpaceCharacters(self, token): - self.tree.insertText(token["data"]) - - def processStartTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__startTagCache: - func = self.__startTagCache[name] - else: - func = self.__startTagCache[name] = self.startTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__startTagCache.pop(next(iter(self.__startTagCache))) - return func(token) - - def startTagHtml(self, token): - if not self.parser.firstStartTag and token["name"] == "html": - self.parser.parseError("non-html-root") - # XXX Need a check here to see if the first start tag token emitted is - # this token... If it's not, invoke self.parser.parseError(). 
- for attr, value in token["data"].items(): - if attr not in self.tree.openElements[0].attributes: - self.tree.openElements[0].attributes[attr] = value - self.parser.firstStartTag = False - - def processEndTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__endTagCache: - func = self.__endTagCache[name] - else: - func = self.__endTagCache[name] = self.endTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__endTagCache.pop(next(iter(self.__endTagCache))) - return func(token) - - class InitialPhase(Phase): - __slots__ = tuple() - - def processSpaceCharacters(self, token): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - correct = token["correct"] - - if (name != "html" or publicId is not None or - systemId is not None and systemId != "about:legacy-compat"): - self.parser.parseError("unknown-doctype") - - if publicId is None: - publicId = "" - - self.tree.insertDoctype(token) - - if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) - - if (not correct or token["name"] != "html" or - publicId.startswith( - ("+//silmaril//dtd html pro v0r11 19970101//", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", - "-//as//dtd html 3.0 aswedit + extensions//", - "-//ietf//dtd html 2.0 level 1//", - "-//ietf//dtd html 2.0 level 2//", - 
"-//ietf//dtd html 2.0 strict level 1//", - "-//ietf//dtd html 2.0 strict level 2//", - "-//ietf//dtd html 2.0 strict//", - "-//ietf//dtd html 2.0//", - "-//ietf//dtd html 2.1e//", - "-//ietf//dtd html 3.0//", - "-//ietf//dtd html 3.2 final//", - "-//ietf//dtd html 3.2//", - "-//ietf//dtd html 3//", - "-//ietf//dtd html level 0//", - "-//ietf//dtd html level 1//", - "-//ietf//dtd html level 2//", - "-//ietf//dtd html level 3//", - "-//ietf//dtd html strict level 0//", - "-//ietf//dtd html strict level 1//", - "-//ietf//dtd html strict level 2//", - "-//ietf//dtd html strict level 3//", - "-//ietf//dtd html strict//", - "-//ietf//dtd html//", - "-//metrius//dtd metrius presentational//", - "-//microsoft//dtd internet explorer 2.0 html strict//", - "-//microsoft//dtd internet explorer 2.0 html//", - "-//microsoft//dtd internet explorer 2.0 tables//", - "-//microsoft//dtd internet explorer 3.0 html strict//", - "-//microsoft//dtd internet explorer 3.0 html//", - "-//microsoft//dtd internet explorer 3.0 tables//", - "-//netscape comm. corp.//dtd html//", - "-//netscape comm. 
corp.//dtd strict html//", - "-//o'reilly and associates//dtd html 2.0//", - "-//o'reilly and associates//dtd html extended 1.0//", - "-//o'reilly and associates//dtd html extended relaxed 1.0//", - "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", - "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", - "-//spyglass//dtd html 2.0 extended//", - "-//sq//dtd html 2.0 hotmetal + extensions//", - "-//sun microsystems corp.//dtd hotjava html//", - "-//sun microsystems corp.//dtd hotjava strict html//", - "-//w3c//dtd html 3 1995-03-24//", - "-//w3c//dtd html 3.2 draft//", - "-//w3c//dtd html 3.2 final//", - "-//w3c//dtd html 3.2//", - "-//w3c//dtd html 3.2s draft//", - "-//w3c//dtd html 4.0 frameset//", - "-//w3c//dtd html 4.0 transitional//", - "-//w3c//dtd html experimental 19960712//", - "-//w3c//dtd html experimental 970421//", - "-//w3c//dtd w3 html//", - "-//w3o//dtd w3 html 3.0//", - "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) or - publicId in ("-//w3o//dtd w3 html strict 3.0//en//", - "-/w3c/dtd html 4.0 transitional/en", - "html") or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is None or - systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): - self.parser.compatMode = "quirks" - elif (publicId.startswith( - ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is not None): - self.parser.compatMode = "limited quirks" - - self.parser.phase = self.parser.phases["beforeHtml"] - - def anythingElse(self): + func = self.__startTagCache[name] = self.startTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: + # this makes the 
eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__startTagCache.pop(next(iter(self.__startTagCache))) + return func(token) + + def startTagHtml(self, token): + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") + # XXX Need a check here to see if the first start tag token emitted is + # this token... If it's not, invoke self.parser.parseError(). + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[0].attributes: + self.tree.openElements[0].attributes[attr] = value + self.parser.firstStartTag = False + + def processEndTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__endTagCache: + func = self.__endTagCache[name] + else: + func = self.__endTagCache[name] = self.endTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__endTagCache.pop(next(iter(self.__endTagCache))) + return func(token) + + +class InitialPhase(Phase): + __slots__ = tuple() + + def processSpaceCharacters(self, token): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): + self.parser.parseError("unknown-doctype") + + if publicId is None: + publicId = "" 
+ + self.tree.insertDoctype(token) + + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if (not correct or token["name"] != "html" or + publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. 
corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" - self.parser.phase = self.parser.phases["beforeHtml"] - - def processCharacters(self, token): - self.parser.parseError("expected-doctype-but-got-chars") - self.anythingElse() - return token - - def processStartTag(self, token): - self.parser.parseError("expected-doctype-but-got-start-tag", + elif (publicId.startswith( + ("-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is not None): + self.parser.compatMode = "limited quirks" + + 
self.parser.phase = self.parser.phases["beforeHtml"] + + def anythingElse(self): + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.anythingElse() + return token + + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEOF(self): + self.parser.parseError("expected-doctype-but-got-eof") + self.anythingElse() + return True + + +class BeforeHtmlPhase(Phase): + __slots__ = tuple() + + # helper methods + def insertHtmlElement(self): + self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.parser.phase = self.parser.phases["beforeHead"] + + # other + def processEOF(self): + self.insertHtmlElement() + return True + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.insertHtmlElement() + return token + + def processStartTag(self, token): + if token["name"] == "html": + self.parser.firstStartTag = True + self.insertHtmlElement() + return token + + def processEndTag(self, token): + if token["name"] not in ("head", "body", "html", "br"): + self.parser.parseError("unexpected-end-tag-before-html", {"name": token["name"]}) - self.anythingElse() + else: + self.insertHtmlElement() return token - def processEndTag(self, token): - self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) - self.anythingElse() - return token - def processEOF(self): - self.parser.parseError("expected-doctype-but-got-eof") - self.anythingElse() - return True +class BeforeHeadPhase(Phase): + __slots__ = tuple() 
- class BeforeHtmlPhase(Phase): - __slots__ = tuple() + def processEOF(self): + self.startTagHead(impliedTagToken("head", "StartTag")) + return True - # helper methods - def insertHtmlElement(self): - self.tree.insertRoot(impliedTagToken("html", "StartTag")) - self.parser.phase = self.parser.phases["beforeHead"] + def processSpaceCharacters(self, token): + pass - # other - def processEOF(self): - self.insertHtmlElement() - return True + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def processSpaceCharacters(self, token): - pass + def startTagHead(self, token): + self.tree.insertElement(token) + self.tree.headPointer = self.tree.openElements[-1] + self.parser.phase = self.parser.phases["inHead"] - def processCharacters(self, token): - self.insertHtmlElement() - return token + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processStartTag(self, token): - if token["name"] == "html": - self.parser.firstStartTag = True - self.insertHtmlElement() - return token + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processEndTag(self, token): - if token["name"] not in ("head", "body", "html", "br"): - self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) - else: - self.insertHtmlElement() - return token + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) - class BeforeHeadPhase(Phase): - __slots__ = tuple() + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther - def processEOF(self): - self.startTagHead(impliedTagToken("head", 
"StartTag")) - return True + endTagHandler = _utils.MethodDispatcher([ + (("head", "body", "html", "br"), endTagImplyHead) + ]) + endTagHandler.default = endTagOther - def processSpaceCharacters(self, token): - pass - def processCharacters(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token +class InHeadPhase(Phase): + __slots__ = tuple() - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + # the real thing + def processEOF(self): + self.anythingElse() + return True - def startTagHead(self, token): - self.tree.insertElement(token) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + def processCharacters(self, token): + self.anythingElse() + return token - def startTagOther(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def endTagImplyHead(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") - def endTagOther(self, token): - self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + def startTagBaseLinkCommand(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("head", startTagHead) - ]) - startTagHandler.default = startTagOther + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif ("content" in attributes and + "http-equiv" in 
attributes and + attributes["http-equiv"].lower() == "content-type"): + # Encoding it as UTF-8 here is a hack, as really we should pass + # the abstract Unicode string, and just use the + # ContentAttrParser on that, but using UTF-8 allows all chars + # to be encoded and as a ASCII-superset works. + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataRawtext(token, "RCDATA") + + def startTagNoFramesStyle(self, token): + # Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagNoscript(self, token): + if self.parser.scripting: + self.parser.parseRCDataRawtext(token, "RAWTEXT") + else: + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inHeadNoscript"] - endTagHandler = _utils.MethodDispatcher([ - (("head", "body", "html", "br"), endTagImplyHead) - ]) - endTagHandler.default = endTagOther + def startTagScript(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState + self.parser.originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["text"] + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s" % node.name + self.parser.phase = self.parser.phases["afterHead"] + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.endTagHead(impliedTagToken("head")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("title", startTagTitle), + (("noframes", "style"), 
startTagNoFramesStyle), + ("noscript", startTagNoscript), + ("script", startTagScript), + (("base", "basefont", "bgsound", "command", "link"), + startTagBaseLinkCommand), + ("meta", startTagMeta), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("head", endTagHead), + (("br", "html", "body"), endTagHtmlBodyBr) + ]) + endTagHandler.default = endTagOther + + +class InHeadNoscriptPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.parser.parseError("eof-in-head-noscript") + self.anythingElse() + return True + + def processComment(self, token): + return self.parser.phases["inHead"].processComment(token) + + def processCharacters(self, token): + self.parser.parseError("char-in-head-noscript") + self.anythingElse() + return token + + def processSpaceCharacters(self, token): + return self.parser.phases["inHead"].processSpaceCharacters(token) + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBaseLinkCommand(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagHeadNoscript(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagNoscript(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "noscript", "Expected noscript got %s" % node.name + self.parser.phase = self.parser.phases["inHead"] + + def endTagBr(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + # Caller must raise parse error first! 
+ self.endTagNoscript(impliedTagToken("noscript")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), + (("head", "noscript"), startTagHeadNoscript), + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("noscript", endTagNoscript), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + + +class AfterHeadPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.anythingElse() + return True + + def processCharacters(self, token): + self.anythingElse() + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inBody"] - class InHeadPhase(Phase): - __slots__ = tuple() + def startTagFrameset(self, token): + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] - # the real thing - def processEOF(self): - self.anythingElse() - return True + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break - def processCharacters(self, token): - self.anythingElse() - return token + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + 
self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.parser.phase = self.parser.phases["inBody"] + self.parser.framesetOK = True + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", + "style", "title"), + startTagFromHead), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + endTagHtmlBodyBr)]) + endTagHandler.default = endTagOther + + +class InBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody + # the really-really-really-very crazy mode + __slots__ = ("processSpaceCharacters",) + + def __init__(self, *args, **kwargs): + super(InBodyPhase, self).__init__(*args, **kwargs) + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre + + def isMatchingFormattingElement(self, node1, node2): + return (node1.name == node2.name and + node1.namespace == node2.namespace and + node1.attributes == node2.attributes) + + # helper + def addFormattingElement(self, token): + self.tree.insertElement(token) + element = self.tree.openElements[-1] - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + matchingElements = [] + for node in self.tree.activeFormattingElements[::-1]: + if node is Marker: + break + elif self.isMatchingFormattingElement(node, element): + matchingElements.append(node) + + assert len(matchingElements) <= 3 + if len(matchingElements) == 3: + self.tree.activeFormattingElements.remove(matchingElements[-1]) + self.tree.activeFormattingElements.append(element) + + # the real deal + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in 
allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + # Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of
                  , , and