From 183d8a026b0cd83e80dac5c7347457b7420e5091 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 29 Dec 2020 14:44:34 +0000 Subject: [PATCH 1/6] Consistency: consume a single character at a time during attribute name state --- html5lib/_tokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 4748a197..f65cbb21 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -891,8 +891,7 @@ def attributeNameState(self): if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: - self.currentToken["data"][-1][0] += data +\ - self.stream.charsUntil(asciiLetters, True) + self.currentToken["data"][-1][0] += data leavingThisState = False elif data == ">": # XXX If we emit here the attributes are converted to a dict From 2e8637367ad001710ac5fb7b41191ace3f43f458 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 29 Dec 2020 16:53:28 +0000 Subject: [PATCH 2/6] Refactor: pretranslate lowercase element and attribute names --- html5lib/_tokenizer.py | 62 ++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index f65cbb21..0b4200a7 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -233,7 +233,6 @@ def emitCurrentToken(self): token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): - token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["StartTag"]: raw = token["data"] data = attributeMap(raw) @@ -380,7 +379,8 @@ def tagOpenState(self): self.state = self.closeTagOpenState elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"], - "name": data, "data": [], + "name": data.translate(asciiUpper2Lower), + "data": [], "selfClosing": False, "selfClosingAcknowledged": False} self.state = self.tagNameState @@ -410,7 +410,8 @@ def tagOpenState(self): def closeTagOpenState(self): data = self.stream.char() if data in asciiLetters: - self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + self.currentToken = {"type": tokenTypes["EndTag"], + "name": data.translate(asciiUpper2Lower), "data": [], "selfClosing": False} self.state = self.tagNameState elif data == ">": @@ -448,7 +449,7 @@ def tagNameState(self): "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" else: - self.currentToken["name"] += data + self.currentToken["name"] += data.translate(asciiUpper2Lower) # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True @@ -467,7 +468,7 @@ def rcdataLessThanSignState(self): def rcdataEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: - self.temporaryBuffer += data + self.temporaryBuffer += data.translate(asciiUpper2Lower) self.state = self.rcdataEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - if self.temporaryBuffer.lower() == "script": + if self.temporaryBuffer == "script": self.state = self.scriptDataDoubleEscapedState else: self.state = self.scriptDataEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - self.temporaryBuffer += data + self.temporaryBuffer += data.translate(asciiUpper2Lower) else: self.stream.unget(data) self.state = self.scriptDataEscapedState @@ -842,13 +843,13 @@ def scriptDataDoubleEscapeEndState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - if self.temporaryBuffer.lower() == "script": + if self.temporaryBuffer == "script": self.state = self.scriptDataEscapedState else: self.state = self.scriptDataDoubleEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - self.temporaryBuffer += data + self.temporaryBuffer += data.translate(asciiUpper2Lower) else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState @@ -859,7 +860,8 @@ def beforeAttributeNameState(self): if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: - self.currentToken["data"].append([data, ""]) + attr_name = data.translate(asciiUpper2Lower) + self.currentToken["data"].append([attr_name, ""]) self.state = self.attributeNameState elif data == ">": self.emitCurrentToken() @@ -891,7 +893,7 @@ def attributeNameState(self): if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: - self.currentToken["data"][-1][0] += data + self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) leavingThisState = False elif data == ">": # XXX If we emit here the attributes are converted to a dict @@ -918,15 +920,13 @@ def attributeNameState(self): "data": "eof-in-attribute-name"}) self.state = self.dataState else: - self.currentToken["data"][-1][0] += data + self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. - self.currentToken["data"][-1][0] = ( - self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -946,7 +946,8 @@ def afterAttributeNameState(self): elif data == ">": self.emitCurrentToken() elif data in asciiLetters: - self.currentToken["data"].append([data, ""]) + attr_name = data.translate(asciiUpper2Lower) + self.currentToken["data"].append([attr_name, ""]) self.state = self.attributeNameState elif data == "/": self.state = self.selfClosingStartTagState @@ -1340,17 +1341,15 @@ def beforeDoctypeNameState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] = data + self.currentToken["name"] = data.translate(asciiUpper2Lower) self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif data == ">": - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": @@ -1362,11 +1361,10 @@ def doctypeNameState(self): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] += data + self.currentToken["name"] += data.translate(asciiUpper2Lower) return True def afterDoctypeNameState(self): From 8f96b1745382a7ac650e06fb57263e285c808c93 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 29 Dec 2020 17:59:05 +0000 Subject: [PATCH 3/6] Restore self.currentToken safety check --- html5lib/_tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 0b4200a7..0d2c9a00 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -477,7 +477,7 @@ def rcdataEndTagOpenState(self): return True def rcdataEndTagNameState(self): - appropriate = self.currentToken["name"] == self.temporaryBuffer + appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer data = self.stream.char() if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], @@ -527,7 +527,7 @@ def rawtextEndTagOpenState(self): return True def rawtextEndTagNameState(self): - appropriate = self.currentToken["name"] == self.temporaryBuffer + appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer data = self.stream.char() if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], @@ -580,7 +580,7 @@ def scriptDataEndTagOpenState(self): return True def scriptDataEndTagNameState(self): - appropriate = self.currentToken["name"] == self.temporaryBuffer + appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer data = self.stream.char() if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], @@ -716,7 +716,7 @@ def scriptDataEscapedEndTagOpenState(self): return True def scriptDataEscapedEndTagNameState(self): - appropriate = self.currentToken["name"] == self.temporaryBuffer + appropriate = self.currentToken and self.currentToken["name"] == self.temporaryBuffer data = self.stream.char() if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], From a912842e76343ef570e82f19ec4c78c85b21f327 Mon Sep 17 00:00:00 2001 From: James Addison Date: Wed, 30 Dec 2020 12:18:59 +0000 Subject: [PATCH 4/6] Alternate approach: do not pretranslate temporary buffered data --- html5lib/_tokenizer.py | 62 ++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 0d2c9a00..e603fcbd 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -468,7 +468,7 @@ def rcdataLessThanSignState(self): def rcdataEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data self.state = self.rcdataEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - if self.temporaryBuffer == "script": + if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataDoubleEscapedState else: self.state = self.scriptDataEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataEscapedState @@ -843,13 +847,13 @@ def scriptDataDoubleEscapeEndState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - if self.temporaryBuffer == "script": + if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataEscapedState else: self.state = self.scriptDataDoubleEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) - self.temporaryBuffer += data.translate(asciiUpper2Lower) + self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState From f9f370e2351c353d1c89d2f0f8cc8ce03b5334fe Mon Sep 17 00:00:00 2001 From: James Addison Date: Wed, 30 Dec 2020 12:36:57 +0000 Subject: [PATCH 5/6] Consistency: character consumption within double-escaped state --- html5lib/_tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index e603fcbd..86c750aa 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -781,7 +781,9 @@ def scriptDataDoubleEscapedState(self): "eof-in-script-in-script"}) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + chars = self.stream.charsUntil(("<", "-", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) return True def scriptDataDoubleEscapedDashState(self): From fa626715196d487a58cc9be96132520b652820be Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 5 Jan 2021 14:22:22 +0000 Subject: [PATCH 6/6] Check ASCII character data condition before performing temporary buffer data translation --- html5lib/_tokenizer.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 86c750aa..b037a9b4 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -477,9 +477,13 @@ def rcdataEndTagOpenState(self): return True def rcdataEndTagNameState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + return True + name = self.temporaryBuffer.translate(asciiUpper2Lower) appropriate = self.currentToken and self.currentToken["name"] == name - data = self.stream.char() if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": name, @@ -496,8 +500,6 @@ def rcdataEndTagNameState(self): "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState - elif data in asciiLetters: - self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "