Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Tokenizer: pretranslate lowercase element and attribute names #520

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
93 changes: 52 additions & 41 deletions html5lib/_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ def emitCurrentToken(self):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
Expand Down Expand Up @@ -380,7 +379,8 @@ def tagOpenState(self):
self.state = self.closeTagOpenState
elif data in asciiLetters:
self.currentToken = {"type": tokenTypes["StartTag"],
"name": data, "data": [],
"name": data.translate(asciiUpper2Lower),
"data": [],
"selfClosing": False,
"selfClosingAcknowledged": False}
self.state = self.tagNameState
Expand Down Expand Up @@ -410,7 +410,8 @@ def tagOpenState(self):
def closeTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
self.currentToken = {"type": tokenTypes["EndTag"],
"name": data.translate(asciiUpper2Lower),
"data": [], "selfClosing": False}
self.state = self.tagNameState
elif data == ">":
Expand Down Expand Up @@ -448,7 +449,7 @@ def tagNameState(self):
"data": "invalid-codepoint"})
self.currentToken["name"] += "\uFFFD"
else:
self.currentToken["name"] += data
self.currentToken["name"] += data.translate(asciiUpper2Lower)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm very skeptical about this being a perf win, versus it being in emitCurrentToken. What do the benchmarks say?

Yes, emitCurrentToken's lowercasing becomes redundant in the RCDATA/RAWTEXT/script cases, but I expect the cost of this will negate any gains.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair, yep - especially for short element names it seems likely that the translate method call overhead (especially if called repeatedly) could negate any benefits provided by simpler comparisons.

I hadn't assessed the performance of this code path separately; it felt worth maintaining consistency but I don't believe there's a noticeable performance change.

# (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy)
return True
Expand Down Expand Up @@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self):
return True

def rcdataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -526,26 +530,29 @@ def rawtextEndTagOpenState(self):
return True

def rawtextEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -579,26 +586,29 @@ def scriptDataEndTagOpenState(self):
return True

def scriptDataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -715,26 +725,29 @@ def scriptDataEscapedEndTagOpenState(self):
return True

def scriptDataEscapedEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -776,7 +789,9 @@ def scriptDataDoubleEscapedState(self):
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
chars = self.stream.charsUntil(("<", "-", "\u0000"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True

def scriptDataDoubleEscapedDashState(self):
Expand Down Expand Up @@ -859,7 +874,8 @@ def beforeAttributeNameState(self):
if data in spaceCharacters:
self.stream.charsUntil(spaceCharacters, True)
elif data in asciiLetters:
self.currentToken["data"].append([data, ""])
attr_name = data.translate(asciiUpper2Lower)
self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == ">":
self.emitCurrentToken()
Expand Down Expand Up @@ -891,8 +907,7 @@ def attributeNameState(self):
if data == "=":
self.state = self.beforeAttributeValueState
elif data in asciiLetters:
self.currentToken["data"][-1][0] += data +\
self.stream.charsUntil(asciiLetters, True)
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False
elif data == ">":
# XXX If we emit here the attributes are converted to a dict
Expand All @@ -919,15 +934,13 @@ def attributeNameState(self):
"data": "eof-in-attribute-name"})
self.state = self.dataState
else:
self.currentToken["data"][-1][0] += data
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False

if leavingThisState:
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
Expand All @@ -947,7 +960,8 @@ def afterAttributeNameState(self):
elif data == ">":
self.emitCurrentToken()
elif data in asciiLetters:
self.currentToken["data"].append([data, ""])
attr_name = data.translate(asciiUpper2Lower)
self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == "/":
self.state = self.selfClosingStartTagState
Expand Down Expand Up @@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self):
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.currentToken["name"] = data
self.currentToken["name"] = data.translate(asciiUpper2Lower)
self.state = self.doctypeNameState
return True

def doctypeNameState(self):
data = self.stream.char()
if data in spaceCharacters:
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.state = self.afterDoctypeNameState
elif data == ">":
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == "\u0000":
Expand All @@ -1363,11 +1375,10 @@ def doctypeNameState(self):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype-name"})
self.currentToken["correct"] = False
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.currentToken["name"] += data
self.currentToken["name"] += data.translate(asciiUpper2Lower)
return True

def afterDoctypeNameState(self):
Expand Down