diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 4748a197..b037a9b4 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -233,7 +233,6 @@ def emitCurrentToken(self): token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): - token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["StartTag"]: raw = token["data"] data = attributeMap(raw) @@ -380,7 +379,8 @@ def tagOpenState(self): self.state = self.closeTagOpenState elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"], - "name": data, "data": [], + "name": data.translate(asciiUpper2Lower), + "data": [], "selfClosing": False, "selfClosingAcknowledged": False} self.state = self.tagNameState @@ -410,7 +410,8 @@ def tagOpenState(self): def closeTagOpenState(self): data = self.stream.char() if data in asciiLetters: - self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + self.currentToken = {"type": tokenTypes["EndTag"], + "name": data.translate(asciiUpper2Lower), "data": [], "selfClosing": False} self.state = self.tagNameState elif data == ">": @@ -448,7 +449,7 @@ def tagNameState(self): "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" else: - self.currentToken["name"] += data + self.currentToken["name"] += data.translate(asciiUpper2Lower) # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True @@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self): return True def rcdataEndTagNameState(self): - appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + return True + + name = self.temporaryBuffer.translate(asciiUpper2Lower) + appropriate = self.currentToken and self.currentToken["name"] == name if data in spaceCharacters and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.state = self.beforeAttributeNameState elif data == "/" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.state = self.selfClosingStartTagState elif data == ">" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState - elif data in asciiLetters: - self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState - elif data in asciiLetters: - self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState - elif data in asciiLetters: - self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], - "name": self.temporaryBuffer, + "name": name, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState - elif data in asciiLetters: - self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": self.emitCurrentToken() @@ -891,8 +907,7 @@ def attributeNameState(self): if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: - self.currentToken["data"][-1][0] += data +\ - self.stream.charsUntil(asciiLetters, True) + self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) leavingThisState = False elif data == ">": # XXX If we emit here the attributes are converted to a dict @@ -919,15 +934,13 @@ def attributeNameState(self): "data": "eof-in-attribute-name"}) self.state = self.dataState else: - self.currentToken["data"][-1][0] += data + self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. - self.currentToken["data"][-1][0] = ( - self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -947,7 +960,8 @@ def afterAttributeNameState(self): elif data == ">": self.emitCurrentToken() elif data in asciiLetters: - self.currentToken["data"].append([data, ""]) + attr_name = data.translate(asciiUpper2Lower) + self.currentToken["data"].append([attr_name, ""]) self.state = self.attributeNameState elif data == "/": self.state = self.selfClosingStartTagState @@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self): self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] = data + self.currentToken["name"] = data.translate(asciiUpper2Lower) self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif data == ">": - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": @@ -1363,11 +1375,10 @@ def doctypeNameState(self): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: - self.currentToken["name"] += data + self.currentToken["name"] += data.translate(asciiUpper2Lower) return True def afterDoctypeNameState(self):