diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
index 4748a197..b037a9b4 100644
--- a/html5lib/_tokenizer.py
+++ b/html5lib/_tokenizer.py
@@ -233,7 +233,6 @@ def emitCurrentToken(self):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
- token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
@@ -380,7 +379,8 @@ def tagOpenState(self):
self.state = self.closeTagOpenState
elif data in asciiLetters:
self.currentToken = {"type": tokenTypes["StartTag"],
- "name": data, "data": [],
+ "name": data.translate(asciiUpper2Lower),
+ "data": [],
"selfClosing": False,
"selfClosingAcknowledged": False}
self.state = self.tagNameState
@@ -410,7 +410,8 @@ def tagOpenState(self):
def closeTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
- self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": data.translate(asciiUpper2Lower),
"data": [], "selfClosing": False}
self.state = self.tagNameState
elif data == ">":
@@ -448,7 +449,7 @@ def tagNameState(self):
"data": "invalid-codepoint"})
self.currentToken["name"] += "\uFFFD"
else:
- self.currentToken["name"] += data
+ self.currentToken["name"] += data.translate(asciiUpper2Lower)
# (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy)
return True
@@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self):
return True
def rcdataEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ return True
+
+ name = self.temporaryBuffer.translate(asciiUpper2Lower)
+ appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "" + self.temporaryBuffer})
@@ -526,26 +530,29 @@ def rawtextEndTagOpenState(self):
return True
def rawtextEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ return True
+
+ name = self.temporaryBuffer.translate(asciiUpper2Lower)
+ appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "" + self.temporaryBuffer})
@@ -579,26 +586,29 @@ def scriptDataEndTagOpenState(self):
return True
def scriptDataEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ return True
+
+ name = self.temporaryBuffer.translate(asciiUpper2Lower)
+ appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "" + self.temporaryBuffer})
@@ -715,26 +725,29 @@ def scriptDataEscapedEndTagOpenState(self):
return True
def scriptDataEscapedEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ return True
+
+ name = self.temporaryBuffer.translate(asciiUpper2Lower)
+ appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
+ "name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "" + self.temporaryBuffer})
@@ -776,7 +789,9 @@ def scriptDataDoubleEscapedState(self):
"eof-in-script-in-script"})
self.state = self.dataState
else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ chars = self.stream.charsUntil(("<", "-", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
return True
def scriptDataDoubleEscapedDashState(self):
@@ -859,7 +874,8 @@ def beforeAttributeNameState(self):
if data in spaceCharacters:
self.stream.charsUntil(spaceCharacters, True)
elif data in asciiLetters:
- self.currentToken["data"].append([data, ""])
+ attr_name = data.translate(asciiUpper2Lower)
+ self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == ">":
self.emitCurrentToken()
@@ -891,8 +907,7 @@ def attributeNameState(self):
if data == "=":
self.state = self.beforeAttributeValueState
elif data in asciiLetters:
- self.currentToken["data"][-1][0] += data +\
- self.stream.charsUntil(asciiLetters, True)
+ self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False
elif data == ">":
# XXX If we emit here the attributes are converted to a dict
@@ -919,15 +934,13 @@ def attributeNameState(self):
"data": "eof-in-attribute-name"})
self.state = self.dataState
else:
- self.currentToken["data"][-1][0] += data
+ self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False
if leavingThisState:
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
- self.currentToken["data"][-1][0] = (
- self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
@@ -947,7 +960,8 @@ def afterAttributeNameState(self):
elif data == ">":
self.emitCurrentToken()
elif data in asciiLetters:
- self.currentToken["data"].append([data, ""])
+ attr_name = data.translate(asciiUpper2Lower)
+ self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == "/":
self.state = self.selfClosingStartTagState
@@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self):
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
- self.currentToken["name"] = data
+ self.currentToken["name"] = data.translate(asciiUpper2Lower)
self.state = self.doctypeNameState
return True
def doctypeNameState(self):
data = self.stream.char()
if data in spaceCharacters:
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.state = self.afterDoctypeNameState
elif data == ">":
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == "\u0000":
@@ -1363,11 +1375,10 @@ def doctypeNameState(self):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype-name"})
self.currentToken["correct"] = False
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
- self.currentToken["name"] += data
+ self.currentToken["name"] += data.translate(asciiUpper2Lower)
return True
def afterDoctypeNameState(self):