diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py
index 402b722e..9bec2076 100644
--- a/html5lib/treewalkers/__init__.py
+++ b/html5lib/treewalkers/__init__.py
@@ -21,20 +21,25 @@
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
- Args:
- treeType (str): the name of the tree type required (case-insensitive).
- Supported values are:
-
- - "dom": The xml.dom.minidom DOM implementation
- - "etree": A generic walker for tree implementations exposing an
- elementtree-like interface (known to work with
- ElementTree, cElementTree and lxml.etree).
- - "lxml": Optimized walker for lxml.etree
- - "genshi": a Genshi stream
-
- Implementation: A module implementing the tree type e.g.
- xml.etree.ElementTree or cElementTree (Currently applies to the
- "etree" tree type only).
+ :arg str treeType: the name of the tree type required (case-insensitive).
+ Supported values are:
+
+ * "dom": The xml.dom.minidom DOM implementation
+ * "etree": A generic walker for tree implementations exposing an
+ elementtree-like interface (known to work with ElementTree,
+ cElementTree and lxml.etree).
+ * "lxml": Optimized walker for lxml.etree
+ * "genshi": a Genshi stream
+
+ :arg implementation: A module implementing the tree type e.g.
+ xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
+ tree type only).
+
+ :arg kwargs: keyword arguments passed to the etree walker--for other
+ walkers, this has no effect
+
+ :returns: a TreeWalker class
+
"""
treeType = treeType.lower()
@@ -73,7 +78,13 @@ def concatenateCharacterTokens(tokens):
def pprint(walker):
- """Pretty printer for tree walkers"""
+ """Pretty printer for tree walkers
+
+ Takes a TreeWalker instance and pretty prints the output of walking the tree.
+
+ :arg walker: a TreeWalker instance
+
+ """
output = []
indent = 0
for token in concatenateCharacterTokens(walker):
diff --git a/html5lib/treewalkers/base.py b/html5lib/treewalkers/base.py
index 36e1ba24..80c474c4 100644
--- a/html5lib/treewalkers/base.py
+++ b/html5lib/treewalkers/base.py
@@ -18,16 +18,48 @@
class TreeWalker(object):
+ """Walks a tree yielding tokens
+
+ Tokens are dicts that all have a ``type`` field specifying the type of the
+ token.
+
+ """
def __init__(self, tree):
+ """Creates a TreeWalker
+
+ :arg tree: the tree to walk
+
+ """
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
+ """Generates an error token with the given message
+
+ :arg msg: the error message
+
+ :returns: SerializeError token
+
+ """
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
+ """Generates an EmptyTag token
+
+ :arg namespace: the namespace of the token--can be ``None``
+
+ :arg name: the name of the element
+
+ :arg attrs: the attributes of the element as a dict
+
+ :arg hasChildren: whether or not to yield a SerializeError because
+ this tag shouldn't have children
+
+ :returns: generator of an EmptyTag token and, possibly, a SerializeError token
+
+ """
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
@@ -35,17 +67,61 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
+ """Generates a StartTag token
+
+ :arg namespace: the namespace of the token--can be ``None``
+
+ :arg name: the name of the element
+
+ :arg attrs: the attributes of the element as a dict
+
+ :returns: StartTag token
+
+ """
return {"type": "StartTag",
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
+ """Generates an EndTag token
+
+ :arg namespace: the namespace of the token--can be ``None``
+
+ :arg name: the name of the element
+
+ :returns: EndTag token
+
+ """
return {"type": "EndTag",
"name": name,
"namespace": namespace}
def text(self, data):
+ """Generates SpaceCharacters and Characters tokens
+
+ Depending on what's in the data, this generates zero or more
+ ``SpaceCharacters`` and ``Characters`` tokens.
+
+ For example:
+
+ >>> from html5lib.treewalkers.base import TreeWalker
+ >>> # Give it an empty tree just so it instantiates
+ >>> walker = TreeWalker([])
+ >>> list(walker.text(''))
+ []
+ >>> list(walker.text(' '))
+ [{u'data': ' ', u'type': u'SpaceCharacters'}]
+ >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
+ [{u'data': ' ', u'type': u'SpaceCharacters'},
+ {u'data': u'abc', u'type': u'Characters'},
+ {u'data': u' ', u'type': u'SpaceCharacters'}]
+
+ :arg data: the text data
+
+ :returns: zero or more ``SpaceCharacters`` and ``Characters`` tokens
+
+ """
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
@@ -60,18 +136,44 @@ def text(self, data):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
+ """Generates a Comment token
+
+ :arg data: the comment
+
+ :returns: Comment token
+
+ """
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
+ """Generates a Doctype token
+
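+ :arg name: the name of the doctype
+
+ :arg publicId: the public id of the doctype--defaults to ``None``
+
+ :arg systemId: the system id of the doctype--defaults to ``None``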
+
+ :returns: the Doctype token
+
+ """
return {"type": "Doctype",
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
+ """Generates an Entity token
+
+ :arg name: the entity name
+
+ :returns: an Entity token
+
+ """
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
+ """Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)