From f748f3dc80afe0c38d7225bdb824f3bdd031862a Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 30 Nov 2017 09:19:36 -0500 Subject: [PATCH] First pass at documenting serializer --- html5lib/serializer.py | 163 ++++++++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 44 deletions(-) diff --git a/html5lib/serializer.py b/html5lib/serializer.py index 103dd206..d6b7105d 100644 --- a/html5lib/serializer.py +++ b/html5lib/serializer.py @@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc): else: return xmlcharrefreplace_errors(exc) + register_error("htmlentityreplace", htmlentityreplace_errors) def serialize(input, tree="etree", encoding=None, **serializer_opts): + """Serializes the input token stream using the specified treewalker + + :arg input: the token stream to serialize + + :arg tree: the treewalker to use + + :arg encoding: the encoding to use + + :arg serializer_opts: any options to pass to the + :py:class:`html5lib.serializer.HTMLSerializer` that gets created + + :returns: the tree serialized as a string + + Example: + + >>> from html5lib.html5parser import parse + >>> from html5lib.serializer import serialize + >>> token_stream = parse('

Hi!

') + >>> serialize(token_stream, omit_optional_tags=False) + '

Hi!

' + + """ # XXX: Should we cache this? walker = treewalkers.getTreeWalker(tree) s = HTMLSerializer(**serializer_opts) @@ -110,50 +133,83 @@ class HTMLSerializer(object): "strip_whitespace", "sanitize") def __init__(self, **kwargs): - """Initialize HTMLSerializer. - - Keyword options (default given first unless specified) include: - - inject_meta_charset=True|False - Whether it insert a meta element to define the character set of the - document. - quote_attr_values="legacy"|"spec"|"always" - Whether to quote attribute values that don't require quoting - per legacy browser behaviour, when required by the standard, or always. - quote_char=u'"'|u"'" - Use given quote character for attribute quoting. Default is to - use double quote unless attribute value contains a double quote, - in which case single quotes are used instead. - escape_lt_in_attrs=False|True - Whether to escape < in attribute values. - escape_rcdata=False|True - Whether to escape characters that need to be escaped within normal - elements within rcdata elements such as style. - resolve_entities=True|False - Whether to resolve named character entities that appear in the - source tree. The XML predefined entities < > & " ' - are unaffected by this setting. - strip_whitespace=False|True - Whether to remove semantically meaningless whitespace. (This - compresses all whitespace to a single space except within pre.) - minimize_boolean_attributes=True|False - Shortens boolean attributes to give just the attribute value, - for example becomes . - use_trailing_solidus=False|True - Includes a close-tag slash at the end of the start tag of void - elements (empty elements whose end tag is forbidden). E.g.
. - space_before_trailing_solidus=True|False - Places a space immediately before the closing slash in a tag - using a trailing solidus. E.g.
. Requires use_trailing_solidus. - sanitize=False|True - Strip all unsafe or unknown constructs from output. - See `html5lib user documentation`_ - omit_optional_tags=True|False - Omit start/end tags that are optional. - alphabetical_attributes=False|True - Reorder attributes to be in alphabetical order. - - .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """Initialize HTMLSerializer + + :arg inject_meta_charset: Whether or not to inject the meta charset. + + Defaults to ``True``. + + :arg quote_attr_values: Whether to quote attribute values that don't + require quoting per legacy browser behavior (``"legacy"``), when + required by the standard (``"spec"``), or always (``"always"``). + + Defaults to ``"legacy"``. + + :arg quote_char: Use given quote character for attribute quoting. + + Defaults to ``"`` which will use double quotes unless attribute + value contains a double quote, in which case single quotes are + used. + + :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute + values. + + Defaults to ``False``. + + :arg escape_rcdata: Whether to escape characters that need to be + escaped within normal elements within rcdata elements such as + style. + + Defaults to ``False``. + + :arg resolve_entities: Whether to resolve named character entities that + appear in the source tree. The XML predefined entities < > + & " ' are unaffected by this setting. + + Defaults to ``True``. + + :arg strip_whitespace: Whether to remove semantically meaningless + whitespace. (This compresses all whitespace to a single space + except within ``pre``.) + + Defaults to ``False``. + + :arg minimize_boolean_attributes: Shortens boolean attributes to give + just the attribute value, for example:: + + + + becomes:: + + + + Defaults to ``True``. + + :arg use_trailing_solidus: Includes a close-tag slash at the end of the + start tag of void elements (empty elements whose end tag is + forbidden). E.g. ``
``. + + Defaults to ``False``. + + :arg space_before_trailing_solidus: Places a space immediately before + the closing slash in a tag using a trailing solidus. E.g. + ``
``. Requires ``use_trailing_solidus=True``. + + Defaults to ``True``. + + :arg sanitize: Strip all unsafe or unknown constructs from output. + See :py:class:`html5lib.filters.sanitizer.Filter`. + + Defaults to ``False``. + + :arg omit_optional_tags: Omit start/end tags that are optional. + + Defaults to ``True``. + + :arg alphabetical_attributes: Reorder attributes to be in alphabetical order. + + Defaults to ``False``. + """ unexpected_args = frozenset(kwargs) - frozenset(self.options) if len(unexpected_args) > 0: @@ -317,6 +373,25 @@ def serialize(self, treewalker, encoding=None): self.serializeError(token["data"]) def render(self, treewalker, encoding=None): + """Serializes the stream from the treewalker into a string + + :arg treewalker: the treewalker to serialize + + :arg encoding: the string encoding to use + + :returns: the serialized tree + + Example: + + >>> from html5lib import parse, getTreeWalker + >>> from html5lib.serializer import HTMLSerializer + >>> token_stream = parse('Hi!') + >>> walker = getTreeWalker('etree') + >>> serializer = HTMLSerializer(omit_optional_tags=False) + >>> serializer.render(walker(token_stream)) + 'Hi!' + + """ if encoding: return b"".join(list(self.serialize(treewalker, encoding))) else: