22
22
23
23
24
24
def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True, scripting=False):
    """Parse a string or file-like object into a tree

    doc - the string or file-like object to parse

    treebuilder - name handed to treebuilders.getTreeBuilder to select the
    tree builder used by the parser

    encoding - passed through unchanged to HTMLParser.parse

    namespaceHTMLElements - passed through unchanged to the HTMLParser
    constructor

    scripting - passed through unchanged to HTMLParser.parse; treat noscript
    elements as if javascript was turned on
    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parse(doc, encoding=encoding, scripting=scripting)
30
30
31
31
32
32
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True, scripting=False):
    """Parse a string or file-like object into a tree fragment

    doc - the string or file-like object to parse

    container - passed through unchanged to HTMLParser.parseFragment
    (the name of the element the fragment is parsed in the context of)

    treebuilder - name handed to treebuilders.getTreeBuilder to select the
    tree builder used by the parser

    encoding - passed through unchanged to HTMLParser.parseFragment

    namespaceHTMLElements - passed through unchanged to the HTMLParser
    constructor

    scripting - passed through unchanged to HTMLParser.parseFragment; treat
    noscript elements as if javascript was turned on
    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, encoding=encoding,
                           scripting=scripting)
37
37
38
38
39
39
def method_decorator_metaclass (function ):
@@ -78,11 +78,12 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
78
78
self .phases = dict ([(name , cls (self , self .tree )) for name , cls in
79
79
getPhases (debug ).items ()])
80
80
81
- def _parse (self , stream , innerHTML = False , container = "div" ,
82
- encoding = None , parseMeta = True , useChardet = True , ** kwargs ):
81
+ def _parse (self , stream , innerHTML = False , container = "div" , encoding = None ,
82
+ parseMeta = True , useChardet = True , scripting = False , ** kwargs ):
83
83
84
84
self .innerHTMLMode = innerHTML
85
85
self .container = container
86
+ self .scripting = scripting
86
87
self .tokenizer = self .tokenizer_class (stream , encoding = encoding ,
87
88
parseMeta = parseMeta ,
88
89
useChardet = useChardet ,
@@ -221,7 +222,8 @@ def normalizedTokens(self):
221
222
for token in self .tokenizer :
222
223
yield self .normalizeToken (token )
223
224
224
- def parse (self , stream , encoding = None , parseMeta = True , useChardet = True ):
225
+ def parse (self , stream , encoding = None , parseMeta = True ,
226
+ useChardet = True , scripting = False ):
225
227
"""Parse a HTML document into a well-formed tree
226
228
227
229
stream - a filelike object or string containing the HTML to be parsed
@@ -230,13 +232,15 @@ def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
230
232
the encoding. If specified, that encoding will be used,
231
233
regardless of any BOM or later declaration (such as in a meta
232
234
element)
235
+
236
+ scripting - treat noscript elements as if javascript was turned on
233
237
"""
234
238
self ._parse (stream , innerHTML = False , encoding = encoding ,
235
- parseMeta = parseMeta , useChardet = useChardet )
239
+ parseMeta = parseMeta , useChardet = useChardet , scripting = scripting )
236
240
return self .tree .getDocument ()
237
241
238
242
def parseFragment (self , stream , container = "div" , encoding = None ,
239
- parseMeta = False , useChardet = True ):
243
+ parseMeta = False , useChardet = True , scripting = False ):
240
244
"""Parse a HTML fragment into a well-formed tree fragment
241
245
242
246
container - name of the element we're setting the innerHTML property
@@ -248,8 +252,11 @@ def parseFragment(self, stream, container="div", encoding=None,
248
252
the encoding. If specified, that encoding will be used,
249
253
regardless of any BOM or later declaration (such as in a meta
250
254
element)
255
+
256
+ scripting - treat noscript elements as if javascript was turned on
251
257
"""
252
- self ._parse (stream , True , container = container , encoding = encoding )
258
+ self ._parse (stream , True , container = container ,
259
+ encoding = encoding , scripting = scripting )
253
260
return self .tree .getFragment ()
254
261
255
262
def parseError (self , errorcode = "XXX-undefined-error" , datavars = {}):
@@ -707,7 +714,8 @@ def __init__(self, parser, tree):
707
714
self .startTagHandler = utils .MethodDispatcher ([
708
715
("html" , self .startTagHtml ),
709
716
("title" , self .startTagTitle ),
710
- (("noscript" , "noframes" , "style" ), self .startTagNoScriptNoFramesStyle ),
717
+ (("noframes" , "style" ), self .startTagNoFramesStyle ),
718
+ ("noscript" , self .startTagNoscript ),
711
719
("script" , self .startTagScript ),
712
720
(("base" , "basefont" , "bgsound" , "command" , "link" ),
713
721
self .startTagBaseLinkCommand ),
@@ -716,7 +724,7 @@ def __init__(self, parser, tree):
716
724
])
717
725
self .startTagHandler .default = self .startTagOther
718
726
719
- self . endTagHandler = utils .MethodDispatcher ([
727
+ self .endTagHandler = utils .MethodDispatcher ([
720
728
("head" , self .endTagHead ),
721
729
(("br" , "html" , "body" ), self .endTagHtmlBodyBr )
722
730
])
@@ -766,10 +774,17 @@ def startTagMeta(self, token):
766
774
def startTagTitle(self, token):
    """Handle a "title" start tag.

    Hands the token to the parser's parseRCDataRawtext with the "RCDATA"
    content type.
    """
    self.parser.parseRCDataRawtext(token, "RCDATA")
768
776
769
def startTagNoFramesStyle(self, token):
    """Handle "noframes" and "style" start tags.

    Hands the token to the parser's parseRCDataRawtext with the "RAWTEXT"
    content type.
    """
    # "noscript" is dispatched to its own handler (startTagNoscript), so
    # this handler does not depend on the parser's scripting flag.
    self.parser.parseRCDataRawtext(token, "RAWTEXT")
772
780
781
def startTagNoscript(self, token):
    """Handle a "noscript" start tag.

    With the parser's scripting flag set, the element is handed to
    parseRCDataRawtext as "RAWTEXT". Otherwise the element is inserted
    into the tree and the parser switches to the "inHeadNoscript" phase.
    """
    if self.parser.scripting:
        self.parser.parseRCDataRawtext(token, "RAWTEXT")
        return
    # Scripting disabled: the element's content is parsed by the dedicated
    # phase instead of being consumed as raw text.
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inHeadNoscript"]
787
+
773
788
def startTagScript (self , token ):
774
789
self .tree .insertElement (token )
775
790
self .parser .tokenizer .state = self .parser .tokenizer .scriptDataState
@@ -795,10 +810,51 @@ def endTagOther(self, token):
795
810
def anythingElse(self):
    """Invoke this phase's endTagHead with an implied "head" tag token."""
    self.endTagHead(impliedTagToken("head"))
797
812
798
- # XXX If we implement a parser for which scripting is disabled we need to
799
- # implement this phase.
800
- #
801
- # class InHeadNoScriptPhase(Phase):
813
class InHeadNoscriptPhase(Phase):
    """Phase used for the content of a "noscript" element found in the head
    while the parser's scripting flag is off.

    The parser enters this phase from the "inHead" phase and returns to it
    when the noscript element is closed.
    """
    # NOTE(review): this class only defines start/end tag handling. Character,
    # comment and EOF tokens fall through to whatever Phase provides; confirm
    # that this matches the intended "in head noscript" rules.

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("basefont", "bgsound", "link", "meta", "noframes", "style"),
             self.startTagBaseLinkCommand),
            (("head", "noscript"), self.startTagHeadNoscript),
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("noscript", self.endTagNoscript),
            ("br", self.endTagBr),
        ])
        self.endTagHandler.default = self.endTagOther

    def startTagHtml(self, token):
        """Process an "html" start tag with the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Process the token with the "inHead" phase.

        The token goes through the inHead dispatcher rather than straight to
        inHead's startTagBaseLinkCommand, because "meta", "noframes" and
        "style" have their own handlers in that phase.
        """
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagHeadNoscript(self, token):
        """Report a nested "head" or "noscript" start tag as a parse error
        and otherwise ignore it."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Any other start tag: treated as unexpected content."""
        return self.anythingElse(token)

    def endTagNoscript(self, token):
        """Pop the noscript element and switch back to the "inHead" phase."""
        node = self.parser.tree.openElements.pop()
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """A "br" end tag: treated as unexpected content."""
        return self.anythingElse(token)

    def endTagOther(self, token):
        """Report any other end tag as a parse error and otherwise ignore
        it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self, token):
        """Report the token as a parse error, close the noscript element via
        an implied end tag, and return the token.

        The token is returned unchanged; presumably the caller reprocesses it
        in the phase that is now current -- confirm against the main loop.
        """
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.endTagNoscript(impliedTagToken("noscript"))
        return token
857
+
802
858
class AfterHeadPhase (Phase ):
803
859
def __init__ (self , parser , tree ):
804
860
Phase .__init__ (self , parser , tree )
@@ -909,7 +965,8 @@ def __init__(self, parser, tree):
909
965
("isindex" , self .startTagIsIndex ),
910
966
("textarea" , self .startTagTextarea ),
911
967
("iframe" , self .startTagIFrame ),
912
- (("noembed" , "noframes" , "noscript" ), self .startTagRawtext ),
968
+ ("noscript" , self .startTagNoscript ),
969
+ (("noembed" , "noframes" ), self .startTagRawtext ),
913
970
("select" , self .startTagSelect ),
914
971
(("rp" , "rt" ), self .startTagRpRt ),
915
972
(("option" , "optgroup" ), self .startTagOpt ),
@@ -1230,6 +1287,12 @@ def startTagIFrame(self, token):
1230
1287
self .parser .framesetOK = False
1231
1288
self .startTagRawtext (token )
1232
1289
1290
def startTagNoscript(self, token):
    """Handle a "noscript" start tag.

    With the parser's scripting flag set, the token is handled by
    startTagRawtext; otherwise it is handled by startTagOther like any
    other element.
    """
    if self.parser.scripting:
        self.startTagRawtext(token)
        return
    self.startTagOther(token)
1295
+
1233
1296
def startTagRawtext (self , token ):
1234
1297
"""iframe, noembed noframes, noscript(if scripting enabled)"""
1235
1298
self .parser .parseRCDataRawtext (token , "RAWTEXT" )
@@ -2686,7 +2749,7 @@ def processEndTag(self, token):
2686
2749
"beforeHtml" : BeforeHtmlPhase ,
2687
2750
"beforeHead" : BeforeHeadPhase ,
2688
2751
"inHead" : InHeadPhase ,
2689
- # XXX "inHeadNoscript": InHeadNoScriptPhase ,
2752
+ "inHeadNoscript" : InHeadNoscriptPhase ,
2690
2753
"afterHead" : AfterHeadPhase ,
2691
2754
"inBody" : InBodyPhase ,
2692
2755
"text" : TextPhase ,
0 commit comments