From 7ee9026f48587ddeec965ec81855447b0403fa67 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Wed, 27 Jul 2016 12:15:58 +0700
Subject: [PATCH 1/3] Unbreak utils/spider.py

spider.py used both Python 2-only (md5) and Python 3-only (urllib)
imports.

Also, it didn't use a namespace when searching for links to spider, and
did not read robots.txt, preventing any spidering from occurring.

Fix an exception occurring when robots.txt processing removed items
from the toVisit list while iterating over it.

Add more output on stderr, and a main() which spiders yahoo.com.
---
 utils/spider.py | 94 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index 3a325888..d7e5a9f0 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -1,22 +1,28 @@
 #!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
+"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.
 
 usage:
 import spider
 s = spider.Spider()
 s.spider("http://www.google.com", maxURLs=100)
 """
+from __future__ import absolute_import, division, unicode_literals, print_function
 
-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
+import sys
 
-import httplib2
+try:
+    import urllib.parse as urllib_parse
+except ImportError:
+    import urlparse as urllib_parse
+try:
+    import urllib.robotparser as robotparser
+except ImportError:
+    import robotparser
+
+from hashlib import md5
 
+import httplib2
 import html5lib
-from html5lib.treebuilders import etree
 
 
 class Spider(object):
@@ -25,7 +31,7 @@ def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
         self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.robotParser = robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
 
@@ -40,31 +46,39 @@ def run(self, initialURL, maxURLs=1000):
             if not self.unvisitedURLs:
                 break
             content = self.loadURL(self.unvisitedURLs.pop())
+        return urlNumber
 
     def parse(self, content):
         failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
+        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
         try:
             tree = p.parse(content)
-        except:
+        except Exception as e:
             self.buggyURLs.add(self.currentURL)
             failed = True
-            print("BUGGY:", self.currentURL)
+            print("BUGGY: {0}: {1}".format(self.currentURL, e), file=sys.stderr)
         self.visitedURLs.add(self.currentURL)
         if not failed:
             self.updateURLs(tree)
 
     def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
+        print('Processing {0}'.format(url), file=sys.stderr)
+        try:
+            resp, content = self.http.request(url, "GET")
+        except Exception as e:
+            print("Failed to fetch {0}: {1}".format(url, e), file=sys.stderr)
+            return None
+
         self.currentURL = url
-        digest = md5.md5(content).hexdigest()
+        digest = md5(content).hexdigest()
         if digest in self.contentDigest:
             content = None
             self.visitedURLs.add(url)
         else:
             self.contentDigest[digest] = url
 
-        if resp['status'] != "200":
+        if resp['status'] not in ('200', '304'):
+            print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
             content = None
 
         return content
@@ -75,9 +89,11 @@ def updateURLs(self, tree):
         have seen them before or not"""
         urls = set()
         # Remove all links we have already visited
-        for link in tree.findall(".//a"):
+        namespace = tree.tag[1:].split('}')[0]
+        links = list(tree.findall('.//{%s}a' % namespace))
+        for link in links:
             try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                url = urllib_parse.urldefrag(link.attrib['href'])[0]
                 if (url and url not in self.unvisitedURLs and url
                         not in self.visitedURLs):
                     urls.add(url)
@@ -88,38 +104,62 @@
         # missing
         newUrls = set()
         for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
+            splitURL = list(urllib_parse.urlsplit(url))
             if splitURL[0] != "http":
                 continue
             if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
+                splitURL[1] = urllib_parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
+            print('Checking {0}'.format(url), file=sys.stderr)
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
+            except Exception as e:
+                print('Error fetching HEAD of {0}: {1}'.format(url, e), file=sys.stderr)
 
         # Remove links not of content-type html or pages not found
         # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                       "html" in responseHeaders[url]['content-type'] and
+                       'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
         # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
+        for url in list(toVisit):
+            robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
+            robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
+            try:
+                self.robotParser.read()
+            except Exception as e:
+                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                toVisit.remove(url)
+                continue
+
             if not self.robotParser.can_fetch("*", url):
+                print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
                 toVisit.remove(url)
 
         self.visitedURLs.update(urls)
         self.unvisitedURLs.update(toVisit)
+
+
+def main():
+    max_urls = 100
+    s = Spider()
+    count = s.run("http://yahoo.com/", maxURLs=max_urls)
+    if s.buggyURLs:
+        print('Buggy URLs:')
+        print(' ' + '\n '.join(s.buggyURLs))
+        print('')
+    if count != max_urls:
+        print('{0} of {1} processed'.format(count, max_urls))
+    sys.exit(count == max_urls and len(s.buggyURLs) == 0)
+
+if __name__ == '__main__':
+    main()
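
The namespace handling added in PATCH 1/3 can be exercised on its own. Below is a
minimal sketch, assuming only that html5lib is installed; the sample document and
URLs are illustrative and not taken from spider.py. It shows why the old ".//a"
search found nothing: the etree treebuilder produces namespaced tag names by default.

    import html5lib

    doc = '<p><a href="/one">one</a> <a href="/two">two</a></p>'
    tree = html5lib.parse(doc)  # etree-based tree, namespaced tags by default

    print(tree.findall('.//a'))  # [] because tags look like '{http://www.w3.org/1999/xhtml}a'

    # Same namespace extraction as the patched updateURLs()
    namespace = tree.tag[1:].split('}')[0]
    links = tree.findall('.//{%s}a' % namespace)
    print([link.attrib['href'] for link in links])  # ['/one', '/two']
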
From 0116f9997614bb415c7e31bab49957e3059929f3 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Thu, 28 Jul 2016 06:13:07 +0700
Subject: [PATCH 2/3] Check robots.txt before many HEAD requests

---
 utils/spider.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index d7e5a9f0..ea905681 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -112,6 +112,13 @@ def updateURLs(self, tree):
             newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
+        toVisit = self.check_robots(urls)
+        toVisit = self.check_headers(toVisit)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
+
+    def check_headers(self, urls):
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
@@ -128,8 +135,13 @@ def updateURLs(self, tree):
                        'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
+        return toVisit
+
+    def check_robots(self, urls):
         # Now check we are allowed to spider the page
-        for url in toVisit:
+        toVisit = list(urls)
+
+        for url in toVisit:
             robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
             robotURL = urllib_parse.urlunsplit(robotURL)
@@ -138,15 +150,14 @@ def updateURLs(self, tree):
                 self.robotParser.read()
             except Exception as e:
                 print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
                 continue
 
             if not self.robotParser.can_fetch("*", url):
                 print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
 
-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
+        return urls
 
 
 def main():
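
PATCH 1/3 and PATCH 2/3 both rely on iterating over a copy while removing from the
original collection ("for url in list(toVisit)" there, "toVisit = list(urls)" here),
which is what avoids the iteration exception mentioned in the first commit message.
A generic sketch of the idiom, not taken from spider.py:

    # Removing from a set while iterating over it raises RuntimeError,
    # so iterate over a snapshot and mutate the original.
    def keep_allowed(urls, allowed):
        for url in list(urls):
            if not allowed(url):
                urls.remove(url)
        return urls

    urls = {"http://a.example/", "http://b.example/", "http://c.example/"}
    print(keep_allowed(urls, lambda u: "b.example" not in u))
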
From 5ee2066b49de79c06c9b3bbf1ae4deaa3e2d6a67 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Thu, 28 Jul 2016 06:53:07 +0700
Subject: [PATCH 3/3] Safer fetching

---
 utils/spider.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index ea905681..4c9fd238 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -147,9 +147,25 @@ def check_robots(self, urls):
             robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
             try:
-                self.robotParser.read()
+                resp, content = self.http.request(robotURL, "GET")
             except Exception as e:
-                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                print("Failed to fetch {0}: {1}".format(robotURL, e), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            if resp['status'] == '404':
+                # no robots.txt to check
+                continue
+
+            if resp['status'] not in ('200', '304'):
+                print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            try:
+                self.robotParser.parse(content.decode('utf8'))
+            except Exception as e:
+                print('Failed to parse {0}: {1}'.format(robotURL, e), file=sys.stderr)
                 urls.remove(url)
                 continue
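
A note on the robots.txt handling that PATCH 3/3 switches to: the standard-library
parser expects an iterable of lines (its own read() splits the fetched body into
lines before parsing), so pre-fetched content is normally split before being handed
over. A self-contained sketch using an inline robots.txt instead of a fetched one;
example.com and the rules are illustrative only:

    try:
        import urllib.robotparser as robotparser  # Python 3
    except ImportError:
        import robotparser                        # Python 2

    robots_txt = "User-agent: *\nDisallow: /private/\n"

    parser = robotparser.RobotFileParser()
    parser.set_url("http://example.com/robots.txt")
    parser.parse(robots_txt.splitlines())

    print(parser.can_fetch("*", "http://example.com/private/page"))  # False
    print(parser.can_fetch("*", "http://example.com/public/page"))   # True
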