From 2094f0bcad9c0bdc9143a42c44cabcb2befde091 Mon Sep 17 00:00:00 2001
From: Jon Dufresne
Date: Sat, 28 Oct 2017 16:07:07 -0700
Subject: [PATCH] Remove utils/spider.py

Fixes #349
---
 utils/spider.py | 125 ------------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 utils/spider.py

diff --git a/utils/spider.py b/utils/spider.py
deleted file mode 100644
index 3a325888..00000000
--- a/utils/spider.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
-
-usage:
-import spider
-s = spider.Spider()
-s.spider("http://www.google.com", maxURLs=100)
-"""
-
-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
-
-import httplib2
-
-import html5lib
-from html5lib.treebuilders import etree
-
-
-class Spider(object):
-
-    def __init__(self):
-        self.unvisitedURLs = set()
-        self.visitedURLs = set()
-        self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
-        self.contentDigest = {}
-        self.http = httplib2.Http(".cache")
-
-    def run(self, initialURL, maxURLs=1000):
-        urlNumber = 0
-        self.visitedURLs.add(initialURL)
-        content = self.loadURL(initialURL)
-        while maxURLs is None or urlNumber < maxURLs:
-            if content is not None:
-                self.parse(content)
-                urlNumber += 1
-            if not self.unvisitedURLs:
-                break
-            content = self.loadURL(self.unvisitedURLs.pop())
-
-    def parse(self, content):
-        failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
-        try:
-            tree = p.parse(content)
-        except:
-            self.buggyURLs.add(self.currentURL)
-            failed = True
-            print("BUGGY:", self.currentURL)
-        self.visitedURLs.add(self.currentURL)
-        if not failed:
-            self.updateURLs(tree)
-
-    def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
-        self.currentURL = url
-        digest = md5.md5(content).hexdigest()
-        if digest in self.contentDigest:
-            content = None
-            self.visitedURLs.add(url)
-        else:
-            self.contentDigest[digest] = url
-
-        if resp['status'] != "200":
-            content = None
-
-        return content
-
-    def updateURLs(self, tree):
-        """Take all the links in the current document, extract the URLs and
-        update the list of visited and unvisited URLs according to whether we
-        have seen them before or not"""
-        urls = set()
-        # Remove all links we have already visited
-        for link in tree.findall(".//a"):
-            try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                if (url and url not in self.unvisitedURLs and url
-                        not in self.visitedURLs):
-                    urls.add(url)
-            except KeyError:
-                pass
-
-        # Remove all non-http URLs and add a suitable base URL where that is
-        # missing
-        newUrls = set()
-        for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
-            if splitURL[0] != "http":
-                continue
-            if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
-        urls = newUrls
-
-        responseHeaders = {}
-        # Now we want to find the content types of the links we haven't visited
-        for url in urls:
-            try:
-                resp, content = self.http.request(url, "HEAD")
-                responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
-
-        # Remove links not of content-type html or pages not found
-        # XXX - need to deal with other status codes?
-        toVisit = set([url for url in urls if url in responseHeaders and
-                       "html" in responseHeaders[url]['content-type'] and
-                       responseHeaders[url]['status'] == "200"])
-
-        # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
-            robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
-            self.robotParser.set_url(robotURL)
-            if not self.robotParser.can_fetch("*", url):
-                toVisit.remove(url)
-
-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
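
The deleted script had bitrotted well past repair: it imports the Python 2 only md5 module alongside Python 3 urllib submodules, builds the parser as html5lib.HTMLParser(tree=etree.TreeBuilder), which predates the current html5lib.treebuilders.getTreeBuilder() API, and its docstring advertises a spider() method while the class only defines run(). For anyone who still wants to throw live pages at the parser, a minimal sketch of an equivalent crawler using only the standard library and html5lib's top-level parse() could look like the following; the crawl() helper and its parameters are hypothetical and not part of this patch or of html5lib.

#!/usr/bin/env python3
# Hypothetical replacement sketch; not shipped with html5lib.
import hashlib
import urllib.parse
import urllib.request

import html5lib

XHTML_NS = "{http://www.w3.org/1999/xhtml}"


def crawl(start_url, max_urls=100):
    """Fetch pages breadth-first and collect URLs whose markup makes
    html5lib raise, mirroring what the removed Spider.run() did."""
    queue = [start_url]
    visited, digests, buggy = set(), set(), set()
    while queue and len(visited) < max_urls:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                if "html" not in resp.headers.get("Content-Type", ""):
                    continue
                body = resp.read()
        except (OSError, ValueError):  # network errors, malformed URLs
            continue
        # Skip bodies we have already seen, as the old md5-based
        # contentDigest check did.
        digest = hashlib.md5(body).hexdigest()
        if digest in digests:
            continue
        digests.add(digest)
        try:
            tree = html5lib.parse(body)
        except Exception:
            buggy.add(url)  # candidate parser bug
            continue
        # html5lib namespaces elements by default, hence the prefix.
        for a in tree.iter(XHTML_NS + "a"):
            href = a.get("href")
            if href:
                absolute = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, href))[0]
                if absolute.startswith("http"):
                    queue.append(absolute)
    return buggy

Calling crawl("http://www.google.com", max_urls=100) mirrors the usage shown in the old docstring and returns the set of URLs whose markup made the parser raise.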
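The robots.txt pass in the removed updateURLs() was also broken on its own terms: it never calls read() on the RobotFileParser, and on Python 3 can_fetch() answers False for every URL until robots.txt has actually been read, so nothing would ever be spidered; it also calls toVisit.remove(url) while iterating over that same set, which raises RuntimeError. A corrected version of just that check, again only a sketch with an invented allowed() helper:

import urllib.parse
import urllib.robotparser


def allowed(url, agent="*"):
    """Return True if agent may fetch url per the site's robots.txt."""
    scheme, netloc = urllib.parse.urlsplit(url)[:2]
    robots_url = urllib.parse.urlunsplit(
        (scheme, netloc, "/robots.txt", "", ""))
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # the step the old code skipped
    return parser.can_fetch(agent, url)

Filtering with a comprehension, toVisit = {url for url in toVisit if allowed(url)}, then replaces the unsafe in-place remove().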