From 2094f0bcad9c0bdc9143a42c44cabcb2befde091 Mon Sep 17 00:00:00 2001
From: Jon Dufresne
Date: Sat, 28 Oct 2017 16:07:07 -0700
Subject: [PATCH] Remove utils/spider.py

Fixes #349
---
 utils/spider.py | 125 ------------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 utils/spider.py

diff --git a/utils/spider.py b/utils/spider.py
deleted file mode 100644
index 3a325888..00000000
--- a/utils/spider.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
-
-usage:
-import spider
-s = spider.Spider()
-s.spider("http://www.google.com", maxURLs=100)
-"""
-
-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
-
-import httplib2
-
-import html5lib
-from html5lib.treebuilders import etree
-
-
-class Spider(object):
-
-    def __init__(self):
-        self.unvisitedURLs = set()
-        self.visitedURLs = set()
-        self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
-        self.contentDigest = {}
-        self.http = httplib2.Http(".cache")
-
-    def run(self, initialURL, maxURLs=1000):
-        urlNumber = 0
-        self.visitedURLs.add(initialURL)
-        content = self.loadURL(initialURL)
-        while maxURLs is None or urlNumber < maxURLs:
-            if content is not None:
-                self.parse(content)
-                urlNumber += 1
-            if not self.unvisitedURLs:
-                break
-            content = self.loadURL(self.unvisitedURLs.pop())
-
-    def parse(self, content):
-        failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
-        try:
-            tree = p.parse(content)
-        except:
-            self.buggyURLs.add(self.currentURL)
-            failed = True
-            print("BUGGY:", self.currentURL)
-        self.visitedURLs.add(self.currentURL)
-        if not failed:
-            self.updateURLs(tree)
-
-    def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
-        self.currentURL = url
-        digest = md5.md5(content).hexdigest()
-        if digest in self.contentDigest:
-            content = None
-            self.visitedURLs.add(url)
-        else:
-            self.contentDigest[digest] = url
-
-        if resp['status'] != "200":
-            content = None
-
-        return content
-
-    def updateURLs(self, tree):
-        """Take all the links in the current document, extract the URLs and
-        update the list of visited and unvisited URLs according to whether we
-        have seen them before or not"""
-        urls = set()
-        # Remove all links we have already visited
-        for link in tree.findall(".//a"):
-            try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                if (url and url not in self.unvisitedURLs and url
-                        not in self.visitedURLs):
-                    urls.add(url)
-            except KeyError:
-                pass
-
-        # Remove all non-http URLs and add a suitable base URL where that is
-        # missing
-        newUrls = set()
-        for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
-            if splitURL[0] != "http":
-                continue
-            if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
-        urls = newUrls
-
-        responseHeaders = {}
-        # Now we want to find the content types of the links we haven't visited
-        for url in urls:
-            try:
-                resp, content = self.http.request(url, "HEAD")
-                responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
-
-        # Remove links not of content-type html or pages not found
-        # XXX - need to deal with other status codes?
-        toVisit = set([url for url in urls if url in responseHeaders and
-                       "html" in responseHeaders[url]['content-type'] and
-                       responseHeaders[url]['status'] == "200"])
-
-        # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
-            robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
-            self.robotParser.set_url(robotURL)
-            if not self.robotParser.can_fetch("*", url):
-                toVisit.remove(url)
-
-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
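
The deleted script had bitrotted well past repair: it imports the Python 2 only md5 module alongside Python 3 urllib submodules, builds the parser as html5lib.HTMLParser(tree=etree.TreeBuilder), which predates the current html5lib.treebuilders.getTreeBuilder() API, and its docstring advertises a spider() method while the class only defines run(). For anyone who still wants to throw live pages at the parser, a minimal sketch of an equivalent crawler using only the standard library and html5lib's top-level parse() could look like the following; the crawl() helper and its parameters are hypothetical and not part of this patch or of html5lib.

#!/usr/bin/env python3
# Hypothetical replacement sketch; not shipped with html5lib.
import hashlib
import urllib.parse
import urllib.request

import html5lib

XHTML_NS = "{http://www.w3.org/1999/xhtml}"


def crawl(start_url, max_urls=100):
    """Fetch pages breadth-first and collect URLs whose markup makes
    html5lib raise, mirroring what the removed Spider.run() did."""
    queue = [start_url]
    visited, digests, buggy = set(), set(), set()
    while queue and len(visited) < max_urls:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                if "html" not in resp.headers.get("Content-Type", ""):
                    continue
                body = resp.read()
        except (OSError, ValueError):  # network errors, malformed URLs
            continue
        # Skip bodies we have already seen, as the old md5-based
        # contentDigest check did.
        digest = hashlib.md5(body).hexdigest()
        if digest in digests:
            continue
        digests.add(digest)
        try:
            tree = html5lib.parse(body)
        except Exception:
            buggy.add(url)  # candidate parser bug
            continue
        # html5lib namespaces elements by default, hence the prefix.
        for a in tree.iter(XHTML_NS + "a"):
            href = a.get("href")
            if href:
                absolute = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, href))[0]
                if absolute.startswith("http"):
                    queue.append(absolute)
    return buggy

Calling crawl("http://www.google.com", max_urls=100) mirrors the usage shown in the old docstring and returns the set of URLs whose markup made the parser raise.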
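The robots.txt pass in the removed updateURLs() was also broken on its own terms: it never calls read() on the RobotFileParser, and on Python 3 can_fetch() answers False for every URL until robots.txt has actually been read, so nothing would ever be spidered; it also calls toVisit.remove(url) while iterating over that same set, which raises RuntimeError. A corrected version of just that check, again only a sketch with an invented allowed() helper:

import urllib.parse
import urllib.robotparser


def allowed(url, agent="*"):
    """Return True if agent may fetch url per the site's robots.txt."""
    scheme, netloc = urllib.parse.urlsplit(url)[:2]
    robots_url = urllib.parse.urlunsplit(
        (scheme, netloc, "/robots.txt", "", ""))
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # the step the old code skipped
    return parser.can_fetch(agent, url)

Filtering with a comprehension, toVisit = {url for url in toVisit if allowed(url)}, then replaces the unsafe in-place remove().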