From 7ee9026f48587ddeec965ec81855447b0403fa67 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Wed, 27 Jul 2016 12:15:58 +0700
Subject: [PATCH 1/3] Unbreak utils/spider.py

spider.py used both Python 2-only (md5) and Python 3-only (urllib)
imports.

Also, it didn't use a namespace when searching for links to spider, and
did not read robots.txt, preventing any spidering from occurring.

Fix an exception occurring when robots.txt processing removed items
from the toVisit list while iterating over it.

Add more output on stderr, and a main() which spiders yahoo.com.
---
 utils/spider.py | 94 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index 3a325888..d7e5a9f0 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -1,22 +1,28 @@
 #!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
+"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.
 
 usage:
 import spider
 s = spider.Spider()
 s.spider("http://www.google.com", maxURLs=100)
 """
+from __future__ import absolute_import, division, unicode_literals, print_function
 
-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
+import sys
 
-import httplib2
+try:
+    import urllib.parse as urllib_parse
+except ImportError:
+    import urlparse as urllib_parse
+try:
+    import urllib.robotparser as robotparser
+except ImportError:
+    import robotparser
+
+from hashlib import md5
 
+import httplib2
 import html5lib
-from html5lib.treebuilders import etree
 
 
 class Spider(object):
@@ -25,7 +31,7 @@ def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
         self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.robotParser = robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
 
@@ -40,31 +46,39 @@ def run(self, initialURL, maxURLs=1000):
             if not self.unvisitedURLs:
                 break
             content = self.loadURL(self.unvisitedURLs.pop())
+        return urlNumber
 
     def parse(self, content):
         failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
+        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
         try:
             tree = p.parse(content)
-        except:
+        except Exception as e:
             self.buggyURLs.add(self.currentURL)
             failed = True
-            print("BUGGY:", self.currentURL)
+            print("BUGGY: {0}: {1}".format(self.currentURL, e), file=sys.stderr)
         self.visitedURLs.add(self.currentURL)
         if not failed:
             self.updateURLs(tree)
 
     def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
+        print('Processing {0}'.format(url), file=sys.stderr)
+        try:
+            resp, content = self.http.request(url, "GET")
+        except Exception as e:
+            print("Failed to fetch {0}: {1}".format(url, e), file=sys.stderr)
+            return None
+
         self.currentURL = url
-        digest = md5.md5(content).hexdigest()
+        digest = md5(content).hexdigest()
         if digest in self.contentDigest:
             content = None
             self.visitedURLs.add(url)
         else:
             self.contentDigest[digest] = url
 
-        if resp['status'] != "200":
+        if resp['status'] not in ('200', '304'):
+            print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
             content = None
 
         return content
@@ -75,9 +89,11 @@ def updateURLs(self, tree):
         have seen them before or not"""
         urls = set()
         # Remove all links we have already visited
-        for link in tree.findall(".//a"):
+        namespace = tree.tag[1:].split('}')[0]
+        links = list(tree.findall('.//{%s}a' % namespace))
+        for link in links:
             try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                url = urllib_parse.urldefrag(link.attrib['href'])[0]
                 if (url and url not in self.unvisitedURLs and url
                         not in self.visitedURLs):
                     urls.add(url)
@@ -88,38 +104,62 @@
         # missing
         newUrls = set()
         for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
+            splitURL = list(urllib_parse.urlsplit(url))
             if splitURL[0] != "http":
                 continue
             if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
+                splitURL[1] = urllib_parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
+            print('Checking {0}'.format(url), file=sys.stderr)
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
+            except Exception as e:
+                print('Error fetching HEAD of {0}: {1}'.format(url, e), file=sys.stderr)
 
         # Remove links not of content-type html or pages not found
         # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                       "html" in responseHeaders[url]['content-type'] and
+                       'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
         # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
+        for url in list(toVisit):
+            robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
+            robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
+            try:
+                self.robotParser.read()
+            except Exception as e:
+                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                toVisit.remove(url)
+                continue
+
             if not self.robotParser.can_fetch("*", url):
+                print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
                 toVisit.remove(url)
 
         self.visitedURLs.update(urls)
         self.unvisitedURLs.update(toVisit)
+
+
+def main():
+    max_urls = 100
+    s = Spider()
+    count = s.run("http://yahoo.com/", maxURLs=max_urls)
+    if s.buggyURLs:
+        print('Buggy URLs:')
+        print(' ' + '\n '.join(s.buggyURLs))
+        print('')
+    if count != max_urls:
+        print('{0} of {1} processed'.format(count, max_urls))
+    sys.exit(count == max_urls and len(s.buggyURLs) == 0)
+
+if __name__ == '__main__':
+    main()
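
The namespace handling added in PATCH 1/3 can be exercised on its own. Below is a
minimal sketch, assuming only that html5lib is installed; the sample document and
URLs are illustrative and not taken from spider.py. It shows why the old ".//a"
search found nothing: the etree treebuilder produces namespaced tag names by default.

    import html5lib

    doc = '<p><a href="/one">one</a> <a href="/two">two</a></p>'
    tree = html5lib.parse(doc)  # etree-based tree, namespaced tags by default

    print(tree.findall('.//a'))  # [] because tags look like '{http://www.w3.org/1999/xhtml}a'

    # Same namespace extraction as the patched updateURLs()
    namespace = tree.tag[1:].split('}')[0]
    links = tree.findall('.//{%s}a' % namespace)
    print([link.attrib['href'] for link in links])  # ['/one', '/two']
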
From 0116f9997614bb415c7e31bab49957e3059929f3 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Thu, 28 Jul 2016 06:13:07 +0700
Subject: [PATCH 2/3] Check robots.txt before many HEAD requests

---
 utils/spider.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index d7e5a9f0..ea905681 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -112,6 +112,13 @@ def updateURLs(self, tree):
             newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
+        toVisit = self.check_robots(urls)
+        toVisit = self.check_headers(toVisit)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
+
+    def check_headers(self, urls):
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
@@ -128,8 +135,13 @@ def updateURLs(self, tree):
                        'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
+        return toVisit
+
+    def check_robots(self, urls):
         # Now check we are allowed to spider the page
-        for url in toVisit:
+        toVisit = list(urls)
+
+        for url in toVisit:
             robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
             robotURL = urllib_parse.urlunsplit(robotURL)
@@ -138,15 +150,14 @@ def updateURLs(self, tree):
                 self.robotParser.read()
             except Exception as e:
                 print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
                 continue
 
             if not self.robotParser.can_fetch("*", url):
                 print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
 
-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
+        return urls
 
 
 def main():
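
PATCH 1/3 and PATCH 2/3 both rely on iterating over a copy while removing from the
original collection ("for url in list(toVisit)" there, "toVisit = list(urls)" here),
which is what avoids the iteration exception mentioned in the first commit message.
A generic sketch of the idiom, not taken from spider.py:

    # Removing from a set while iterating over it raises RuntimeError,
    # so iterate over a snapshot and mutate the original.
    def keep_allowed(urls, allowed):
        for url in list(urls):
            if not allowed(url):
                urls.remove(url)
        return urls

    urls = {"http://a.example/", "http://b.example/", "http://c.example/"}
    print(keep_allowed(urls, lambda u: "b.example" not in u))
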
From 5ee2066b49de79c06c9b3bbf1ae4deaa3e2d6a67 Mon Sep 17 00:00:00 2001
From: John Vandenberg
Date: Thu, 28 Jul 2016 06:53:07 +0700
Subject: [PATCH 3/3] Safer fetching

---
 utils/spider.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/utils/spider.py b/utils/spider.py
index ea905681..4c9fd238 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -147,9 +147,25 @@ def check_robots(self, urls):
             robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
             try:
-                self.robotParser.read()
+                resp, content = self.http.request(robotURL, "GET")
             except Exception as e:
-                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                print("Failed to fetch {0}: {1}".format(robotURL, e), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            if resp['status'] == '404':
+                # no robots.txt to check
+                continue
+
+            if resp['status'] not in ('200', '304'):
+                print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            try:
+                self.robotParser.parse(content.decode('utf8'))
+            except Exception as e:
+                print('Failed to parse {0}: {1}'.format(robotURL, e), file=sys.stderr)
                 urls.remove(url)
                 continue
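
A note on the robots.txt handling that PATCH 3/3 switches to: the standard-library
parser expects an iterable of lines (its own read() splits the fetched body into
lines before parsing), so pre-fetched content is normally split before being handed
over. A self-contained sketch using an inline robots.txt instead of a fetched one;
example.com and the rules are illustrative only:

    try:
        import urllib.robotparser as robotparser  # Python 3
    except ImportError:
        import robotparser                        # Python 2

    robots_txt = "User-agent: *\nDisallow: /private/\n"

    parser = robotparser.RobotFileParser()
    parser.set_url("http://example.com/robots.txt")
    parser.parse(robots_txt.splitlines())

    print(parser.can_fetch("*", "http://example.com/private/page"))  # False
    print(parser.can_fetch("*", "http://example.com/public/page"))   # True
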