
Fix spider #294

Closed · wants to merge 3 commits
127 changes: 97 additions & 30 deletions utils/spider.py
@@ -1,22 +1,28 @@
#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.

usage:
import spider
s = spider.Spider()
s.spider("http://www.google.com", maxURLs=100)
"""
from __future__ import absolute_import, division, unicode_literals, print_function

import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import md5
import sys

import httplib2
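# Prefer the Python 3 module names and fall back to the Python 2 ones,
# so the spider stays importable on both interpreter versions.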
try:
import urllib.parse as urllib_parse
except ImportError:
import urlparse as urllib_parse
try:
import urllib.robotparser as robotparser
except ImportError:
import robotparser

from hashlib import md5

import httplib2
import html5lib
from html5lib.treebuilders import etree


class Spider(object):
@@ -25,7 +31,7 @@ def __init__(self):
self.unvisitedURLs = set()
self.visitedURLs = set()
self.buggyURLs = set()
self.robotParser = urllib.robotparser.RobotFileParser()
self.robotParser = robotparser.RobotFileParser()
self.contentDigest = {}
self.http = httplib2.Http(".cache")

@@ -40,31 +46,39 @@ def run(self, initialURL, maxURLs=1000):
if not self.unvisitedURLs:
break
content = self.loadURL(self.unvisitedURLs.pop())
return urlNumber

def parse(self, content):
failed = False
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
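# getTreeBuilder("etree") returns the ElementTree-based tree builder class
# that HTMLParser expects for its tree argument.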
p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
try:
tree = p.parse(content)
except:
except Exception as e:
self.buggyURLs.add(self.currentURL)
failed = True
print("BUGGY:", self.currentURL)
print("BUGGY: {0}: {1}".format(self.currentURL, e), file=sys.stderr)
self.visitedURLs.add(self.currentURL)
if not failed:
self.updateURLs(tree)

def loadURL(self, url):
resp, content = self.http.request(url, "GET")
print('Processing {0}'.format(url), file=sys.stderr)
try:
resp, content = self.http.request(url, "GET")
except Exception as e:
print("Failed to fetch {0}: {1}".format(url, e), file=sys.stderr)
return None

self.currentURL = url
digest = md5.md5(content).hexdigest()
digest = md5(content).hexdigest()
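# De-duplicate by body content: an MD5 digest of the fetched bytes
# identifies pages that have already been parsed.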
if digest in self.contentDigest:
content = None
self.visitedURLs.add(url)
else:
self.contentDigest[digest] = url

if resp['status'] != "200":
if resp['status'] not in ('200', '304'):
print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
content = None

return content
@@ -75,9 +89,11 @@ def updateURLs(self, tree):
have seen them before or not"""
urls = set()
# Remove all links we have already visited
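# html5lib's etree builder namespaces element tags
# (e.g. "{http://www.w3.org/1999/xhtml}a"), so <a> elements are matched
# using the namespace taken from the document root.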
for link in tree.findall(".//a"):
namespace = tree.tag[1:].split('}')[0]
links = list(tree.findall('.//{%s}a' % namespace))
for link in links:
try:
url = urllib.parse.urldefrag(link.attrib['href'])[0]
url = urllib_parse.urldefrag(link.attrib['href'])[0]
if (url and url not in self.unvisitedURLs and url
not in self.visitedURLs):
urls.add(url)
@@ -88,38 +104,89 @@ def updateURLs(self, tree):
# missing
newUrls = set()
for url in urls:
splitURL = list(urllib.parse.urlsplit(url))
splitURL = list(urllib_parse.urlsplit(url))
if splitURL[0] != "http":
continue
if splitURL[1] == "":
splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
newUrls.add(urllib.parse.urlunsplit(splitURL))
splitURL[1] = urllib_parse.urlsplit(self.currentURL)[1]
newUrls.add(urllib_parse.urlunsplit(splitURL))
urls = newUrls

toVisit = self.check_robots(urls)
toVisit = self.check_headers(toVisit)

self.visitedURLs.update(urls)
self.unvisitedURLs.update(toVisit)

def check_headers(self, urls):
responseHeaders = {}
# Now we want to find the content types of the links we haven't visited
for url in urls:
print('Checking {0}'.format(url), file=sys.stderr)
try:
resp, content = self.http.request(url, "HEAD")
responseHeaders[url] = resp
except AttributeError:
# Don't know why this happens
pass
except Exception as e:
print('Error fetching HEAD of {0}: {1}'.format(url, e), file=sys.stderr)

# Remove links not of content-type html or pages not found
# XXX - need to deal with other status codes?
toVisit = set([url for url in urls if url in responseHeaders and
"html" in responseHeaders[url]['content-type'] and
'html' in responseHeaders[url].get('content-type', '') and
responseHeaders[url]['status'] == "200"])

return toVisit

def check_robots(self, urls):
# Now check we are allowed to spider the page
toVisit = list(urls)

for url in toVisit:
robotURL = list(urllib.parse.urlsplit(url)[:2])
robotURL = list(urllib_parse.urlsplit(url)[:2])
robotURL.extend(["robots.txt", "", ""])
robotURL = urllib.parse.urlunsplit(robotURL)
robotURL = urllib_parse.urlunsplit(robotURL)
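# Fetch robots.txt through the shared httplib2 cache and hand it to
# RobotFileParser.parse(), rather than calling read(), which would
# refetch the file outside the cache.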
self.robotParser.set_url(robotURL)
if not self.robotParser.can_fetch("*", url):
toVisit.remove(url)
try:
resp, content = self.http.request(robotURL, "GET")
except Exception as e:
print("Failed to fetch {0}: {1}".format(robotURL, e), file=sys.stderr)
urls.remove(url)
continue

self.visitedURLs.update(urls)
self.unvisitedURLs.update(toVisit)
if resp['status'] == '404':
# no robots.txt to check
continue

if resp['status'] not in ('200', '304'):
print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
urls.remove(url)
continue

try:
self.robotParser.parse(content.decode('utf8').splitlines())
except Exception as e:
print('Failed to parse {0}: {1}'.format(robotURL, e), file=sys.stderr)
urls.remove(url)
continue

if not self.robotParser.can_fetch("*", url):
print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
urls.remove(url)

return urls


def main():
max_urls = 100
s = Spider()
count = s.run("http://yahoo.com/", maxURLs=max_urls)
if s.buggyURLs:
print('Buggy URLs:')
print(' ' + '\n '.join(s.buggyURLs))
print('')
if count != max_urls:
print('{0} of {1} processed'.format(count, max_urls))
sys.exit(0 if count == max_urls and not s.buggyURLs else 1)

if __name__ == '__main__':
main()
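
For reference, a minimal usage sketch of the reworked module (the start URL, the limit, and the plain "import spider" path are illustrative, mirroring the docstring above):

import spider

s = spider.Spider()
# run() returns the number of URLs processed
processed = s.run("http://example.com/", maxURLs=50)
print("{0} URLs processed, {1} buggy".format(processed, len(s.buggyURLs)))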