diff --git a/README.md b/README.md index 3abc458..5cc96e3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,6 @@ # ScrapyTutorial Scrapy Tutorial + +《Python3网络爬虫开发实战(第一版)》见 v1 分支。 + +《Python3网络爬虫开发实战(第二版)》见 master 分支。 \ No newline at end of file diff --git a/scrapy.cfg b/scrapy.cfg index ce8c9bb..912411d 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.org/en/latest/deploy.html [settings] -default = tutorial.settings +default = scrapytutorial.settings [deploy] #url = http://localhost:6800/ -project = tutorial +project = scrapytutorial diff --git a/tutorial/__init__.py b/scrapytutorial/__init__.py similarity index 100% rename from tutorial/__init__.py rename to scrapytutorial/__init__.py diff --git a/tutorial/items.py b/scrapytutorial/items.py similarity index 100% rename from tutorial/items.py rename to scrapytutorial/items.py diff --git a/tutorial/middlewares.py b/scrapytutorial/middlewares.py similarity index 100% rename from tutorial/middlewares.py rename to scrapytutorial/middlewares.py diff --git a/tutorial/pipelines.py b/scrapytutorial/pipelines.py similarity index 58% rename from tutorial/pipelines.py rename to scrapytutorial/pipelines.py index 59c3f88..04d9088 100644 --- a/tutorial/pipelines.py +++ b/scrapytutorial/pipelines.py @@ -15,25 +15,26 @@ def process_item(self, item, spider): return DropItem('Missing Text') -class MongoPipeline(object): - def __init__(self, mongo_uri, mongo_db): - self.mongo_uri = mongo_uri - self.mongo_db = mongo_db +class MongoDBPipeline(object): + def __init__(self, connection_string, database): + self.connection_string = connection_string + self.database = database @classmethod def from_crawler(cls, crawler): return cls( - mongo_uri=crawler.settings.get('MONGO_URI'), - mongo_db=crawler.settings.get('MONGO_DB') + connection_string=crawler.settings.get( + 'MONGODB_CONNECTION_STRING'), + database=crawler.settings.get('MONGODB_DATABASE') ) def open_spider(self, spider): - self.client = pymongo.MongoClient(self.mongo_uri) - self.db = self.client[self.mongo_db] + self.client = pymongo.MongoClient(self.connection_string) + self.db = self.client[self.database] def process_item(self, item, spider): name = item.__class__.__name__ - self.db[name].insert(dict(item)) + self.db[name].insert_one(dict(item)) return item def close_spider(self, spider): diff --git a/tutorial/quotes.csv b/scrapytutorial/quotes.csv similarity index 100% rename from tutorial/quotes.csv rename to scrapytutorial/quotes.csv diff --git a/tutorial/quotes.jl b/scrapytutorial/quotes.jl similarity index 100% rename from tutorial/quotes.jl rename to scrapytutorial/quotes.jl diff --git a/tutorial/quotes.json b/scrapytutorial/quotes.json similarity index 100% rename from tutorial/quotes.json rename to scrapytutorial/quotes.json diff --git a/tutorial/quotes.jsonlines b/scrapytutorial/quotes.jsonlines similarity index 100% rename from tutorial/quotes.jsonlines rename to scrapytutorial/quotes.jsonlines diff --git a/tutorial/quotes.marshal b/scrapytutorial/quotes.marshal similarity index 100% rename from tutorial/quotes.marshal rename to scrapytutorial/quotes.marshal diff --git a/tutorial/quotes.pickle b/scrapytutorial/quotes.pickle similarity index 100% rename from tutorial/quotes.pickle rename to scrapytutorial/quotes.pickle diff --git a/tutorial/quotes.xml b/scrapytutorial/quotes.xml similarity index 100% rename from tutorial/quotes.xml rename to scrapytutorial/quotes.xml diff --git a/tutorial/settings.py b/scrapytutorial/settings.py similarity index 65% rename from tutorial/settings.py rename to scrapytutorial/settings.py index 890271d..839f650 100644 --- a/tutorial/settings.py +++ b/scrapytutorial/settings.py @@ -9,87 +9,85 @@ # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -BOT_NAME = 'tutorial' - -SPIDER_MODULES = ['tutorial.spiders'] -NEWSPIDER_MODULE = 'tutorial.spiders' +BOT_NAME = 'scrapytutorial' +SPIDER_MODULES = ['scrapytutorial.spiders'] +NEWSPIDER_MODULE = 'scrapytutorial.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'tutorial (+http://www.yourdomain.com)' +# USER_AGENT = 'scrapytutorial (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'tutorial.middlewares.TutorialSpiderMiddleware': 543, -#} +# SPIDER_MIDDLEWARES = { +# 'scrapytutorial.middlewares.TutorialSpiderMiddleware': 543, +# } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, -#} +# DOWNLOADER_MIDDLEWARES = { +# 'scrapytutorial.middlewares.MyCustomDownloaderMiddleware': 543, +# } # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'tutorial.pipelines.TextPipeline': 300, - 'tutorial.pipelines.MongoPipeline': 400, + 'scrapytutorial.pipelines.TextPipeline': 300, + 'scrapytutorial.pipelines.MongoDBPipeline': 400, } - # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -MONGO_URI='localhost' -MONGO_DB='tutorial' \ No newline at end of file +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +MONGODB_CONNECTION_STRING = 'localhost' +MONGODB_DATABASE = 'scrapytutorial' diff --git a/tutorial/spiders/__init__.py b/scrapytutorial/spiders/__init__.py similarity index 100% rename from tutorial/spiders/__init__.py rename to scrapytutorial/spiders/__init__.py diff --git a/tutorial/spiders/quotes.py b/scrapytutorial/spiders/quotes.py similarity index 87% rename from tutorial/spiders/quotes.py rename to scrapytutorial/spiders/quotes.py index d5980ae..1b7fb73 100644 --- a/tutorial/spiders/quotes.py +++ b/scrapytutorial/spiders/quotes.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import scrapy -from tutorial.items import QuoteItem +from scrapytutorial.items import QuoteItem class QuotesSpider(scrapy.Spider): name = "quotes" allowed_domains = ["quotes.toscrape.com"] - start_urls = ['http://quotes.toscrape.com/'] + start_urls = ['https://quotes.toscrape.com/'] def parse(self, response): quotes = response.css('.quote')