From c6888be37634085a50cf29e67aea71a0e30e2e99 Mon Sep 17 00:00:00 2001 From: Germey Date: Sat, 29 Aug 2020 20:13:15 +0800 Subject: [PATCH 1/4] update v2 --- README.md | 3 +++ tutorial/pipelines.py | 26 +++++++++--------- tutorial/settings.py | 62 +++++++++++++++++++++---------------------- 3 files changed, 46 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 3abc458..208f3c0 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,5 @@ # ScrapyTutorial Scrapy Tutorial + +《Python3网络爬虫开发实战(第一版)》见 v1 分支。 +《Python3网络爬虫开发实战(第二版)》见 master 分支。 \ No newline at end of file diff --git a/tutorial/pipelines.py b/tutorial/pipelines.py index 59c3f88..47b5ffd 100644 --- a/tutorial/pipelines.py +++ b/tutorial/pipelines.py @@ -5,7 +5,7 @@ class TextPipeline(object): def __init__(self): self.limit = 50 - + def process_item(self, item, spider): if item['text']: if len(item['text']) > self.limit: @@ -15,26 +15,26 @@ def process_item(self, item, spider): return DropItem('Missing Text') -class MongoPipeline(object): - def __init__(self, mongo_uri, mongo_db): - self.mongo_uri = mongo_uri - self.mongo_db = mongo_db - +class MongoDBPipeline(object): + def __init__(self, connection_string, database): + self.connection_string = connection_string + self.database = database + @classmethod def from_crawler(cls, crawler): return cls( - mongo_uri=crawler.settings.get('MONGO_URI'), - mongo_db=crawler.settings.get('MONGO_DB') + connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'), + database=crawler.settings.get('MONGODB_DATABASE') ) - + def open_spider(self, spider): - self.client = pymongo.MongoClient(self.mongo_uri) - self.db = self.client[self.mongo_db] - + self.client = pymongo.MongoClient(self.connection_string) + self.db = self.client[self.database] + def process_item(self, item, spider): name = item.__class__.__name__ self.db[name].insert(dict(item)) return item - + def close_spider(self, spider): self.client.close() diff --git a/tutorial/settings.py 
b/tutorial/settings.py index 890271d..7f385c6 100644 --- a/tutorial/settings.py +++ b/tutorial/settings.py @@ -14,82 +14,80 @@ SPIDER_MODULES = ['tutorial.spiders'] NEWSPIDER_MODULE = 'tutorial.spiders' - # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'tutorial (+http://www.yourdomain.com)' +# USER_AGENT = 'tutorial (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See 
http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'tutorial.pipelines.TextPipeline': 300, - 'tutorial.pipelines.MongoPipeline': 400, + 'tutorial.pipelines.TextPipeline': 300, + 'tutorial.pipelines.MongoDBPipeline': 400, } - # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -MONGO_URI='localhost' -MONGO_DB='tutorial' \ No newline at end of file +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +MONGODB_CONNECTION_STRING = 'localhost' +MONGODB_DATABASE = 'tutorial' From e8342df1d479cf0195db64fb5fa984a4940427e1 Mon Sep 17 00:00:00 2001 From: Germey Date: Sat, 29 Aug 2020 20:15:35 
+0800 Subject: [PATCH 2/4] update --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 208f3c0..5cc96e3 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,5 @@ Scrapy Tutorial 《Python3网络爬虫开发实战(第一版)》见 v1 分支。 + 《Python3网络爬虫开发实战(第二版)》见 master 分支。 \ No newline at end of file From bd54f463bd5bd1a3241238b62ae32a67aedcf89b Mon Sep 17 00:00:00 2001 From: Germey Date: Sun, 25 Jul 2021 19:54:34 +0800 Subject: [PATCH 3/4] to scrapytutorial --- scrapy.cfg | 4 ++-- {tutorial => scrapytutorial}/__init__.py | 0 {tutorial => scrapytutorial}/items.py | 0 {tutorial => scrapytutorial}/middlewares.py | 0 {tutorial => scrapytutorial}/pipelines.py | 13 +++++++------ {tutorial => scrapytutorial}/quotes.csv | 0 {tutorial => scrapytutorial}/quotes.jl | 0 {tutorial => scrapytutorial}/quotes.json | 0 {tutorial => scrapytutorial}/quotes.jsonlines | 0 {tutorial => scrapytutorial}/quotes.marshal | Bin {tutorial => scrapytutorial}/quotes.pickle | Bin {tutorial => scrapytutorial}/quotes.xml | 0 {tutorial => scrapytutorial}/settings.py | 18 +++++++++--------- .../spiders/__init__.py | 0 .../spiders/quotes.py | 4 ++-- 15 files changed, 20 insertions(+), 19 deletions(-) rename {tutorial => scrapytutorial}/__init__.py (100%) rename {tutorial => scrapytutorial}/items.py (100%) rename {tutorial => scrapytutorial}/middlewares.py (100%) rename {tutorial => scrapytutorial}/pipelines.py (90%) rename {tutorial => scrapytutorial}/quotes.csv (100%) rename {tutorial => scrapytutorial}/quotes.jl (100%) rename {tutorial => scrapytutorial}/quotes.json (100%) rename {tutorial => scrapytutorial}/quotes.jsonlines (100%) rename {tutorial => scrapytutorial}/quotes.marshal (100%) rename {tutorial => scrapytutorial}/quotes.pickle (100%) rename {tutorial => scrapytutorial}/quotes.xml (100%) rename {tutorial => scrapytutorial}/settings.py (86%) rename {tutorial => scrapytutorial}/spiders/__init__.py (100%) rename {tutorial => scrapytutorial}/spiders/quotes.py (87%) diff --git 
a/scrapy.cfg b/scrapy.cfg index ce8c9bb..912411d 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.org/en/latest/deploy.html [settings] -default = tutorial.settings +default = scrapytutorial.settings [deploy] #url = http://localhost:6800/ -project = tutorial +project = scrapytutorial diff --git a/tutorial/__init__.py b/scrapytutorial/__init__.py similarity index 100% rename from tutorial/__init__.py rename to scrapytutorial/__init__.py diff --git a/tutorial/items.py b/scrapytutorial/items.py similarity index 100% rename from tutorial/items.py rename to scrapytutorial/items.py diff --git a/tutorial/middlewares.py b/scrapytutorial/middlewares.py similarity index 100% rename from tutorial/middlewares.py rename to scrapytutorial/middlewares.py diff --git a/tutorial/pipelines.py b/scrapytutorial/pipelines.py similarity index 90% rename from tutorial/pipelines.py rename to scrapytutorial/pipelines.py index 47b5ffd..c4d6f5d 100644 --- a/tutorial/pipelines.py +++ b/scrapytutorial/pipelines.py @@ -5,7 +5,7 @@ class TextPipeline(object): def __init__(self): self.limit = 50 - + def process_item(self, item, spider): if item['text']: if len(item['text']) > self.limit: @@ -19,22 +19,23 @@ class MongoDBPipeline(object): def __init__(self, connection_string, database): self.connection_string = connection_string self.database = database - + @classmethod def from_crawler(cls, crawler): return cls( - connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'), + connection_string=crawler.settings.get( + 'MONGODB_CONNECTION_STRING'), database=crawler.settings.get('MONGODB_DATABASE') ) - + def open_spider(self, spider): self.client = pymongo.MongoClient(self.connection_string) self.db = self.client[self.database] - + def process_item(self, item, spider): name = item.__class__.__name__ self.db[name].insert(dict(item)) return item - + def close_spider(self, spider): self.client.close() diff --git a/tutorial/quotes.csv 
b/scrapytutorial/quotes.csv similarity index 100% rename from tutorial/quotes.csv rename to scrapytutorial/quotes.csv diff --git a/tutorial/quotes.jl b/scrapytutorial/quotes.jl similarity index 100% rename from tutorial/quotes.jl rename to scrapytutorial/quotes.jl diff --git a/tutorial/quotes.json b/scrapytutorial/quotes.json similarity index 100% rename from tutorial/quotes.json rename to scrapytutorial/quotes.json diff --git a/tutorial/quotes.jsonlines b/scrapytutorial/quotes.jsonlines similarity index 100% rename from tutorial/quotes.jsonlines rename to scrapytutorial/quotes.jsonlines diff --git a/tutorial/quotes.marshal b/scrapytutorial/quotes.marshal similarity index 100% rename from tutorial/quotes.marshal rename to scrapytutorial/quotes.marshal diff --git a/tutorial/quotes.pickle b/scrapytutorial/quotes.pickle similarity index 100% rename from tutorial/quotes.pickle rename to scrapytutorial/quotes.pickle diff --git a/tutorial/quotes.xml b/scrapytutorial/quotes.xml similarity index 100% rename from tutorial/quotes.xml rename to scrapytutorial/quotes.xml diff --git a/tutorial/settings.py b/scrapytutorial/settings.py similarity index 86% rename from tutorial/settings.py rename to scrapytutorial/settings.py index 7f385c6..839f650 100644 --- a/tutorial/settings.py +++ b/scrapytutorial/settings.py @@ -9,13 +9,13 @@ # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -BOT_NAME = 'tutorial' +BOT_NAME = 'scrapytutorial' -SPIDER_MODULES = ['tutorial.spiders'] -NEWSPIDER_MODULE = 'tutorial.spiders' +SPIDER_MODULES = ['scrapytutorial.spiders'] +NEWSPIDER_MODULE = 'scrapytutorial.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'tutorial (+http://www.yourdomain.com)' +# USER_AGENT = 'scrapytutorial (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True @@ -46,13 +46,13 @@ # Enable or disable spider 
middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # SPIDER_MIDDLEWARES = { -# 'tutorial.middlewares.TutorialSpiderMiddleware': 543, +# 'scrapytutorial.middlewares.TutorialSpiderMiddleware': 543, # } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # DOWNLOADER_MIDDLEWARES = { -# 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, +# 'scrapytutorial.middlewares.MyCustomDownloaderMiddleware': 543, # } # Enable or disable extensions @@ -64,8 +64,8 @@ # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'tutorial.pipelines.TextPipeline': 300, - 'tutorial.pipelines.MongoDBPipeline': 400, + 'scrapytutorial.pipelines.TextPipeline': 300, + 'scrapytutorial.pipelines.MongoDBPipeline': 400, } # Enable and configure the AutoThrottle extension (disabled by default) @@ -90,4 +90,4 @@ # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' MONGODB_CONNECTION_STRING = 'localhost' -MONGODB_DATABASE = 'tutorial' +MONGODB_DATABASE = 'scrapytutorial' diff --git a/tutorial/spiders/__init__.py b/scrapytutorial/spiders/__init__.py similarity index 100% rename from tutorial/spiders/__init__.py rename to scrapytutorial/spiders/__init__.py diff --git a/tutorial/spiders/quotes.py b/scrapytutorial/spiders/quotes.py similarity index 87% rename from tutorial/spiders/quotes.py rename to scrapytutorial/spiders/quotes.py index d5980ae..1b7fb73 100644 --- a/tutorial/spiders/quotes.py +++ b/scrapytutorial/spiders/quotes.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import scrapy -from tutorial.items import QuoteItem +from scrapytutorial.items import QuoteItem class QuotesSpider(scrapy.Spider): name = "quotes" allowed_domains = ["quotes.toscrape.com"] - start_urls = ['http://quotes.toscrape.com/'] + start_urls = ['https://quotes.toscrape.com/'] def parse(self, response): quotes = 
response.css('.quote') From 22d7018280157a3ee260553780b06635537288e3 Mon Sep 17 00:00:00 2001 From: MetaJock Date: Sun, 12 Dec 2021 17:52:09 +0800 Subject: [PATCH 4/4] Modification description: insert() has been removed in pymongo 4.x. Modified by: Jock --- scrapytutorial/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapytutorial/pipelines.py b/scrapytutorial/pipelines.py index c4d6f5d..04d9088 100644 --- a/scrapytutorial/pipelines.py +++ b/scrapytutorial/pipelines.py @@ -34,7 +34,7 @@ def open_spider(self, spider): def process_item(self, item, spider): name = item.__class__.__name__ - self.db[name].insert(dict(item)) + self.db[name].insert_one(dict(item)) return item def close_spider(self, spider):