From c6888be37634085a50cf29e67aea71a0e30e2e99 Mon Sep 17 00:00:00 2001 From: Germey Date: Sat, 29 Aug 2020 20:13:15 +0800 Subject: [PATCH 1/4] update v2 --- README.md | 3 +++ tutorial/pipelines.py | 26 +++++++++--------- tutorial/settings.py | 62 +++++++++++++++++++++---------------------- 3 files changed, 46 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 3abc458..208f3c0 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,5 @@ # ScrapyTutorial Scrapy Tutorial + +《Python3网络爬虫开发实战(第一版)》见 v1 分支。 +《Python3网络爬虫开发实战(第二版)》见 master 分支。 \ No newline at end of file diff --git a/tutorial/pipelines.py b/tutorial/pipelines.py index 59c3f88..47b5ffd 100644 --- a/tutorial/pipelines.py +++ b/tutorial/pipelines.py @@ -5,7 +5,7 @@ class TextPipeline(object): def __init__(self): self.limit = 50 - + def process_item(self, item, spider): if item['text']: if len(item['text']) > self.limit: @@ -15,26 +15,26 @@ def process_item(self, item, spider): return DropItem('Missing Text') -class MongoPipeline(object): - def __init__(self, mongo_uri, mongo_db): - self.mongo_uri = mongo_uri - self.mongo_db = mongo_db - +class MongoDBPipeline(object): + def __init__(self, connection_string, database): + self.connection_string = connection_string + self.database = database + @classmethod def from_crawler(cls, crawler): return cls( - mongo_uri=crawler.settings.get('MONGO_URI'), - mongo_db=crawler.settings.get('MONGO_DB') + connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'), + database=crawler.settings.get('MONGODB_DATABASE') ) - + def open_spider(self, spider): - self.client = pymongo.MongoClient(self.mongo_uri) - self.db = self.client[self.mongo_db] - + self.client = pymongo.MongoClient(self.connection_string) + self.db = self.client[self.database] + def process_item(self, item, spider): name = item.__class__.__name__ self.db[name].insert(dict(item)) return item - + def close_spider(self, spider): self.client.close() diff --git a/tutorial/settings.py 
b/tutorial/settings.py index 890271d..7f385c6 100644 --- a/tutorial/settings.py +++ b/tutorial/settings.py @@ -14,82 +14,80 @@ SPIDER_MODULES = ['tutorial.spiders'] NEWSPIDER_MODULE = 'tutorial.spiders' - # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'tutorial (+http://www.yourdomain.com)' +# USER_AGENT = 'tutorial (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See 
http://scrapy.readthedocs.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'tutorial.pipelines.TextPipeline': 300, - 'tutorial.pipelines.MongoPipeline': 400, + 'tutorial.pipelines.TextPipeline': 300, + 'tutorial.pipelines.MongoDBPipeline': 400, } - # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -MONGO_URI='localhost' -MONGO_DB='tutorial' \ No newline at end of file +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +MONGODB_CONNECTION_STRING = 'localhost' +MONGODB_DATABASE = 'tutorial' From e8342df1d479cf0195db64fb5fa984a4940427e1 Mon Sep 17 00:00:00 2001 From: Germey Date: Sat, 29 Aug 2020 20:15:35 
+0800 Subject: [PATCH 2/4] update --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 208f3c0..5cc96e3 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,5 @@ Scrapy Tutorial 《Python3网络爬虫开发实战(第一版)》见 v1 分支。 + 《Python3网络爬虫开发实战(第二版)》见 master 分支。 \ No newline at end of file From bd54f463bd5bd1a3241238b62ae32a67aedcf89b Mon Sep 17 00:00:00 2001 From: Germey Date: Sun, 25 Jul 2021 19:54:34 +0800 Subject: [PATCH 3/4] to scrapytutorial --- scrapy.cfg | 4 ++-- {tutorial => scrapytutorial}/__init__.py | 0 {tutorial => scrapytutorial}/items.py | 0 {tutorial => scrapytutorial}/middlewares.py | 0 {tutorial => scrapytutorial}/pipelines.py | 13 +++++++------ {tutorial => scrapytutorial}/quotes.csv | 0 {tutorial => scrapytutorial}/quotes.jl | 0 {tutorial => scrapytutorial}/quotes.json | 0 {tutorial => scrapytutorial}/quotes.jsonlines | 0 {tutorial => scrapytutorial}/quotes.marshal | Bin {tutorial => scrapytutorial}/quotes.pickle | Bin {tutorial => scrapytutorial}/quotes.xml | 0 {tutorial => scrapytutorial}/settings.py | 18 +++++++++--------- .../spiders/__init__.py | 0 .../spiders/quotes.py | 4 ++-- 15 files changed, 20 insertions(+), 19 deletions(-) rename {tutorial => scrapytutorial}/__init__.py (100%) rename {tutorial => scrapytutorial}/items.py (100%) rename {tutorial => scrapytutorial}/middlewares.py (100%) rename {tutorial => scrapytutorial}/pipelines.py (90%) rename {tutorial => scrapytutorial}/quotes.csv (100%) rename {tutorial => scrapytutorial}/quotes.jl (100%) rename {tutorial => scrapytutorial}/quotes.json (100%) rename {tutorial => scrapytutorial}/quotes.jsonlines (100%) rename {tutorial => scrapytutorial}/quotes.marshal (100%) rename {tutorial => scrapytutorial}/quotes.pickle (100%) rename {tutorial => scrapytutorial}/quotes.xml (100%) rename {tutorial => scrapytutorial}/settings.py (86%) rename {tutorial => scrapytutorial}/spiders/__init__.py (100%) rename {tutorial => scrapytutorial}/spiders/quotes.py (87%) diff --git 
a/scrapy.cfg b/scrapy.cfg index ce8c9bb..912411d 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.org/en/latest/deploy.html [settings] -default = tutorial.settings +default = scrapytutorial.settings [deploy] #url = http://localhost:6800/ -project = tutorial +project = scrapytutorial diff --git a/tutorial/__init__.py b/scrapytutorial/__init__.py similarity index 100% rename from tutorial/__init__.py rename to scrapytutorial/__init__.py diff --git a/tutorial/items.py b/scrapytutorial/items.py similarity index 100% rename from tutorial/items.py rename to scrapytutorial/items.py diff --git a/tutorial/middlewares.py b/scrapytutorial/middlewares.py similarity index 100% rename from tutorial/middlewares.py rename to scrapytutorial/middlewares.py diff --git a/tutorial/pipelines.py b/scrapytutorial/pipelines.py similarity index 90% rename from tutorial/pipelines.py rename to scrapytutorial/pipelines.py index 47b5ffd..c4d6f5d 100644 --- a/tutorial/pipelines.py +++ b/scrapytutorial/pipelines.py @@ -5,7 +5,7 @@ class TextPipeline(object): def __init__(self): self.limit = 50 - + def process_item(self, item, spider): if item['text']: if len(item['text']) > self.limit: @@ -19,22 +19,23 @@ class MongoDBPipeline(object): def __init__(self, connection_string, database): self.connection_string = connection_string self.database = database - + @classmethod def from_crawler(cls, crawler): return cls( - connection_string=crawler.settings.get('MONGODB_CONNECTION_STRING'), + connection_string=crawler.settings.get( + 'MONGODB_CONNECTION_STRING'), database=crawler.settings.get('MONGODB_DATABASE') ) - + def open_spider(self, spider): self.client = pymongo.MongoClient(self.connection_string) self.db = self.client[self.database] - + def process_item(self, item, spider): name = item.__class__.__name__ self.db[name].insert(dict(item)) return item - + def close_spider(self, spider): self.client.close() diff --git a/tutorial/quotes.csv 
b/scrapytutorial/quotes.csv similarity index 100% rename from tutorial/quotes.csv rename to scrapytutorial/quotes.csv diff --git a/tutorial/quotes.jl b/scrapytutorial/quotes.jl similarity index 100% rename from tutorial/quotes.jl rename to scrapytutorial/quotes.jl diff --git a/tutorial/quotes.json b/scrapytutorial/quotes.json similarity index 100% rename from tutorial/quotes.json rename to scrapytutorial/quotes.json diff --git a/tutorial/quotes.jsonlines b/scrapytutorial/quotes.jsonlines similarity index 100% rename from tutorial/quotes.jsonlines rename to scrapytutorial/quotes.jsonlines diff --git a/tutorial/quotes.marshal b/scrapytutorial/quotes.marshal similarity index 100% rename from tutorial/quotes.marshal rename to scrapytutorial/quotes.marshal diff --git a/tutorial/quotes.pickle b/scrapytutorial/quotes.pickle similarity index 100% rename from tutorial/quotes.pickle rename to scrapytutorial/quotes.pickle diff --git a/tutorial/quotes.xml b/scrapytutorial/quotes.xml similarity index 100% rename from tutorial/quotes.xml rename to scrapytutorial/quotes.xml diff --git a/tutorial/settings.py b/scrapytutorial/settings.py similarity index 86% rename from tutorial/settings.py rename to scrapytutorial/settings.py index 7f385c6..839f650 100644 --- a/tutorial/settings.py +++ b/scrapytutorial/settings.py @@ -9,13 +9,13 @@ # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html -BOT_NAME = 'tutorial' +BOT_NAME = 'scrapytutorial' -SPIDER_MODULES = ['tutorial.spiders'] -NEWSPIDER_MODULE = 'tutorial.spiders' +SPIDER_MODULES = ['scrapytutorial.spiders'] +NEWSPIDER_MODULE = 'scrapytutorial.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent -# USER_AGENT = 'tutorial (+http://www.yourdomain.com)' +# USER_AGENT = 'scrapytutorial (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True @@ -46,13 +46,13 @@ # Enable or disable spider 
middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # SPIDER_MIDDLEWARES = { -# 'tutorial.middlewares.TutorialSpiderMiddleware': 543, +# 'scrapytutorial.middlewares.TutorialSpiderMiddleware': 543, # } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # DOWNLOADER_MIDDLEWARES = { -# 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, +# 'scrapytutorial.middlewares.MyCustomDownloaderMiddleware': 543, # } # Enable or disable extensions @@ -64,8 +64,8 @@ # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'tutorial.pipelines.TextPipeline': 300, - 'tutorial.pipelines.MongoDBPipeline': 400, + 'scrapytutorial.pipelines.TextPipeline': 300, + 'scrapytutorial.pipelines.MongoDBPipeline': 400, } # Enable and configure the AutoThrottle extension (disabled by default) @@ -90,4 +90,4 @@ # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' MONGODB_CONNECTION_STRING = 'localhost' -MONGODB_DATABASE = 'tutorial' +MONGODB_DATABASE = 'scrapytutorial' diff --git a/tutorial/spiders/__init__.py b/scrapytutorial/spiders/__init__.py similarity index 100% rename from tutorial/spiders/__init__.py rename to scrapytutorial/spiders/__init__.py diff --git a/tutorial/spiders/quotes.py b/scrapytutorial/spiders/quotes.py similarity index 87% rename from tutorial/spiders/quotes.py rename to scrapytutorial/spiders/quotes.py index d5980ae..1b7fb73 100644 --- a/tutorial/spiders/quotes.py +++ b/scrapytutorial/spiders/quotes.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import scrapy -from tutorial.items import QuoteItem +from scrapytutorial.items import QuoteItem class QuotesSpider(scrapy.Spider): name = "quotes" allowed_domains = ["quotes.toscrape.com"] - start_urls = ['http://quotes.toscrape.com/'] + start_urls = ['https://quotes.toscrape.com/'] def parse(self, response): quotes = 
response.css('.quote') From 22d7018280157a3ee260553780b06635537288e3 Mon Sep 17 00:00:00 2001 From: MetaJock Date: Sun, 12 Dec 2021 17:52:09 +0800 Subject: [PATCH 4/4] Modification description: insert() has been removed in pymongo 4.x. Modified by: Jock --- scrapytutorial/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapytutorial/pipelines.py b/scrapytutorial/pipelines.py index c4d6f5d..04d9088 100644 --- a/scrapytutorial/pipelines.py +++ b/scrapytutorial/pipelines.py @@ -34,7 +34,7 @@ def open_spider(self, spider): def process_item(self, item, spider): name = item.__class__.__name__ - self.db[name].insert(dict(item)) + self.db[name].insert_one(dict(item)) return item def close_spider(self, spider):