
Commit d4992db

Async IO aiohttp crawler
1 parent 8e213f2 commit d4992db

2 files changed: 69 additions, 22 deletions
aiohttp_demo.py (new file): 46 additions & 0 deletions

@@ -0,0 +1,46 @@
# @Time : 2019/10/8 18:21
# @Author : Libuda
# @FileName: aiohttp_demo.py
# @Software: PyCharm
# asyncio provides no HTTP-protocol interface of its own; aiohttp fills that gap
import asyncio
import socket  # imported in the original; unused, since asyncio.open_connection manages the socket
from urllib.parse import urlparse


async def get_url(url):
    # fetch the html over a raw socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # open the socket connection
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    all_lines = []
    async for raw_line in reader:
        data = raw_line.decode("utf8")
        all_lines.append(data)
    writer.close()
    html = "\n".join(all_lines)
    return html


async def main():
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        tasks.append(asyncio.ensure_future(get_url(url)))
    for task in asyncio.as_completed(tasks):
        result = await task
        print(result)


if __name__ == "__main__":
    import time

    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print('elapsed time: {}'.format(time.time() - start_time))
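For contrast with the raw-socket version above, here is a minimal sketch of the same concurrent fetch written with aiohttp itself, which the file's opening comment points to. The URL pattern and timing scaffold are copied from the demo; everything else is an assumption for illustration, not part of the commit.

import asyncio
import time

import aiohttp


async def get_url(session, url):
    # aiohttp builds the request line, headers and connection handling for us
    async with session.get(url) as response:
        return await response.text()


async def main():
    # one ClientSession is shared by all twenty requests
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.ensure_future(get_url(session, "http://shop.projectsedu.com/goods/{}/".format(i)))
                 for i in range(20)]
        for task in asyncio.as_completed(tasks):
            print(await task)


if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print('elapsed time: {}'.format(time.time() - start_time))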

python中多线程,多进程,进程池/aiohttp_spider.py: 23 additions & 22 deletions
@@ -12,7 +12,7 @@

 class Spider:
     def __init__(self):
-        self.start_url = ''
+        self.start_url = 'https://yq.aliyun.com/articles/'
         self.pool = ""
         self.stop = False
         self.waitting_url = Queue()
@@ -24,10 +24,10 @@ def extract_urls(self, html):
         :return:
         """
         urls = []
-        pq = PyQuery()
+        pq = PyQuery(html)
         for link in pq.items("a"):
             url = link.attr("href")
-            if url and url.startwith("http") and url not in self.seen_urls:
+            if url and url.startswith("https") and url not in self.seen_urls:
                 urls.append(url)
                 self.waitting_url.put(url)
         return urls
@@ -42,35 +42,28 @@ async def article_handler(self, url, session, pool):
         html = await self.fetch(url, session)
         self.seen_urls.add(url)
         self.extract_urls(html)
-        pq = PyQuery()
-        title = pq("title").text()
+        pq = PyQuery(html)
+        title = pq("#blog-title").text()
         async with pool.acquire() as conn:
             async with conn.cursor() as cur:
                 insert_sql = "insert into spider(title) values('{}')".format(title)
                 await cur.execute(insert_sql)

-    async def consumer(self, pool):
-        async with aiohttp.ClientSession() as session:
-            while not self.stop:
-                url = self.waitting_url.get()
-                if re.match("http://.*?jobble.com", url):
-                    if url not in self.seen_urls:
-                        asyncio.ensure_future(self.article_handler(url, session))
-                else:
-                    if url not in self.seen_urls:
-                        asyncio.ensure_future(self.init_urls(url, session))
+    async def consumer(self, pool, session):
+        while not self.stop:
+            url = self.waitting_url.get()
+            if re.match(r"https://yq.aliyun.com/articles/\d*", url):
+                if url not in self.seen_urls:
+                    asyncio.ensure_future(self.article_handler(url, session, pool))
+            else:
+                if url not in self.seen_urls:
+                    asyncio.ensure_future(self.init_urls(url, session))

     async def init_urls(self, url, session):
         html = await self.fetch(url, session)
         self.seen_urls.add(url)
         self.extract_urls(html=html)

-    async def connection(self, loop):
-        pool = await aiomysql.create_pool(host="127.0.0.1", port=3306, user='root', password='root', db='aiospider',
-                                          loop=loop, charset='utf-8', autocommit=True)
-        asyncio.ensure_future(self.init_urls(self.start_url))
-        asyncio.ensure_future(self.consumer(self.pool))
-
     async def fetch(self, url, session):
         try:
             async with session.get(url) as response:
@@ -80,7 +73,15 @@ async def fetch(self, url, session):
         except Exception as e:
             print(e)

+    async def main(self, loop):
+        self.pool = await aiomysql.create_pool(host="127.0.0.1", port=3306, user='root', password='root', db='aiospider',
+                                               loop=loop, charset='utf8', autocommit=True)
+        async with aiohttp.ClientSession() as session:
+            asyncio.ensure_future(self.init_urls(self.start_url, session))
+            asyncio.ensure_future(self.consumer(self.pool, session))

 if __name__ == "__main__":
     spider = Spider()
-    spider.fetch()
+    loop = asyncio.get_event_loop()
+    asyncio.ensure_future(spider.main(loop))
+    loop.run_forever()
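One caution on the article_handler hunk above: insert_sql is built with str.format, so any title containing a quote breaks the statement and leaves the insert open to SQL injection. Below is a minimal sketch of the same write using aiomysql's parameter binding; the table and column come from the diff, while the helper name is made up for illustration.

import aiomysql


async def save_title(pool, title):
    # pass the value separately and let the driver escape it,
    # instead of formatting it into the SQL string
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute("insert into spider(title) values(%s)", (title,))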

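Likewise, waitting_url is a synchronous queue.Queue, so the get() call inside consumer blocks the whole event loop whenever the queue is empty. A sketch of the same producer/consumer shape with asyncio.Queue, whose get() can be awaited; the class and attribute names follow the diff, and the body is illustrative only.

import asyncio


class Spider:
    def __init__(self):
        self.stop = False
        self.seen_urls = set()
        # asyncio.Queue suspends the awaiting coroutine instead of
        # blocking the event loop when the queue is empty
        self.waitting_url = asyncio.Queue()

    async def consumer(self, pool, session):
        while not self.stop:
            # yields control to the loop until a url is available
            url = await self.waitting_url.get()
            if url not in self.seen_urls:
                print("would dispatch:", url)  # dispatch as in the diff

extract_urls would then enqueue with self.waitting_url.put_nowait(url) in place of the blocking put().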