class Spider:
    def __init__(self):
-        self.start_url = ''
+        self.start_url = 'https://yq.aliyun.com/articles/'
        self.pool = ""
        self.stop = False
        self.waitting_url = Queue()
@@ -24,10 +24,10 @@ def extract_urls(self, html):
        :return:
        """
        urls = []
-        pq = PyQuery()
+        pq = PyQuery(html)
        for link in pq.items("a"):
            url = link.attr("href")
-            if url and url.startswith("http") and url not in self.seen_urls:
+            if url and url.startswith("https") and url not in self.seen_urls:
                urls.append(url)
                self.waitting_url.put(url)
        return urls
@@ -42,35 +42,28 @@ async def article_handler(self, url, session, pool):
        html = await self.fetch(url, session)
        self.seen_urls.add(url)
        self.extract_urls(html)
-        pq = PyQuery()
-        title = pq("title").text()
+        pq = PyQuery(html)
+        title = pq("#blog-title").text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                insert_sql = "insert into spider(title) values('{}')".format(title)
                await cur.execute(insert_sql)

-    async def consumer(self, pool):
-        async with aiohttp.ClientSession() as session:
-            while not self.stop:
-                url = self.waitting_url.get()
-                if re.match("http://.*?jobble.com", url):
-                    if url not in self.seen_urls:
-                        asyncio.ensure_future(self.article_handler(url, session))
-                else:
-                    if url not in self.seen_urls:
-                        asyncio.ensure_future(self.init_urls(url, session))
+    async def consumer(self, pool, session):
+        while not self.stop:
+            url = self.waitting_url.get()
+            if re.match(r"https://yq.aliyun.com/articles/\d*", url):
+                if url not in self.seen_urls:
+                    asyncio.ensure_future(self.article_handler(url, session, pool))
+            else:
+                if url not in self.seen_urls:
+                    asyncio.ensure_future(self.init_urls(url, session))
    async def init_urls(self, url, session):
        html = await self.fetch(url, session)
        self.seen_urls.add(url)
        self.extract_urls(html=html)

-    async def connection(self, loop):
-        pool = await aiomysql.create_pool(host="127.0.0.1", port=3306, user='root', password='root', db='aiospider',
-                                          loop=loop, charset='utf-8', autocommit=True)
-        asyncio.ensure_future(self.init_urls(self.start_url))
-        asyncio.ensure_future(self.consumer(self.pool))
-
    async def fetch(self, url, session):
        try:
            async with session.get(url) as response:
@@ -80,7 +73,15 @@ async def fetch(self, url, session):
        except Exception as e:
            print(e)

+    async def main(self, loop):
+        self.pool = await aiomysql.create_pool(host="127.0.0.1", port=3306, user='root', password='root',
+                                               db='aiospider', loop=loop, charset='utf8', autocommit=True)
+        async with aiohttp.ClientSession() as session:
+            asyncio.ensure_future(self.init_urls(self.start_url, session))
+            asyncio.ensure_future(self.consumer(self.pool, session))


if __name__ == "__main__":
    spider = Spider()
-    spider.fetch()
+    loop = asyncio.get_event_loop()
+    asyncio.ensure_future(spider.main(loop))
+    loop.run_forever()
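
One caveat with the new main(): the async with aiohttp.ClientSession() block can exit as soon as the two ensure_future() calls have scheduled their tasks, closing the session before the crawl has finished. Below is a minimal, self-contained sketch of the long-lived-session producer/consumer shape this commit is moving toward; the names (worker, demo_main) and the use of asyncio.Queue are illustrative assumptions, not code from this repository.

import asyncio
import aiohttp

async def worker(queue, session):
    # Consumer: take URLs off the shared queue and fetch them with the shared session.
    while True:
        url = await queue.get()  # asyncio.Queue.get() is a coroutine and must be awaited
        try:
            async with session.get(url) as resp:
                print(url, resp.status)
        finally:
            queue.task_done()

async def demo_main():
    queue = asyncio.Queue()
    # Keep the session open until every queued URL has been handled.
    async with aiohttp.ClientSession() as session:
        workers = [asyncio.ensure_future(worker(queue, session)) for _ in range(3)]
        await queue.put("https://yq.aliyun.com/articles/")
        await queue.join()  # wait for all queued work to be marked done
        for w in workers:
            w.cancel()
        await asyncio.gather(*workers, return_exceptions=True)

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(demo_main())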