Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3d5fea8

Browse files
committed
樊登脚本优化
1 parent 5cdfcae commit 3d5fea8

File tree

9 files changed

+66
-47
lines changed

9 files changed

+66
-47
lines changed

.idea/leetcode-python-.iml

Lines changed: 1 addition & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

微信公众号/check_link.py

Lines changed: 42 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,17 @@
66
from selenium import webdriver
77
import xlrd
88
from xlutils.copy import copy
9+
from queue import Queue
10+
import threading
911

12+
wait_time = 0
13+
14+
link_file_path = r"C:\Users\lenovo\PycharmProjects\leetcode-python-\微信公众号\link.xls"
15+
link_ecel = xlrd.open_workbook(link_file_path)
16+
link_tables = link_ecel.sheet_by_index(0)
17+
link_get_col = 2
18+
link_write_col = 3
19+
link_can_use_index = 1
1020

1121
def get_keywords_data(tables, row, col):
1222
actual_data = tables.cell_value(row, col)
@@ -22,13 +32,13 @@ def write_to_excel(file_path, row, col, value):
2232

2333

2434
def get_links():
25-
res = []
26-
count = 0
27-
link_data = [get_keywords_data(link_tables, i, link_get_col) for i in range(1, link_tables.nrows)]
28-
for index, link in enumerate(link_data):
35+
global count
36+
driver = webdriver.Chrome(r"C:\Users\lenovo\PycharmProjects\leetcode-python-\微信公众号\chromedriver.exe")
37+
while not link_queue.empty():
38+
index, link = link_queue.get()
2939
driver.get(link)
30-
time.sleep(wait_time)
3140
try:
41+
time.sleep(wait_time)
3242
text = driver.find_element_by_xpath("/html/body/div[1]/div[1]/p[1]")
3343
if text.text == "开卡失败":
3444
write_to_excel(link_file_path, index + 1, link_write_col, "已使用")
@@ -37,19 +47,33 @@ def get_links():
3747
count += 1
3848
res.append(link)
3949
print("该卡可以使用:{}".format(link))
40-
print("当前可使用链接个数为:{}".format(count))
41-
return res
50+
# print("当前可使用链接个数为:{}".format(count))
51+
4252

4353

4454
if __name__ == '__main__':
45-
wait_time = 3
46-
driver = webdriver.Chrome(r"C:\Users\lenovo\PycharmProjects\Spider\chromedriver.exe")
47-
link_file_path = r"C:\Users\lenovo\PycharmProjects\leetcode-python-\微信公众号\link.xls"
48-
link_ecel = xlrd.open_workbook(link_file_path)
49-
link_tables = link_ecel.sheet_by_index(0)
50-
link_get_col = 2
51-
link_write_col = 3
52-
link_can_use_index = 1
53-
54-
res = get_links()
55-
print(res)
55+
import time
56+
57+
start_time = time.time()
58+
link_queue = Queue()
59+
print("正在初始化链接队列。。。")
60+
for i in range(1, link_tables.nrows - 1):
61+
link_queue.put([i, get_keywords_data(link_tables, i, link_get_col)])
62+
print("初始化完成")
63+
res = []
64+
count = 0
65+
66+
threads_lis = []
67+
for i in range(3):
68+
thread = threading.Thread(target=get_links)
69+
threads_lis.append(thread)
70+
71+
for one in threads_lis:
72+
one.start()
73+
74+
for one in threads_lis:
75+
one.join()
76+
77+
print("当前可用链接数:{}".format(count))
78+
end_time = time.time()
79+
print("用时:{}".format(end_time - start_time))

微信公众号/chromedriver.exe

8.3 MB
Binary file not shown.

微信公众号/link.xls

-19.5 KB
Binary file not shown.

微信公众号/new_check_link.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# @Author : Libuda
33
# @FileName: new_check_link.py
44
# @Software: PyCharm
5-
5+
import time
66
from bs4 import BeautifulSoup
77
import requests
88
import xlrd
@@ -15,6 +15,8 @@
1515
link_write_col = 3
1616
link_can_use_index = 1
1717

18+
headers = {
19+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
1820

1921
def get_keywords_data(tables, row, col):
2022
actual_data = tables.cell_value(row, col)
@@ -32,23 +34,29 @@ def write_to_excel(file_path, row, col, value):
3234
def get_links():
3335
res = []
3436
count = 0
35-
link_data = [get_keywords_data(link_tables, i, link_get_col) for i in range(1, link_tables.nrows)]
37+
link_data = [get_keywords_data(link_tables, i, link_get_col) for i in range(1, link_tables.nrows - 1)]
3638
for index, link in enumerate(link_data):
39+
response = requests.get(link, headers=headers).content
40+
41+
# print(response)
42+
soup = BeautifulSoup(response.decode('utf-8'), "html.parser")
3743

38-
try:
39-
print(link)
40-
response = requests.get(link).content
41-
soup = BeautifulSoup(response.decode('utf-8'), "html.parser")
42-
print(soup.find(class_='mobile').get_text())
44+
# time.sleep(1)
45+
if soup.find("input", class_='mobile'):
4346

44-
except Exception as e:
45-
print(e)
47+
# print(soup.find("input",class_='mobile'))
4648
count += 1
4749
res.append(link)
4850
print("该卡可以使用:{}".format(link))
51+
else:
52+
print("该链接已使用".format(link))
53+
write_to_excel(link_file_path, index, link_write_col, "已使用")
4954
print("当前可使用链接个数为:{}".format(count))
5055
return res
5156

5257

5358
if __name__ == '__main__':
59+
start_time = time.time()
5460
get_links()
61+
end_time = time.time()
62+
print("时间:{}".format(end_time - start_time))

微信公众号/phantomjs.exe

17.7 MB
Binary file not shown.

樊登读书脚本/all.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -273,22 +273,20 @@ def register(phone_data):
273273

274274
def main():
275275
crawl_count = 1
276+
windows = ""
277+
second_window = ''
276278
while 1:
277279

278280
# time_str = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
279281
# print(times)
280282
# # s="2019.12.02 13:56:20"
281283
# # print(datetime.datetime.strptime(s,"%Y.%m.%d %H:%M:%S")<times)
282284
if crawl_count == 1:
283-
now_time = time.time()
284-
# end_date = now_time- time_jiange
285-
times = datetime.datetime.fromtimestamp(now_time)
286-
#time_str = "{}-{}-{} {}:{}:{}".format(times.year, times.month, times.day - 1, 0, 0, 0)
287285
print("第1次爬取")
288286
# driver.get("https://e.douyin.com/site/manage-center/user-manage")
289287
driver.get("https://e.douyin.com/site/")
290288

291-
print("请您进行登录及手动进行所有的筛选并更改浏览器设置!")
289+
print("请您进行登录及手动进行所有的筛选")
292290
yes = input("您是否已确认进行爬取")
293291
# cookie= driver.get_cookies()
294292
# driver.get("https://e.douyin.com/site/manage-center/user-manage")
@@ -300,28 +298,19 @@ def main():
300298
for wins in driver.window_handles:
301299
if wins != windows:
302300
driver.switch_to.window(wins)
301+
second_window = driver.current_window_handle
302+
y = input("是否设置完毕")
303303
# 测试
304304
# phone_data = [[phone, 0] for phone in [13945868092, 15169722520]]
305305
register(phone_data)
306-
driver.close()
307306
driver.switch_to.window(windows)
308307

309308
crawl_count += 1
310309
else:
311310
print("第{}次爬取".format(crawl_count))
312-
now_time = time.time() - time_jiange
313-
times = datetime.datetime.fromtimestamp(now_time)
314311
phone_data = get_phone_number(start_date, end_date)
315-
windows = driver.current_window_handle
316-
js = 'window.open("https://www.baidu.com");'
317-
driver.execute_script(js)
318-
for wins in driver.window_handles:
319-
if wins != windows:
320-
driver.switch_to.window(wins)
321-
# 测试
322-
# phone_data = [[phone,0] for phone in [15099123201,17621790591]]
312+
driver.switch_to.window(second_window)
323313
register(phone_data)
324-
driver.close()
325314
driver.switch_to.window(windows)
326315
crawl_count += 1
327316

樊登读书脚本/link.xls

0 Bytes
Binary file not shown.

樊登读书脚本/phone_number.xls

-18.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)