Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 50bffa5

Browse files
committed
抖音后台爬虫
1 parent 4a5485b commit 50bffa5

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

抖音后台爬虫/res.xls

29.5 KB
Binary file not shown.

抖音后台爬虫/spider.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# @Time : 2019/11/25 16:55
2+
# @Author : Libuda
3+
# @FileName: spider.py
4+
# @Software: PyCharm
5+
6+
import pandas
7+
import time
8+
from selenium import webdriver
9+
10+
# Scraper script: opens https://e.douyin.com/site/ in Chrome, waits for the
# operator to log in and apply filters by hand, then walks the paginated result
# table and saves the 4th cell (stored as 'phone_number') and the 6th cell
# (stored as 'date') of every row to an Excel file.

# NOTE(review): not referenced anywhere in the visible script; kept so the
# module-level name still exists.
phone_num = 13628398278
# NOTE(review): the chromedriver path is passed positionally — confirm this
# calling convention is supported by the installed Selenium version.
driver = webdriver.Chrome(r'C:\Users\lenovo\PycharmProjects\Spider\chromedriver.exe')
df = pandas.DataFrame()
file_path = r"C:\Users\lenovo\PycharmProjects\leetcode-python-\抖音后台爬虫\res.xls"
driver.get("https://e.douyin.com/site/")
print("请您进行登录及手动进行所有的筛选")
yes = input("您是否已确认进行爬取")
if yes == "y":
    # The first <li> of the pager carries the total-count label; the number is
    # the text between "共" and "条".
    total_label = driver.find_element(
        "xpath",
        '//*[@id="root"]/div[2]/div[1]/div/div/div[3]/div[1]/div[2]/div[3]/div/div/div/div/div/ul/li[1]').text
    all_data_len = total_label.split("条")[0].split("共")[1]
    print("总共 {} 条数据".format(all_data_len))
    # Parse once instead of on every loop test.
    total_rows = int(all_data_len)
    # XPath templates for the 4th and 6th cell of table row {}.
    num_tem = '//*[@id="root"]/div[2]/div[1]/div/div/div[3]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/div[1]/div/table/tbody/tr[{}]/td[4]'
    date_tem = '//*[@id="root"]/div[2]/div[1]/div/div/div[3]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/div[1]/div/table/tbody/tr[{}]/td[6]'

    while len(df) < total_rows:
        time.sleep(3)  # interval between pages
        res_data = []
        # At most 10 rows are read per page. find_elements returns an empty
        # list instead of raising, so a page with fewer rows (e.g. the last
        # one) ends the row loop cleanly.
        for i in range(1, 11):
            num_cells = driver.find_elements("xpath", num_tem.format(i))
            date_cells = driver.find_elements("xpath", date_tem.format(i))
            if not num_cells or not date_cells:
                break
            res_data.append({
                'phone_number': num_cells[0].text,
                'date': date_cells[0].text,
            })
        if not res_data:
            # Nothing could be read from this page; stop instead of looping
            # forever on an empty table.
            print("当前页未读取到任何数据行，停止爬取")
            break
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # replacement.
        df = pandas.concat([df, pandas.DataFrame(res_data)], ignore_index=True)
        # Save after every page so partial results survive a later failure.
        df.to_excel(file_path, index=False)
        if len(df) >= total_rows:
            # Everything has been collected; no need to click "next" again.
            break
        # Go to the next page: try the pager's "next" class first, then fall
        # back to fixed positions in the pager list. The broad excepts are a
        # deliberate best-effort chain; the last attempt is allowed to raise.
        try:
            driver.find_element("css selector", '.ant-pagination-next').click()
        except Exception:
            try:
                driver.find_element(
                    "xpath",
                    '//*[@id="root"]/div[2]/div[1]/div/div/div[3]/div[1]/div[2]/div[3]/div/div/div/div/div/ul/li[12]').click()
            except Exception:
                driver.find_element(
                    "xpath",
                    '//*[@id="root"]/div[2]/div[1]/div/div/div[3]/div[1]/div[2]/div[3]/div/div/div/div/div/ul/li[10]').click()

0 commit comments

Comments
 (0)