Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 43fe65b

Browse files
committed
谷歌和必应关键词爬虫
1 parent 8e5d598 commit 43fe65b

File tree

10 files changed

+573
-0
lines changed

10 files changed

+573
-0
lines changed
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Settings read by the Bing spider through ConfigParser (section "default").
[default]
2+
# Path of the PhantomJS binary handed to webdriver.PhantomJS.
executable_path = phantomjs.exe
3+
# Workbook holding the search keywords (sheet 0, column 0 is read).
keywords_excel_path = keyword.xls
4+
# Workbook that receives the scraped result titles and site URLs.
biying_datas = biying_data.xls
5+
# Retry budget used when the result page URL stops changing between pages.
max_test_count = 3
537 KB
Binary file not shown.
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# @Author : Libuda
2+
# @FileName: google_spider.py
3+
# @Software: PyCharm
4+
import sys
5+
6+
sys.path.append("./")
7+
from configparser import ConfigParser
8+
from selenium import webdriver
9+
import time
10+
import xlrd
11+
import base64
12+
from queue import Queue
13+
from xlutils.copy import copy # 写入Excel
14+
15+
# Module-level setup: load the "default" section of config.cfg and start the
# shared PhantomJS browser. All of this runs as a side effect of importing
# the module.
config_parser = ConfigParser()
16+
# The path is relative, so it is resolved against the current working
# directory. ConfigParser.read() skips files it cannot open, in which case
# the section lookup below raises KeyError.
config_parser.read('config.cfg')
17+
config = config_parser['default']
18+
# Single browser instance used by every Spider method in this module.
browser = webdriver.PhantomJS(executable_path=config['executable_path'])
19+
20+
# Local helper module wrapping xlrd sheet access.
from operationExcel import OperationExcel
21+
22+
# Running total of results stored during this run; updated by Spider.
res_count = 0
23+
24+
25+
class Spider():
    """Scrape Bing result titles and site origins for a list of keywords.

    Keywords are read from column 0 of the workbook named by the
    ``keywords_excel_path`` setting (row 0 is skipped).  Every previously
    unseen ``scheme://host`` origin is written, together with its result
    title, to the workbook named by the ``biying_datas`` setting.
    """

    # Search landing page; the query form on it is filled in for each keyword.
    SEARCH_URL = "https://cn.bing.com/?FORM=BEHPTB&ensearch=1"
    # The pager link to click is addressed by position; positions are tried in order.
    NEXT_PAGE_SELECTOR = "#b_results > li.b_pag > nav > ul > li:nth-child({}) > a"
    NEXT_PAGE_POSITIONS = (9, 8, 7, 6)
    # Sheet index passed to get_sheet() when storing results.
    # NOTE(review): presumably -1 selects the last sheet of the copy - confirm.
    RESULT_SHEET_ID = -1

    def __init__(self):
        """Open the keyword and result workbooks named in the configuration."""
        self.opExcel = OperationExcel(config['keywords_excel_path'], 0)
        self.file_path = config['biying_datas']
        self.dataExcel = OperationExcel(self.file_path, 0)
        self.keywords_queue = Queue()
        # Site origins already stored during this run (used for de-duplication).
        self.res = set()
        # Next free row in the result sheet; main() sets it before scraping.
        self._next_row = 0

    def get_keywords_data(self, row):
        """Return the keyword stored in column 0 of ``row``.

        The value comes from the keyword workbook loaded in ``__init__`` so
        the file is not re-opened for every keyword.

        :param row: zero-based row index in the keyword sheet
        :return: the cell value
        """
        return self.opExcel.get_cel_value(row, 0)

    def write_to_excel(self, file_path, sheet_id, row, col, value):
        """Write ``str(value)`` into one cell of an existing workbook.

        :param file_path: workbook to update; it is overwritten on save
        :param sheet_id: sheet index passed to ``get_sheet``
        :param row: zero-based row index
        :param col: zero-based column index
        :param value: value to store (converted with ``str``)
        :return: None
        """
        self._write_cells(file_path, sheet_id, row, {col: value})

    def _write_cells(self, file_path, sheet_id, row, values):
        """Write several cells of one row with a single open/copy/save cycle.

        :param values: mapping of column index to value
        """
        work_book = xlrd.open_workbook(file_path, formatting_info=False)
        # xlrd workbooks are read-only; xlutils.copy yields a writable copy.
        writable_book = copy(work_book)
        # sheet_by_index() objects have no write(); get_sheet() ones do.
        sheet_data = writable_book.get_sheet(sheet_id)
        for col, value in values.items():
            sheet_data.write(row, col, str(value))
        # Saving replaces the original file; cell formatting is not kept.
        writable_book.save(file_path)

    def main(self):
        """Search every keyword, store new results, then idle forever.

        The method never returns: after the last keyword it keeps the
        process alive so the console output stays visible.
        """
        max_retries = int(config['max_test_count'])
        last_count = 0
        self._next_row = self.dataExcel.get_nrows()
        print("当前已有url数量:", self._next_row)
        key_len = self.opExcel.get_nrows()
        print("关键词总数:", key_len)

        for index in range(1, key_len):
            key = self.get_keywords_data(index)
            self._start_search(key)
            # The retry budget is per keyword, so it is passed in fresh here.
            self._collect_keyword(index, key, max_retries)
            print("当前关键词 :{} 爬取完毕 已爬取数据 :{}".format(key, res_count - last_count))
            print("本次采集已获取url总数为:", str(res_count))
            last_count = res_count

        print("关键词搜索完毕,谢谢使用!")
        # The browser is no longer needed; stop its process before idling.
        browser.quit()
        # Keep the process alive without spinning a CPU core.
        while True:
            time.sleep(1)

    def _submit_query(self, key):
        """Load the search page, type ``key`` and submit the form."""
        browser.get(self.SEARCH_URL)
        browser.find_element_by_css_selector("#sb_form_q").send_keys(key)
        browser.find_element_by_css_selector("#sb_form_go").click()

    def _start_search(self, key):
        """Submit ``key`` and wait up to 20 attempts for the results page."""
        try:
            print("启动中。。。。,如果20s内没有启动 请重新启动本软件")
            self._submit_query(key)
            for attempt in range(20):
                if browser.current_url != self.SEARCH_URL:
                    # The browser left the landing page: results are loading.
                    break
                print(20 - attempt)
                time.sleep(1)
                print("正在第{}次尝试自动启动。。。。。".format(attempt + 1))
                self._submit_query(key)
        except Exception:
            # One last unguarded attempt; a second failure is a real error
            # and is allowed to propagate.
            print("正在尝试自动启动。。。。。")
            self._submit_query(key)

    def _collect_keyword(self, index, key, max_retries):
        """Walk the result pages of the current search and store new results.

        Paging stops when no next-page link can be clicked, or when the page
        URL has stopped changing and the retry budget is used up.
        """
        retries_left = max_retries
        visited = set()
        while True:
            try:
                page_url = browser.current_url
                if page_url in visited:
                    if retries_left < 0:
                        print("no next")
                        return
                    print("当前url {} 可能为最后一页,进行第{}次测试".format(page_url, retries_left))
                    retries_left -= 1
                    # Give a pending navigation time to finish before re-checking.
                    time.sleep(1)
                else:
                    print("当前正在采集第 {} 个关键词:{},采集的页数为 :{} ".format(
                        (index + 1), key, len(visited) + 1))
                    print("当前url", page_url)
                    visited.add(page_url)
                    self._save_page_results()
            except Exception as e:
                # Best effort: report the problem and still try the next page.
                print(e)

            if not self._click_next_page():
                print("找不到下一页呢")
                time.sleep(5)
                return

    def _save_page_results(self):
        """Store every result on the current page whose origin is new."""
        for heading in browser.find_elements_by_css_selector("#b_results > li > h2"):
            # Look the link up inside its own heading so title and URL
            # can never get out of step.
            links = heading.find_elements_by_xpath("./a")
            if not links:
                continue
            origin = self._site_origin(links[0].get_attribute("href"))
            if origin is None or origin in self.res:
                continue
            self.res.add(origin)
            self._store_result(heading.text, origin)

    @staticmethod
    def _site_origin(href):
        """Return ``scheme://host`` for an absolute URL, else ``None``."""
        if not href:
            return None
        parts = href.split("/")
        # An absolute URL splits into ["scheme:", "", "host", ...].
        if len(parts) < 3 or parts[1] or not parts[2]:
            return None
        return parts[0] + "//" + parts[2]

    def _store_result(self, title, origin):
        """Append one result row; retry once after 10 s if the write fails."""
        global res_count
        row = self._next_row
        cells = {0: title, 1: origin}
        try:
            self._write_cells(self.file_path, self.RESULT_SHEET_ID, row, cells)
        except Exception as e:
            print(e, "请关闭Excel 否则10秒后本条数据将不再写入")
            for remaining in range(10, 0, -1):
                print(remaining)
                time.sleep(1)
            try:
                self._write_cells(self.file_path, self.RESULT_SHEET_ID, row, cells)
            except Exception:
                print("已漏掉数据...{} {}".format(title, origin))
                return
            print(row, title, origin, browser.current_url)
        else:
            print(row, title, origin)
        # Advance only after a successful write so no stored row is overwritten.
        self._next_row += 1
        res_count += 1

    def _click_next_page(self):
        """Click the first pager link that works; return whether one did."""
        for position in self.NEXT_PAGE_POSITIONS:
            selector = self.NEXT_PAGE_SELECTOR.format(position)
            try:
                browser.find_element_by_css_selector(selector).click()
            except Exception:
                # Missing or unclickable at this position: try the next one.
                continue
            return True
        return False
206+
207+
208+
if __name__ == "__main__":
    # Script entry point: build the spider and run it over every keyword.
    Spider().main()
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import xlrd
2+
import xlwt
3+
from xlutils.copy import copy # 写入Excel
4+
from xlutils.copy import copy as xl_copy
5+
6+
7+
class OperationExcel():
    """Object-style access to one sheet of an Excel workbook via xlrd."""

    # Workbook opened when no file name is supplied.
    DEFAULT_FILE_NAME = r"C:\Users\lenovo\PycharmProjects\Spider\biying_data.xls"

    def __init__(self, file_name=None, sheet_id=None):
        """Load the requested sheet.

        :param file_name: workbook path; a falsy value selects
            ``DEFAULT_FILE_NAME`` and sheet 0
        :param sheet_id: zero-based sheet index; ``None`` means the first sheet
        """
        if file_name:
            self.file_name = file_name
            # sheet_by_index() needs an integer, so None falls back to sheet 0.
            self.sheet_id = 0 if sheet_id is None else sheet_id
        else:
            self.file_name = self.DEFAULT_FILE_NAME
            self.sheet_id = 0
        self.tables = self.get_tables()

    def create_sheet(self, sheet_name):
        """Add an empty sheet called ``sheet_name`` and save the workbook.

        :param sheet_name: name of the new sheet
        """
        workbook = xlrd.open_workbook(self.file_name)
        # xlrd workbooks are read-only; work on a writable copy instead.
        writable = xl_copy(workbook)
        writable.add_sheet(sheet_name)
        writable.save(self.file_name)

    def get_tables(self):
        """Open the workbook and return the sheet selected by ``sheet_id``.

        :return: the xlrd sheet object
        """
        workbook = xlrd.open_workbook(self.file_name)
        return workbook.sheet_by_index(self.sheet_id)

    def get_nrows(self):
        """Return the number of rows in the sheet."""
        return self.tables.nrows

    def get_ncols(self):
        """Return the number of columns in the sheet."""
        return self.tables.ncols

    @staticmethod
    def _clamp_index(index, size):
        """Clamp ``index`` into ``0 .. size - 1`` (0 when ``size`` is 0)."""
        return max(0, min(index, size - 1))

    def get_data_by_row(self, row):
        """Return the values of one row.

        :param row: zero-based row index; out-of-range values are clamped to
            the first or last row
        :return: list of cell values
        """
        return self.tables.row_values(self._clamp_index(row, self.get_nrows()))

    def get_data_by_col(self, col):
        """Return the values of one column.

        :param col: zero-based column index; out-of-range values are clamped
            to the first or last column
        :return: list of cell values
        """
        return self.tables.col_values(self._clamp_index(col, self.get_ncols()))

    def get_cel_value(self, row, col):
        """Return the value of a single cell.

        :param row: zero-based row index
        :param col: zero-based column index
        :return: the cell value; whole-number floats are returned as ``int``
        """
        data = self.tables.cell_value(row, col)
        # Numeric cells are read back as float. Restore whole numbers to int,
        # but leave fractional values untouched rather than truncating them.
        if isinstance(data, float) and data.is_integer():
            data = int(data)
        return data
94+
95+
96+
if __name__ == "__main__":
    # Manual smoke test: open the default workbook and report its size.
    operation_excel = OperationExcel()
    print(operation_excel.get_nrows(), operation_excel.get_ncols())
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# NOTE(review): the script that reads this file is not shown alongside it;
# the key meanings below are presumed from the key names - confirm.
[default]
2+
# Presumably the path of the ChromeDriver binary used to start the browser.
executable_path = chromedriver.exe
3+
# Presumably the workbook holding the search keywords.
keywords_excel_path = keyword.xls
4+
# Presumably the workbook that receives the scraped Google results.
google_datas = google_data.xls
5+
# Presumably a retry budget for detecting the last result page.
max_test_count = 3
53.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)