Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0bbf7b0

Browse files
committed
记忆功能 过滤功能
1 parent ae96785 commit 0bbf7b0

File tree

1 file changed

+170
-43
lines changed

1 file changed

+170
-43
lines changed

搜索引擎爬虫/必应爬虫/new_biying_spider.py renamed to 搜索引擎爬虫/必应爬虫/biying_spider.py

Lines changed: 170 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,122 @@
88
from selenium import webdriver
99
import time
1010
import xlrd
11+
from xlutils.copy import copy as xl_copy
1112
import base64
1213
from queue import Queue
13-
from xlutils.copy import copy # 写入Excel
1414

1515
# Load runtime settings from 'config.cfg' (the path is relative to the current
# working directory); the settings are read from its [default] section.
config_parser = ConfigParser()
1616
config_parser.read('config.cfg')
1717
config = config_parser['default']
18+
1819
# A single browser instance is created at import time and shared module-wide.
# NOTE(review): PhantomJS support appears to be deprecated in newer Selenium
# releases -- confirm before upgrading the dependency.
browser = webdriver.PhantomJS(executable_path=config['executable_path'])
1920

20-
from operationExcel import OperationExcel
2121

2222
res_count = 0
2323

2424

25+
def logger(msg):
    """Print a log message to standard output.

    :param msg: Any object; it is converted with ``str()`` before printing.
    """
    # str() rather than "%s" % (msg): the parenthesised form is NOT a tuple,
    # so a tuple argument was consumed as the format arguments and raised
    # TypeError ("not all arguments converted").
    print(str(msg))
31+
32+
33+
class OperationExcel():
    """Thin object-oriented wrapper around one sheet of an ``.xls`` workbook.

    The sheet is opened once in ``__init__`` and cached in ``self.tables``;
    the read helpers below all operate on that cached sheet.
    """

    # Fallback workbook used when no file name is supplied.
    DEFAULT_FILE_NAME = r"C:\Users\lenovo\PycharmProjects\Spider\biying_data.xls"

    def __init__(self, file_name=None, sheet_id=None):
        """Open ``file_name`` and cache the sheet at index ``sheet_id``.

        :param file_name: Path of the workbook. When falsy, the default
            workbook is used and ``sheet_id`` is ignored (sheet 0 is opened).
        :param sheet_id: Zero-based sheet index; ``None`` means sheet 0.
        """
        if file_name:
            self.file_name = file_name
            # A missing sheet index would be handed to sheet_by_index() as
            # None; fall back to the first sheet instead.
            self.sheet_id = 0 if sheet_id is None else sheet_id
        else:
            self.file_name = self.DEFAULT_FILE_NAME
            self.sheet_id = 0
        self.tables = self.get_tables()

    def create_sheet(self, sheet_name):
        """Add a sheet named ``sheet_name`` to the workbook and save it in place.

        :param sheet_name: Name of the sheet to add.
        """
        # xlrd workbooks are read-only, so a writable copy is made first.
        workbook = xlrd.open_workbook(self.file_name)
        writable = xl_copy(workbook)
        writable.add_sheet(sheet_name)
        writable.save(self.file_name)

    def get_tables(self):
        """Open the workbook and return the sheet at ``self.sheet_id``.

        :return: The sheet object returned by ``sheet_by_index``.
        """
        workbook = xlrd.open_workbook(self.file_name)
        return workbook.sheet_by_index(self.sheet_id)

    def get_nrows(self):
        """Return the number of rows in the cached sheet."""
        return self.tables.nrows

    def get_ncols(self):
        """Return the number of columns in the cached sheet."""
        return self.tables.ncols

    @staticmethod
    def _clamp_index(index, size):
        """Clamp ``index`` into the valid range ``0 .. size - 1``.

        :raises IndexError: If ``size`` is not positive (nothing to index).
        """
        if size <= 0:
            raise IndexError("sheet is empty")
        return min(max(index, 0), size - 1)

    def get_data_by_row(self, row):
        """Return the values of one row, clamping ``row`` into range.

        :param row: Zero-based row index; values below 0 select the first
            row and values past the end select the last row.
        :return: List of cell values for the selected row.
        :raises IndexError: If the sheet has no rows.
        """
        # The last valid index is nrows - 1; clamping to nrows (as before)
        # still pointed one past the end.
        row = self._clamp_index(row, self.get_nrows())
        return self.tables.row_values(row)

    def get_data_by_col(self, col):
        """Return the values of one column, clamping ``col`` into range.

        :param col: Zero-based column index; values below 0 select the first
            column and values past the end select the last column.
        :return: List of cell values for the selected column.
        :raises IndexError: If the sheet has no columns.
        """
        col = self._clamp_index(col, self.get_ncols())
        return self.tables.col_values(col)

    def get_cel_value(self, row, col):
        """Return the value of the cell at (``row``, ``col``).

        Numeric cells are read as floats; whole-number floats are converted
        back to ``int``. Fractional values are returned unchanged so that no
        data is lost.

        :param row: Zero-based row index.
        :param col: Zero-based column index.
        :return: The cell value.
        """
        data = self.tables.cell_value(row, col)
        if isinstance(data, float) and data.is_integer():
            data = int(data)
        return data
120+
25121
class Spider():
26122
def __init__(self):
27123
self.opExcel = OperationExcel(config['keywords_excel_path'], 0)
28124
self.file_path = config['biying_datas']
29-
# self.pass_key_excel = OperationExcel(config['pass_key_path'],0)
125+
self.title_fillter = config['title_fillter'].split(",")
126+
self.url_fillter = config['url_fillter'].split(",")
30127
self.dataExcel = OperationExcel(self.file_path, 0)
31128
self.keywords_queue = Queue()
32129
self.res = set()
@@ -51,7 +148,7 @@ def write_to_excel(self, file_path, sheet_id, row, col, value):
51148
"""
52149
work_book = xlrd.open_workbook(file_path, formatting_info=False)
53150
# 先通过xlutils.copy下copy复制Excel
54-
write_to_work = copy(work_book)
151+
write_to_work = xl_copy(work_book)
55152
# 通过sheet_by_index没有write方法 而get_sheet有write方法
56153
sheet_data = write_to_work.get_sheet(sheet_id)
57154
sheet_data.write(row, col, str(value))
@@ -60,21 +157,21 @@ def write_to_excel(self, file_path, sheet_id, row, col, value):
60157

61158
def main(self):
62159
global res_count
63-
test_count = int(config['max_test_count'])
160+
start_index = int(config['start_index'])
64161
last_count = 0
65162
count = self.dataExcel.tables.nrows
66-
print("当前已有url数量:", count)
163+
logger("当前已有url数量:{}".format(count))
67164
key_len = self.opExcel.get_nrows()
68-
print("关键词总数:", key_len)
165+
logger("关键词总数:{}".format(key_len))
69166
# tem = 0 if self.pass_key_excel.tables.nrows==0 else self.pass_key_excel.tables.nrows-1
70-
# print("已爬取关键词个数 :",tem)
71-
# print("剩余爬取关键词个数:",key_len-tem)
72-
for index in range(1, key_len):
73-
167+
# logger("已爬取关键词个数 :",tem)
168+
# logger("剩余爬取关键词个数:",key_len-tem)
169+
for index in range(start_index, key_len):
170+
test_count = int(config['max_test_count'])
74171
key = self.get_keywords_data(index)
75172

76173
try:
77-
print("启动中。。。。如果20s内没有启动 请重新启动本软件")
174+
logger("启动中。。。。如果20s内没有启动 请重新启动本软件")
78175
browser.get("https://cn.bing.com/?FORM=BEHPTB&ensearch=1")
79176
browser.find_element_by_css_selector("#sb_form_q").send_keys(key)
80177
browser.find_element_by_css_selector("#sb_form_go").click()
@@ -83,15 +180,15 @@ def main(self):
83180
if browser.current_url != "https://cn.bing.com/?FORM=BEHPTB&ensearch=1":
84181
continue
85182
else:
86-
print(20 - i)
183+
logger(20 - i)
87184
time.sleep(1)
88-
print("正在第{}次尝试自动启动。。。。。".format(i + 1))
185+
logger("正在第{}次尝试自动启动。。。。。".format(i + 1))
89186
browser.get("https://cn.bing.com/?FORM=BEHPTB&ensearch=1")
90187
browser.find_element_by_css_selector("#sb_form_q").send_keys(key)
91188
browser.find_element_by_css_selector("#sb_form_go").click()
92189
except Exception as e:
93-
# print(e)
94-
print("正在尝试自动启动。。。。。")
190+
# logger(e)
191+
logger("正在尝试自动启动。。。。。")
95192
browser.get("https://cn.bing.com/?FORM=BEHPTB&ensearch=1")
96193
browser.find_element_by_css_selector("#sb_form_q").send_keys(key)
97194
browser.find_element_by_css_selector("#sb_form_go").click()
@@ -102,70 +199,80 @@ def main(self):
102199
try:
103200
if browser.current_url in current_url_set:
104201
if test_count < 0:
105-
print("no next")
202+
logger("no next")
106203
flag = False
107204
else:
108-
print("当前url {} 可能为最后一页,进行第{}次测试".format(browser.current_url, test_count))
205+
logger("当前url {} 可能为最后一页,进行第{}次测试".format(browser.current_url, test_count))
109206
test_count -= 1
110207
else:
111-
print("当前正在采集第 {} 个关键词:{},采集的页数为 :{} ".format((index + 1), key, len(current_url_set) + 1))
112-
print("当前url", browser.current_url)
208+
logger("当前正在采集第 {} 个关键词:{},采集的页数为 :{} ".format((index + 1), key, len(current_url_set) + 1))
209+
logger("当前url:{}".format(browser.current_url))
113210
current_url_set.add(browser.current_url)
114211

115-
title = browser.find_elements_by_css_selector("#b_results > li > h2")
116-
url = browser.find_elements_by_css_selector('#b_results > li> h2 > a')
212+
title = browser.find_elements_by_css_selector("#b_results > li > h2 ")
213+
url = browser.find_elements_by_css_selector('#b_results > li > h2 > a')
214+
117215
for i in range(len(url)):
118216

119217
s = url[i].get_attribute("href").split("/")
120218
try:
121219
tmp = s[0] + "//" + s[2]
122220
except Exception as e:
123-
# print(e)
221+
# logger(e)
124222
tmp = s[0] + "//" + s[2]
125-
if tmp not in self.res:
223+
224+
pass_flag = False
225+
for one in self.url_fillter:
226+
if one in tmp:
227+
pass_flag = True
228+
for one in self.title_fillter:
229+
if one in title[i].text:
230+
pass_flag = True
231+
232+
if not pass_flag and tmp not in self.res:
126233
self.res.add(tmp)
127234
try:
128235
self.write_to_excel(self.file_path, -1, count, 0, title[i].text)
129236
self.write_to_excel(self.file_path, -1, count, 1, tmp)
130-
print(count, title[i].text, tmp)
237+
logger("{},{},{}".format(count, title[i].text, tmp))
131238
count += 1
132239
res_count += 1
133240
except Exception as e:
134-
print(e, "请关闭Excel 否则10秒后本条数据将不再写入")
241+
logger("请关闭Excel 否则10秒后本条数据将不再写入:{}".format(e))
135242
for i in range(10):
136-
print(10 - i)
243+
logger(10 - i)
137244
time.sleep(1)
138245
try:
139246
self.write_to_excel(self.file_path, -1, count, 0, title[i].text)
140247
self.write_to_excel(self.file_path, -1, count, 1, tmp)
141-
print(count, title[i].text, tmp, browser.current_url)
248+
logger("{},{},{},{}".format(count, title[i].text, tmp, browser.current_url))
142249
except Exception:
143-
print("已漏掉数据...{} {}".format(title[i].text, tmp))
250+
logger("已漏掉数据...{} {}".format(title[i].text, tmp))
144251

145252
try:
146253
next_paget = browser.find_element_by_css_selector(
147254
"#b_results > li.b_pag > nav > ul > li:nth-child(9) > a")
148255
next_paget.click()
149256
except Exception as e:
150-
# print(e)
257+
# logger(e)
151258
try:
152259
next_paget = browser.find_element_by_css_selector(
153260
"#b_results > li.b_pag > nav > ul > li:nth-child(8) > a")
154261
next_paget.click()
155262
except Exception as e:
156-
# print(e)
263+
# logger(e)
157264
next_paget = browser.find_element_by_css_selector(
158265
"#b_results > li.b_pag > nav > ul > li:nth-child(7) > a")
159266
next_paget.click()
160267
except Exception as e:
161-
# print(e)
268+
# logger(e)
162269
try:
163270
try:
164271
next_paget = browser.find_element_by_css_selector(
165272
"#b_results > li.b_pag > nav > ul > li:nth-child(9) > a")
166273
next_paget.click()
167274
except Exception as e:
168-
# print(e)
275+
# logger(e)
169276
try:
170277
next_paget = browser.find_element_by_css_selector(
171278
"#b_results > li.b_pag > nav > ul > li:nth-child(8) > a")
@@ -176,35 +283,55 @@ def main(self):
176283
"#b_results > li.b_pag > nav > ul > li:nth-child(7) > a")
177284
next_paget.click()
178285
except Exception as e:
179-
# print(e)
286+
# logger(e)
180287
try:
181288
next_paget = browser.find_element_by_css_selector(
182289
"#b_results > li.b_pag > nav > ul > li:nth-child(6) > a")
183290
next_paget.click()
184291
except Exception as e:
185-
print("找不到下一页呢")
292+
logger("找不到下一页呢")
186293
time.sleep(5)
187294
flag = False
188295
except Exception as e:
189-
print(e)
190-
print("可能是最后一页了呢 当前url为{}".format(browser.current_url))
296+
logger(e)
297+
logger("可能是最后一页了呢 当前url为{}".format(browser.current_url))
191298
time.sleep(5)
192299
flag = False
193300

194301
try:
195302
# self.write_to_excel(config['pass_key_path'],0,tem,0,key)
196303
# self.write_to_excel(config['pass_key_path'],0,tem,1,res_count-last_count)
197-
print("当前关键词 :{} 爬取完毕 已爬取数据 :{}".format(key, res_count - last_count))
304+
logger("当前关键词 :{} 爬取完毕 已爬取数据 :{}".format(key, res_count - last_count))
198305
except Exception as e:
199-
print(e)
306+
logger(e)
200307

201-
print("本次采集已获取url总数为:", str(res_count))
308+
logger("本次采集已获取url总数为:{}".format(str(res_count)))
202309
last_count = res_count
203-
print("关键词搜索完毕,谢谢使用!")
310+
start_index += 1
311+
config_parser.set("default", "start_index", str(start_index))
312+
config_parser.write(open("config.cfg", 'w'))
313+
314+
logger("关键词搜索完毕,谢谢使用!")
204315
while 1:
205316
pass
206317

207318

208319
if __name__ == "__main__":
    # The registration code in the config is a base64-encoded expiry
    # timestamp in "YYYY-MM-DD HH:MM:SS" form; time.mktime interprets it as
    # local time.
    try:
        code = config['code']
        expiry_text = str(base64.b64decode(code), "utf-8")
        expiry_struct = time.strptime(expiry_text, "%Y-%m-%d %H:%M:%S")
        expiry_stamp = int(time.mktime(expiry_struct))
    except (KeyError, ValueError, OverflowError):
        # A missing or malformed code is treated the same as an expired one.
        logger("您的使用权限已过期")
        time.sleep(10)
    else:
        if int(time.time()) > expiry_stamp:
            logger("您的注册码已过期")
            # Keep the console window open long enough to read the message.
            time.sleep(10)
        else:
            logger("欢迎使用 国外搜索系统")
            logger("软件将于 '{}' 过期 ".format(expiry_text))
            # Crawler failures are reported separately; previously they were
            # caught by the licence handler and shown as "expired".
            try:
                spider = Spider()
                spider.main()
            except Exception as e:
                logger("程序运行出错:{}".format(e))
                time.sleep(10)

0 commit comments

Comments
 (0)