8
8
from selenium import webdriver
9
9
import time
10
10
import xlrd
11
+ from xlutils .copy import copy as xl_copy
11
12
import base64
12
13
from queue import Queue
13
- from xlutils .copy import copy # 写入Excel
14
14
15
15
# Load the crawler settings from 'config.cfg' (a relative path, so it is
# resolved against the current working directory) and keep the [default]
# section as the module-wide settings mapping.
config_parser = ConfigParser ()
config_parser .read ('config.cfg' )
config = config_parser ['default' ]

# One module-level PhantomJS driver, created as soon as this module is loaded;
# the path of the PhantomJS binary is taken from the 'executable_path' setting.
browser = webdriver .PhantomJS (executable_path = config ['executable_path' ])
19
20
20
- from operationExcel import OperationExcel
21
21
22
22
# Running count of results recorded during this run; Spider.main declares it
# `global` and increments it after each successful write to the data workbook.
res_count = 0
23
23
24
24
25
def logger(msg):
    """
    Print a log message to standard output.

    :param msg: any object; it is rendered with ``%s`` formatting, i.e. via
        ``str(msg)``.
    """
    # Wrap msg in a one-element tuple: a bare ``"%s" % msg`` would unpack a
    # tuple argument as several format arguments and raise TypeError.
    print("%s" % (msg,))
31
+
32
+
33
class OperationExcel():
    """
    Object-oriented helper for reading an Excel workbook through xlrd.

    The sheet selected at construction time is loaded once and cached in
    ``self.tables``; every read accessor works on that cached sheet.
    """

    def __init__(self, file_name=None, sheet_id=None):
        """
        Initialise the helper and load the selected sheet.

        :param file_name: path of the workbook; when falsy, the built-in
            default path is used together with sheet 0.
        :param sheet_id: zero-based index of the sheet to read; ``None``
            selects the first sheet.
        """
        if file_name:
            self.file_name = file_name
            # A missing sheet index cannot be used to look up a sheet, so
            # fall back to the first one instead of failing in get_tables().
            self.sheet_id = 0 if sheet_id is None else sheet_id
        else:
            self.file_name = r"C:\Users\lenovo\PycharmProjects\Spider\biying_data.xls"
            self.sheet_id = 0
        self.tables = self.get_tables()

    def create_sheet(self, sheet_name):
        """
        Add a sheet called ``sheet_name`` and save the workbook in place.

        The cached ``self.tables`` is not reloaded by this method.

        :param sheet_name: name of the sheet to add.
        """
        workbook = xlrd.open_workbook(self.file_name)
        # xlrd workbooks are read-only; xlutils' copy yields a writable one.
        writable = xl_copy(workbook)
        writable.add_sheet(sheet_name)
        writable.save(self.file_name)

    def get_tables(self):
        """
        Open the workbook and return the sheet at ``self.sheet_id``.

        :return: the xlrd sheet object.
        """
        workbook = xlrd.open_workbook(self.file_name)
        return workbook.sheet_by_index(self.sheet_id)

    def get_nrows(self):
        """
        Return the number of rows in the cached sheet.

        :return: row count.
        """
        return self.tables.nrows

    def get_ncols(self):
        """
        Return the number of columns in the cached sheet.

        :return: column count.
        """
        return self.tables.ncols

    def get_data_by_row(self, row):
        """
        Return the values of one row, clamping ``row`` into the valid range.

        :param row: zero-based row index; values below 0 select the first
            row and values past the end select the last row.
        :return: list of the cell values in that row.
        """
        # Valid indices are 0 .. nrows - 1, so the upper bound is nrows - 1.
        # For an empty sheet the index stays 0 and the lookup itself fails.
        row = max(0, min(row, self.get_nrows() - 1))
        return self.tables.row_values(row)

    def get_data_by_col(self, col):
        """
        Return the values of one column, clamping ``col`` into the valid range.

        :param col: zero-based column index; values below 0 select the first
            column and values past the end select the last column.
        :return: list of the cell values in that column.
        """
        # Valid indices are 0 .. ncols - 1, so the upper bound is ncols - 1.
        col = max(0, min(col, self.get_ncols() - 1))
        return self.tables.col_values(col)

    def get_cel_value(self, row, col):
        """
        Return the content of a single cell.

        :param row: zero-based row index.
        :param col: zero-based column index.
        :return: the cell value; whole-number floats are returned as ``int``.
        """
        data = self.tables.cell_value(row, col)

        # Numbers come back from the sheet as floats. Convert only values
        # that are whole numbers so fractional values such as 2.5 are not
        # silently truncated.
        if isinstance(data, float) and data.is_integer():
            data = int(data)
        return data
120
+
25
121
class Spider ():
26
122
def __init__ (self ):
27
123
self .opExcel = OperationExcel (config ['keywords_excel_path' ], 0 )
28
124
self .file_path = config ['biying_datas' ]
29
- # self.pass_key_excel = OperationExcel(config['pass_key_path'],0)
125
+ self .title_fillter = config ['title_fillter' ].split ("," )
126
+ self .url_fillter = config ['url_fillter' ].split ("," )
30
127
self .dataExcel = OperationExcel (self .file_path , 0 )
31
128
self .keywords_queue = Queue ()
32
129
self .res = set ()
@@ -51,7 +148,7 @@ def write_to_excel(self, file_path, sheet_id, row, col, value):
51
148
"""
52
149
work_book = xlrd .open_workbook (file_path , formatting_info = False )
53
150
# 先通过xlutils.copy下copy复制Excel
54
- write_to_work = copy (work_book )
151
+ write_to_work = xl_copy (work_book )
55
152
# 通过sheet_by_index没有write方法 而get_sheet有write方法
56
153
sheet_data = write_to_work .get_sheet (sheet_id )
57
154
sheet_data .write (row , col , str (value ))
@@ -60,21 +157,21 @@ def write_to_excel(self, file_path, sheet_id, row, col, value):
60
157
61
158
def main (self ):
62
159
global res_count
63
- test_count = int (config ['max_test_count ' ])
160
+ start_index = int (config ['start_index ' ])
64
161
last_count = 0
65
162
count = self .dataExcel .tables .nrows
66
- print ("当前已有url数量:" , count )
163
+ logger ("当前已有url数量:{}" . format ( count ) )
67
164
key_len = self .opExcel .get_nrows ()
68
- print ("关键词总数:" , key_len )
165
+ logger ("关键词总数:{}" . format ( key_len ) )
69
166
# tem = 0 if self.pass_key_excel.tables.nrows==0 else self.pass_key_excel.tables.nrows-1
70
- # print ("已爬取关键词个数 :",tem)
71
- # print ("剩余爬取关键词个数:",key_len-tem)
72
- for index in range (1 , key_len ):
73
-
167
+ # logger ("已爬取关键词个数 :",tem)
168
+ # logger ("剩余爬取关键词个数:",key_len-tem)
169
+ for index in range (start_index , key_len ):
170
+ test_count = int ( config [ 'max_test_count' ])
74
171
key = self .get_keywords_data (index )
75
172
76
173
try :
77
- print ("启动中。。。。, 如果20s内没有启动 请重新启动本软件" )
174
+ logger ("启动中。。。。如果20s内没有启动 请重新启动本软件" )
78
175
browser .get ("https://cn.bing.com/?FORM=BEHPTB&ensearch=1" )
79
176
browser .find_element_by_css_selector ("#sb_form_q" ).send_keys (key )
80
177
browser .find_element_by_css_selector ("#sb_form_go" ).click ()
@@ -83,15 +180,15 @@ def main(self):
83
180
if browser .current_url != "https://cn.bing.com/?FORM=BEHPTB&ensearch=1" :
84
181
continue
85
182
else :
86
- print (20 - i )
183
+ logger (20 - i )
87
184
time .sleep (1 )
88
- print ("正在第{}次尝试自动启动。。。。。" .format (i + 1 ))
185
+ logger ("正在第{}次尝试自动启动。。。。。" .format (i + 1 ))
89
186
browser .get ("https://cn.bing.com/?FORM=BEHPTB&ensearch=1" )
90
187
browser .find_element_by_css_selector ("#sb_form_q" ).send_keys (key )
91
188
browser .find_element_by_css_selector ("#sb_form_go" ).click ()
92
189
except Exception as e :
93
- # print (e)
94
- print ("正在尝试自动启动。。。。。" )
190
+ # logger (e)
191
+ logger ("正在尝试自动启动。。。。。" )
95
192
browser .get ("https://cn.bing.com/?FORM=BEHPTB&ensearch=1" )
96
193
browser .find_element_by_css_selector ("#sb_form_q" ).send_keys (key )
97
194
browser .find_element_by_css_selector ("#sb_form_go" ).click ()
@@ -102,70 +199,80 @@ def main(self):
102
199
try :
103
200
if browser .current_url in current_url_set :
104
201
if test_count < 0 :
105
- print ("no next" )
202
+ logger ("no next" )
106
203
flag = False
107
204
else :
108
- print ("当前url {} 可能为最后一页,进行第{}次测试" .format (browser .current_url , test_count ))
205
+ logger ("当前url {} 可能为最后一页,进行第{}次测试" .format (browser .current_url , test_count ))
109
206
test_count -= 1
110
207
else :
111
- print ("当前正在采集第 {} 个关键词:{},采集的页数为 :{} " .format ((index + 1 ), key , len (current_url_set ) + 1 ))
112
- print ("当前url" , browser .current_url )
208
+ logger ("当前正在采集第 {} 个关键词:{},采集的页数为 :{} " .format ((index + 1 ), key , len (current_url_set ) + 1 ))
209
+ logger ("当前url:{}" . format ( browser .current_url ) )
113
210
current_url_set .add (browser .current_url )
114
211
115
- title = browser .find_elements_by_css_selector ("#b_results > li > h2" )
116
- url = browser .find_elements_by_css_selector ('#b_results > li> h2 > a' )
212
+ title = browser .find_elements_by_css_selector ("#b_results > li > h2 " )
213
+ url = browser .find_elements_by_css_selector ('#b_results > li > h2 > a' )
214
+
117
215
for i in range (len (url )):
118
216
119
217
s = url [i ].get_attribute ("href" ).split ("/" )
120
218
try :
121
219
tmp = s [0 ] + "//" + s [2 ]
122
220
except Exception as e :
123
- # print (e)
221
+ # logger (e)
124
222
tmp = s [0 ] + "//" + s [2 ]
125
- if tmp not in self .res :
223
+
224
+ pass_flag = False
225
+ for one in self .url_fillter :
226
+ if one in tmp :
227
+ pass_flag = True
228
+ for one in self .title_fillter :
229
+ if one in title [i ].text :
230
+ pass_flag = True
231
+
232
+ if not pass_flag and tmp not in self .res :
126
233
self .res .add (tmp )
127
234
try :
128
235
self .write_to_excel (self .file_path , - 1 , count , 0 , title [i ].text )
129
236
self .write_to_excel (self .file_path , - 1 , count , 1 , tmp )
130
- print ( count , title [i ].text , tmp )
237
+ logger ( "{},{},{}" . format ( count , title [i ].text , tmp ) )
131
238
count += 1
132
239
res_count += 1
133
240
except Exception as e :
134
- print ( e , "请关闭Excel 否则10秒后本条数据将不再写入" )
241
+ logger ( "请关闭Excel 否则10秒后本条数据将不再写入:{}" . format ( e ) )
135
242
for i in range (10 ):
136
- print (10 - i )
243
+ logger (10 - i )
137
244
time .sleep (1 )
138
245
try :
139
246
self .write_to_excel (self .file_path , - 1 , count , 0 , title [i ].text )
140
247
self .write_to_excel (self .file_path , - 1 , count , 1 , tmp )
141
- print ( count , title [i ].text , tmp , browser .current_url )
248
+ logger ( "{},{},{},{}" . format ( count , title [i ].text , tmp , browser .current_url ) )
142
249
except Exception :
143
- print ("已漏掉数据...{} {}" .format (title [i ].text , tmp ))
250
+ logger ("已漏掉数据...{} {}" .format (title [i ].text , tmp ))
144
251
145
252
try :
146
253
next_paget = browser .find_element_by_css_selector (
147
254
"#b_results > li.b_pag > nav > ul > li:nth-child(9) > a" )
148
255
next_paget .click ()
149
256
except Exception as e :
150
- # print (e)
257
+ # logger (e)
151
258
try :
152
259
next_paget = browser .find_element_by_css_selector (
153
260
"#b_results > li.b_pag > nav > ul > li:nth-child(8) > a" )
154
261
next_paget .click ()
155
262
except Exception as e :
156
- # print (e)
263
+ # logger (e)
157
264
next_paget = browser .find_element_by_css_selector (
158
265
"#b_results > li.b_pag > nav > ul > li:nth-child(7) > a" )
159
266
next_paget .click ()
160
267
except Exception as e :
161
- # print (e)
268
+ # logger (e)
162
269
try :
163
270
try :
164
271
next_paget = browser .find_element_by_css_selector (
165
272
"#b_results > li.b_pag > nav > ul > li:nth-child(9) > a" )
166
273
next_paget .click ()
167
274
except Exception as e :
168
- # print (e)
275
+ # logger (e)
169
276
try :
170
277
next_paget = browser .find_element_by_css_selector (
171
278
"#b_results > li.b_pag > nav > ul > li:nth-child(8) > a" )
@@ -176,35 +283,55 @@ def main(self):
176
283
"#b_results > li.b_pag > nav > ul > li:nth-child(7) > a" )
177
284
next_paget .click ()
178
285
except Exception as e :
179
- # print (e)
286
+ # logger (e)
180
287
try :
181
288
next_paget = browser .find_element_by_css_selector (
182
289
"#b_results > li.b_pag > nav > ul > li:nth-child(6) > a" )
183
290
next_paget .click ()
184
291
except Exception as e :
185
- print ("找不到下一页呢" )
292
+ logger ("找不到下一页呢" )
186
293
time .sleep (5 )
187
294
flag = False
188
295
except Exception as e :
189
- print (e )
190
- print ("可能是最后一页了呢 当前url为{}" .format (browser .current_url ))
296
+ logger (e )
297
+ logger ("可能是最后一页了呢 当前url为{}" .format (browser .current_url ))
191
298
time .sleep (5 )
192
299
flag = False
193
300
194
301
try :
195
302
# self.write_to_excel(config['pass_key_path'],0,tem,0,key)
196
303
# self.write_to_excel(config['pass_key_path'],0,tem,1,res_count-last_count)
197
- print ("当前关键词 :{} 爬取完毕 已爬取数据 :{}" .format (key , res_count - last_count ))
304
+ logger ("当前关键词 :{} 爬取完毕 已爬取数据 :{}" .format (key , res_count - last_count ))
198
305
except Exception as e :
199
- print (e )
306
+ logger (e )
200
307
201
- print ("本次采集已获取url总数为:" , str (res_count ))
308
+ logger ("本次采集已获取url总数为:{}" . format ( str (res_count ) ))
202
309
last_count = res_count
203
- print ("关键词搜索完毕,谢谢使用!" )
310
+ start_index += 1
311
+ config_parser .set ("default" , "start_index" , str (start_index ))
312
+ config_parser .write (open ("config.cfg" , 'w' ))
313
+
314
+ logger ("关键词搜索完毕,谢谢使用!" )
204
315
while 1 :
205
316
pass
206
317
207
318
208
319
if __name__ == "__main__":
    # The 'code' setting is a base64-encoded expiry moment written as
    # "%Y-%m-%d %H:%M:%S"; the spider is only started before that moment.
    try:
        code = config['code']
        now_time = int(time.time())
        expiry_text = str(base64.b64decode(code), "utf-8")
        expiry_struct = time.strptime(expiry_text, "%Y-%m-%d %H:%M:%S")
        expiry_stamp = int(time.mktime(expiry_struct))  # epoch seconds
    except Exception:
        # A missing or undecodable registration code counts as no licence.
        logger("您的使用权限已过期")
        time.sleep(10)
    else:
        if now_time > expiry_stamp:
            logger("您的注册码已过期")
            time.sleep(10)
        else:
            logger("欢迎使用 国外搜索系统")
            logger("软件将于 '{}' 过期 ".format(expiry_text))
            try:
                spider = Spider()
                spider.main()
            except Exception as e:
                # Report the real failure; previously any crawler error was
                # shown as an expired-licence message and the cause was lost.
                logger("程序运行出错:{}".format(e))
                time.sleep(10)
0 commit comments