|
| 1 | +# @Author : Libuda |
| 2 | +# @FileName: google_spider.py |
| 3 | +# @Software: PyCharm |
| 4 | +import sys |
| 5 | + |
| 6 | +sys.path.append("./") |
| 7 | +from configparser import ConfigParser |
| 8 | +from selenium import webdriver |
| 9 | +import time |
| 10 | +import xlrd |
| 11 | +import base64 |
| 12 | +from queue import Queue |
| 13 | +from xlutils.copy import copy # 写入Excel |
| 14 | + |
# Load runtime settings from the [default] section of config.cfg, resolved
# relative to the current working directory.
config_parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the settings are
# missing. Fail here with a clear message instead of an opaque
# KeyError: 'default' on the next line.
if not config_parser.read('config.cfg'):
    raise FileNotFoundError(
        "config.cfg was not found or could not be read from the working directory")
config = config_parser['default']
# One shared PhantomJS browser instance, used by Spider for every keyword.
browser = webdriver.PhantomJS(executable_path=config['executable_path'])

# Project-local helper module (importable because "./" was appended to sys.path).
from operationExcel import OperationExcel

# Number of result rows written during this run; updated by Spider.
res_count = 0
| 23 | + |
| 24 | + |
class Spider():
    """Search Bing for every keyword in an Excel sheet and record the results.

    For each keyword the spider submits a search, walks the result pages and
    appends one row per previously unseen site root (``scheme://host``) to the
    output workbook: column 0 holds the result title, column 1 the site root.

    The class relies on the module-level ``config`` mapping, the shared
    ``browser`` WebDriver and the module-level ``res_count`` counter.
    """

    # Bing landing page on which each keyword search is submitted.
    SEARCH_URL = "https://cn.bing.com/?FORM=BEHPTB&ensearch=1"

    # The "next page" link does not sit at a fixed position inside the pager,
    # so these candidate positions are tried in order until one can be clicked.
    NEXT_PAGE_SELECTORS = tuple(
        "#b_results > li.b_pag > nav > ul > li:nth-child({}) > a".format(position)
        for position in (9, 8, 7, 6)
    )

    def __init__(self):
        self.opExcel = OperationExcel(config['keywords_excel_path'], 0)
        self.file_path = config['biying_datas']
        # self.pass_key_excel = OperationExcel(config['pass_key_path'],0)
        self.dataExcel = OperationExcel(self.file_path, 0)
        self.keywords_queue = Queue()
        # Site roots already recorded during this run (used for de-duplication).
        self.res = set()
        # Row index of the next free row in the output sheet; set by main().
        self._next_row = 0

    def get_keywords_data(self, row):
        """Return the keyword stored in column 0 of ``row`` in the keyword sheet.

        :param row: zero-based row index in the keyword workbook
        :return: the cell value at (row, 0)
        """
        # Reuse the workbook wrapper created in __init__ (same path and sheet)
        # instead of re-opening the file for every keyword.
        return self.opExcel.get_cel_value(row, 0)

    def write_to_excel(self, file_path, sheet_id, row, col, value):
        """Write ``str(value)`` into one cell of an existing workbook and save it.

        :param file_path: path of the workbook to modify (overwritten in place)
        :param sheet_id: index of the target sheet as accepted by ``get_sheet``
        :param row: zero-based row index of the cell
        :param col: zero-based column index of the cell
        :param value: value to store; converted with ``str``
        :return: None
        """
        work_book = xlrd.open_workbook(file_path, formatting_info=False)
        # xlrd workbooks are read-only; xlutils.copy produces a writable copy.
        write_to_work = copy(work_book)
        # Sheets obtained via sheet_by_index have no write(); get_sheet's do.
        sheet_data = write_to_work.get_sheet(sheet_id)
        sheet_data.write(row, col, str(value))
        # Saving overwrites the original file; its cell styling is lost.
        write_to_work.save(file_path)

    @staticmethod
    def _site_root(href):
        """Return ``scheme://host`` for an absolute URL, or None if there is no host."""
        if not href:
            return None
        parts = href.split("/")
        # "https://host/path" splits into ["https:", "", "host", "path"].
        if len(parts) < 3 or not parts[2]:
            return None
        return parts[0] + "//" + parts[2]

    def _submit_search(self, key):
        """Open the Bing landing page and submit ``key`` through the search form."""
        browser.get(self.SEARCH_URL)
        browser.find_element_by_css_selector("#sb_form_q").send_keys(key)
        browser.find_element_by_css_selector("#sb_form_go").click()

    def _start_search(self, key):
        """Submit the search for ``key``; return True once a submit went through.

        While the browser still shows the landing page the search is
        re-submitted once per second, up to 20 times. If anything raises, one
        more submit is attempted; if that also fails the keyword is skipped
        instead of aborting the whole run.
        """
        try:
            print("启动中。。。。,如果20s内没有启动 请重新启动本软件")
            self._submit_search(key)
            for attempt in range(20):
                if browser.current_url != self.SEARCH_URL:
                    # The result page is showing; no need to keep polling.
                    break
                print(20 - attempt)
                time.sleep(1)
                print("正在第{}次尝试自动启动。。。。。".format(attempt + 1))
                self._submit_search(key)
        except Exception:
            print("正在尝试自动启动。。。。。")
            try:
                self._submit_search(key)
            except Exception as error:
                print("关键词 {} 启动搜索失败,已跳过:{}".format(key, error))
                return False
        return True

    def _click_next_page(self):
        """Click the first reachable "next page" link; return False if none works."""
        for selector in self.NEXT_PAGE_SELECTORS:
            try:
                browser.find_element_by_css_selector(selector).click()
            except Exception:
                # Element missing or not clickable: try the next position.
                continue
            return True
        return False

    def _write_result(self, row, title_text, site):
        """Write one result (title, site root) into ``row`` of the output sheet."""
        self.write_to_excel(self.file_path, -1, row, 0, title_text)
        self.write_to_excel(self.file_path, -1, row, 1, site)

    def _store_result(self, title_text, site):
        """Persist one result row, retrying once after a 10 second countdown.

        The usual cause of a failed write is the workbook being open in Excel,
        so the user is asked to close it before the single retry. The row
        cursor and ``res_count`` advance only when a write succeeded.

        :return: True if the row was written, False if it was dropped
        """
        global res_count
        try:
            self._write_result(self._next_row, title_text, site)
        except Exception as error:
            print(error, "请关闭Excel 否则10秒后本条数据将不再写入")
            for remaining in range(10, 0, -1):
                print(remaining)
                time.sleep(1)
            try:
                self._write_result(self._next_row, title_text, site)
            except Exception:
                print("已漏掉数据...{} {}".format(title_text, site))
                return False
        print(self._next_row, title_text, site)
        self._next_row += 1
        res_count += 1
        return True

    def _scrape_current_page(self):
        """Record every not-yet-seen site root found on the current result page."""
        titles = browser.find_elements_by_css_selector("#b_results > li > h2")
        links = browser.find_elements_by_css_selector('#b_results > li> h2 > a')
        # Titles and links are paired by position, as two parallel lists.
        for title_element, link in zip(titles, links):
            site = self._site_root(link.get_attribute("href"))
            if site is None or site in self.res:
                continue
            self.res.add(site)
            self._store_result(title_element.text, site)

    def _crawl_keyword(self, index, key, test_count):
        """Walk the result pages of the active search until no next page is left.

        :param index: row index of the keyword (used for progress output only)
        :param key: the keyword being crawled (used for progress output only)
        :param test_count: how many times an already-visited URL is tolerated
            before the crawl for this keyword is considered finished
        """
        visited_urls = set()
        while True:
            try:
                current_url = browser.current_url
                if current_url in visited_urls:
                    # The click did not lead to a new page; this is probably
                    # the last page. Retry a limited number of times.
                    if test_count < 0:
                        print("no next")
                        break
                    print("当前url {} 可能为最后一页,进行第{}次测试".format(current_url, test_count))
                    test_count -= 1
                else:
                    print("当前正在采集第 {} 个关键词:{},采集的页数为 :{} ".format(
                        (index + 1), key, len(visited_urls) + 1))
                    print("当前url", current_url)
                    visited_urls.add(current_url)
                    self._scrape_current_page()
            except Exception as error:
                # A broken page must not stop the crawl: report it and move on.
                print(error)

            if not self._click_next_page():
                print("找不到下一页呢")
                time.sleep(5)
                break

    def main(self):
        """Crawl every keyword row (starting at row 1) and then keep the process alive."""
        max_test_count = int(config['max_test_count'])
        last_count = 0
        # Append below the rows that already exist in the output workbook.
        self._next_row = self.dataExcel.tables.nrows
        print("当前已有url数量:", self._next_row)
        key_len = self.opExcel.get_nrows()
        print("关键词总数:", key_len)
        # tem = 0 if self.pass_key_excel.tables.nrows==0 else self.pass_key_excel.tables.nrows-1
        # print("已爬取关键词个数 :",tem)
        # print("剩余爬取关键词个数:",key_len-tem)
        for index in range(1, key_len):
            key = self.get_keywords_data(index)
            if not self._start_search(key):
                continue

            # Every keyword gets the full retry budget from the configuration.
            self._crawl_keyword(index, key, max_test_count)

            # self.write_to_excel(config['pass_key_path'],0,tem,0,key)
            # self.write_to_excel(config['pass_key_path'],0,tem,1,res_count-last_count)
            print("当前关键词 :{} 爬取完毕 已爬取数据 :{}".format(key, res_count - last_count))
            print("本次采集已获取url总数为:", str(res_count))
            last_count = res_count
        print("关键词搜索完毕,谢谢使用!")
        # Block forever so the process does not exit after the final message;
        # sleeping avoids the 100% CPU load of an empty busy loop.
        while True:
            time.sleep(1)
| 206 | + |
| 207 | + |
if __name__ == "__main__":
    # Spider.main() ends in an endless wait loop, so control only leaves it
    # through an exception such as Ctrl+C. Always shut the shared browser down
    # on the way out so the PhantomJS process started at import time is not
    # left running.
    try:
        spider = Spider()
        spider.main()
    finally:
        browser.quit()
0 commit comments