From 884803a2dbece6c0bf5452f9c6e69ca6251fd73b Mon Sep 17 00:00:00 2001
From: kunal kumar barman
Date: Fri, 18 Oct 2019 07:04:34 +0530
Subject: [PATCH 01/10] Python program that crawls 3 sites at a time

Takes the search query as a command-line argument, e.g.: python3 project1.py man
---
 project1.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 project1.py

diff --git a/project1.py b/project1.py
new file mode 100644
index 000000000000..9264c74e8263
--- /dev/null
+++ b/project1.py
@@ -0,0 +1,21 @@
+from bs4 import BeautifulSoup
+import requests
+#from urllib.request import urlopen
+import webbrowser
+import sys
+from fake_useragent import UserAgent
+ua = {"UserAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}
+print("Googling.....")
+res=requests.get('https://www.google.com/search?q='+ ' '.join(sys.argv[1:]),headers=ua)
+#res.raise_for_status()
+file=open('project1a.html','wb') #only for knowing the class
+for i in res.iter_content(10000):
+    file.write(i)
+soup= BeautifulSoup(res.text,'lxml')
+linkele=soup.select('.eZt8xd')
+
+num=min(5,len(linkele))
+print(num)
+for i in range(num):
+
+    webbrowser.open('http://google.com' + linkele[i].get('href'))
\ No newline at end of file
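For readers skimming the diff above, a minimal sketch of what the soup.select('.eZt8xd') call is doing (the HTML fragment here is invented for illustration; .eZt8xd is simply the CSS class Google happened to put on result links at the time):

    from bs4 import BeautifulSoup

    # Parse a fragment and select every element carrying the class eZt8xd.
    html = '<div><a class="eZt8xd" href="/url?q=https://example.com">Example</a></div>'
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.select(".eZt8xd"):
        print(link.get("href"))  # prints: /url?q=https://example.com

Each href is relative, which is why the script prepends http://google.com before handing it to webbrowser.open.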
From dfed24aa0b6c97c8fbc0e61d2d25511d7c2da2b0 Mon Sep 17 00:00:00 2001
From: kunal kumar barman
Date: Fri, 18 Oct 2019 21:30:39 +0530
Subject: [PATCH 02/10] Update project1.py

---
 project1.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/project1.py b/project1.py
index 9264c74e8263..45c1c49f9ba4 100644
--- a/project1.py
+++ b/project1.py
@@ -3,7 +3,7 @@
 #from urllib.request import urlopen
 import webbrowser
 import sys
-from fake_useragent import UserAgent
+from fake_useragent import UserAgent #This is needed for intialazing the User agent of the system otherwise it act like bot
 ua = {"UserAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}
 print("Googling.....")
 res=requests.get('https://www.google.com/search?q='+ ' '.join(sys.argv[1:]),headers=ua)
@@ -18,4 +18,4 @@
 print(num)
 for i in range(num):
 
-    webbrowser.open('http://google.com' + linkele[i].get('href'))
\ No newline at end of file
+    webbrowser.open('http://google.com' + linkele[i].get('href'))

From 5c1fbeaff3c0fb327af4ff6c0254ad6610375ad6 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 21:32:33 +0200
Subject: [PATCH 03/10] noqa: F401 and reformat with black

---
 project1.py | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/project1.py b/project1.py
index 45c1c49f9ba4..e7a77b643fe0 100644
--- a/project1.py
+++ b/project1.py
@@ -1,21 +1,28 @@
+import sys
+import webbrowser
+
 from bs4 import BeautifulSoup
 import requests
-#from urllib.request import urlopen
-import webbrowser
-import sys
-from fake_useragent import UserAgent #This is needed for intialazing the User agent of the system otherwise it act like bot
-ua = {"UserAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}
+
+# fake_useragent is needed for so that this script is not viewed as a bot
+from fake_useragent import UserAgent  # noqa: F401
+
+user_agent = {
+    "UserAgent": (
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) "
+        "Gecko/20100101 Firefox/69.0"
+    )
+}
 print("Googling.....")
-res=requests.get('https://www.google.com/search?q='+ ' '.join(sys.argv[1:]),headers=ua)
-#res.raise_for_status()
-file=open('project1a.html','wb') #only for knowing the class
-for i in res.iter_content(10000):
-    file.write(i)
-soup= BeautifulSoup(res.text,'lxml')
-linkele=soup.select('.eZt8xd')
+url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:])
+res = requests.get(url, headers=user_agent)
+# res.raise_for_status()
+with open("project1a.html", "wb") as out_file:  # only for knowing the class
+    for data in res.iter_content(10000):
+        out_file.write(data)
+soup = BeautifulSoup(res.text, "lxml")
+links = list(soup.select(".eZt8xd"))[:5]
 
-num=min(5,len(linkele))
-print(num)
-for i in range(num):
-
-    webbrowser.open('http://google.com' + linkele[i].get('href'))
+print(len(links))
+for link in links:
+    webbrowser.open(f"http://google.com{links.get('href')}")

From 1125c44a971c76db3dd1faa72dcb166bff43e9e0 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 21:38:40 +0200
Subject: [PATCH 04/10] Rename project1.py to
 web_programming/crawl_google_results.py

---
 project1.py => web_programming/crawl_google_results.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename project1.py => web_programming/crawl_google_results.py (100%)

diff --git a/project1.py b/web_programming/crawl_google_results.py
similarity index 100%
rename from project1.py
rename to web_programming/crawl_google_results.py

From 05808a2dc62a5b68e4106462beb52e186256261d Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 21:40:16 +0200
Subject: [PATCH 05/10] Add beautifulsoup4 to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index f5790ad53c30..8305f55d923f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+beautifulsoup4
 black
 flake8
 matplotlib

From 81106a61492b84dbda79c655d28da702d0b258e8 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 21:45:23 +0200
Subject: [PATCH 06/10] Add fake_useragent to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 8305f55d923f..357ce9598ca8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4
 black
+fake_useragent
 flake8
 matplotlib
 mypy

From 4600feab6bdb27558530330624192fe4f4c5a795 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 21:47:09 +0200
Subject: [PATCH 07/10] Update crawl_google_results.py

---
 web_programming/crawl_google_results.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
index e7a77b643fe0..7ef6a3e411f2 100644
--- a/web_programming/crawl_google_results.py
+++ b/web_programming/crawl_google_results.py
@@ -2,20 +2,12 @@
 import webbrowser
 
 from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
 import requests
 
-# fake_useragent is needed for so that this script is not viewed as a bot
-from fake_useragent import UserAgent  # noqa: F401
-
-user_agent = {
-    "UserAgent": (
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) "
-        "Gecko/20100101 Firefox/69.0"
-    )
-}
 print("Googling.....")
 url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:])
-res = requests.get(url, headers=user_agent)
+res = requests.get(url, headers=UserAgent().random)
 # res.raise_for_status()
 with open("project1a.html", "wb") as out_file:  # only for knowing the class
     for data in res.iter_content(10000):

From dd627aeb4a64354a6541691099dff3fedec07717 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 23:14:25 +0200
Subject: [PATCH 08/10] headers={"UserAgent": UserAgent().random}

---
 web_programming/crawl_google_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
index 7ef6a3e411f2..8aefe525789c 100644
--- a/web_programming/crawl_google_results.py
+++ b/web_programming/crawl_google_results.py
@@ -7,7 +7,7 @@
 
 print("Googling.....")
 url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:])
-res = requests.get(url, headers=UserAgent().random)
+res = requests.get(url, headers={"UserAgent": UserAgent().random})
 # res.raise_for_status()
 with open("project1a.html", "wb") as out_file:  # only for knowing the class
     for data in res.iter_content(10000):
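For context on this fix, a minimal sketch of the request as it stands after this patch (assuming the fake_useragent package is installed; requests expects headers to be a dict-like mapping, which is why the bare UserAgent().random string from the previous patch could not work):

    import requests
    from fake_useragent import UserAgent

    # UserAgent().random returns a randomized, realistic browser User-Agent
    # string on each access. The dict key "UserAgent" mirrors the patch;
    # note that the standard HTTP request header name is "User-Agent".
    headers = {"UserAgent": UserAgent().random}
    res = requests.get("https://www.google.com/search?q=algorithms", headers=headers)
    print(res.status_code)

The query "algorithms" here is just a placeholder; the script itself builds the query string from sys.argv.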
From 1b2eb0c368c4863c99c3309fe17b40d3cf38b6dc Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 23:20:58 +0200
Subject: [PATCH 09/10] html.parser, not lxml

---
 web_programming/crawl_google_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
index 8aefe525789c..7e8cb8564b1d 100644
--- a/web_programming/crawl_google_results.py
+++ b/web_programming/crawl_google_results.py
@@ -12,7 +12,7 @@
 with open("project1a.html", "wb") as out_file:  # only for knowing the class
     for data in res.iter_content(10000):
         out_file.write(data)
-soup = BeautifulSoup(res.text, "lxml")
+soup = BeautifulSoup(res.text, "html.parser")
 links = list(soup.select(".eZt8xd"))[:5]
 
 print(len(links))

From 188d1f4df73c63d799382e539cb97658ca660a82 Mon Sep 17 00:00:00 2001
From: Christian Clauss
Date: Fri, 18 Oct 2019 23:25:16 +0200
Subject: [PATCH 10/10] link, not links

---
 web_programming/crawl_google_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py
index 7e8cb8564b1d..c31ec1526d3e 100644
--- a/web_programming/crawl_google_results.py
+++ b/web_programming/crawl_google_results.py
@@ -17,4 +17,4 @@
 
 print(len(links))
 for link in links:
-    webbrowser.open(f"http://google.com{links.get('href')}")
+    webbrowser.open(f"http://google.com{link.get('href')}")
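To try the finished script, a usage sketch (assumes beautifulsoup4, fake_useragent and requests are installed, and that Google still marks result links with the .eZt8xd class):

    python3 web_programming/crawl_google_results.py algorithms

This googles "algorithms", saves the raw results page to project1a.html, prints how many links were found (at most 5), and opens each one in the default browser via webbrowser.open.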