46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
|
import requests
|
||
|
import urllib
|
||
|
import pandas as pd
|
||
|
from requests_html import HTML
|
||
|
from requests_html import HTMLSession
|
||
|
|
||
|
def get_source(url):
|
||
|
"""Return the source code for the provided URL.
|
||
|
|
||
|
Args:
|
||
|
url (string): URL of the page to scrape.
|
||
|
|
||
|
Returns:
|
||
|
response (object): HTTP response object from requests_html.
|
||
|
"""
|
||
|
|
||
|
try:
|
||
|
session = HTMLSession()
|
||
|
response = session.get(url)
|
||
|
return response
|
||
|
|
||
|
except requests.exceptions.RequestException as e:
|
||
|
print(e)
|
||
|
|
||
|
def scrape_google(query):
|
||
|
|
||
|
query = urllib.parse.quote_plus(query)
|
||
|
response = get_source("https://www.google.co.uk/search?q=" + query)
|
||
|
|
||
|
links = list(response.html.absolute_links)
|
||
|
google_domains = ('https://www.google.',
|
||
|
'https://google.',
|
||
|
'https://webcache.googleusercontent.',
|
||
|
'http://webcache.googleusercontent.',
|
||
|
'https://policies.google.',
|
||
|
'https://support.google.',
|
||
|
'https://maps.google.')
|
||
|
|
||
|
for url in links[:]:
|
||
|
if url.startswith(google_domains):
|
||
|
links.remove(url)
|
||
|
|
||
|
return links
|
||
|
|
||
|
scrape_google('inurl:.si')
|