46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
import requests
|
|
import urllib
|
|
import pandas as pd
|
|
from requests_html import HTML
|
|
from requests_html import HTMLSession
|
|
|
|
def get_source(url):
    """Fetch a page over HTTP through a requests_html session.

    Args:
        url (string): Address of the page to download.

    Returns:
        response (object): Whatever ``HTMLSession().get(url)`` returns.
        If a ``requests`` RequestException is raised, the error is printed
        and ``None`` is returned instead.
    """
    try:
        html_session = HTMLSession()
        page = html_session.get(url)
    except requests.exceptions.RequestException as err:
        # Best-effort fetch: report the failure and hand back None.
        print(err)
        return None
    return page
|
|
|
|
def scrape_google(query):
    """Search google.co.uk for ``query`` and return the non-Google links.

    Args:
        query (string): Raw search terms. They are URL-encoded here before
            being appended to the search URL.

    Returns:
        list: Absolute URLs found on the results page that do not start with
        any of the Google-owned prefixes listed below. An empty list is
        returned when the results page could not be fetched.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote_plus

    # URL prefixes that identify Google's own pages rather than results.
    google_domains = (
        'https://www.google.',
        'https://google.',
        'https://webcache.googleusercontent.',
        'http://webcache.googleusercontent.',
        'https://policies.google.',
        'https://support.google.',
        'https://maps.google.',
    )

    response = get_source(
        "https://www.google.co.uk/search?q=" + quote_plus(query)
    )
    if response is None:
        # get_source() prints the error and returns None when the request
        # fails; without this guard ``response.html`` raises AttributeError.
        return []

    # str.startswith accepts a tuple, so one call checks every prefix.
    return [
        url
        for url in response.html.absolute_links
        if not url.startswith(google_domains)
    ]
|
|
|
|
# Runs one search when the module is executed; the returned list is discarded.
scrape_google(query='inurl:.si')
|