12 changed files with 127 additions and 65 deletions
-
3.gitignore
-
25README.md
-
BINger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc
-
2ger_gfonts/ger_gfonts/settings.py
-
BINger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc
-
BINger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc
-
104ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
-
6ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
-
BINger_gfonts/utility/.countries.py.swp
-
BINger_gfonts/utility/__pycache__/countries.cpython-37.pyc
-
7ger_gfonts/utility/countries.py
-
45ger_gfonts/utility/google_scrapy.py
@ -1,2 +1,3 @@ |
|||
__pycache__ |
|||
gfonts.json |
|||
__pycache__ |
|||
*/__pycache__ |
@ -0,0 +1,45 @@ |
|||
import requests |
|||
import urllib |
|||
import pandas as pd |
|||
from requests_html import HTML |
|||
from requests_html import HTMLSession |
|||
|
|||
def get_source(url, timeout=None):
    """Return the HTTP response for the provided URL.

    Args:
        url (string): URL of the page to scrape.
        timeout (float, optional): Seconds to wait for the server before
            giving up. Defaults to None, which applies no time limit and
            matches the behaviour of calling this function with only a URL.

    Returns:
        response (object): HTTP response object from requests_html, or
            None when the request raised a requests exception.
    """
    try:
        session = HTMLSession()
        return session.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Best-effort fetch: report the problem and signal failure with an
        # explicit None so callers can check for it instead of crashing.
        print(e)
        return None
|||
|
|||
def scrape_google(query):
    """Search Google for *query* and return the non-Google result links.

    Args:
        query (string): Search expression; it is URL-encoded before being
            appended to the search URL.

    Returns:
        list: Absolute links found on the results page, excluding every
            link that starts with one of the Google-owned prefixes listed
            below. Empty when the results page could not be fetched.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote_plus

    google_domains = (
        'https://www.google.',
        'https://google.',
        'https://webcache.googleusercontent.',
        'http://webcache.googleusercontent.',
        'https://policies.google.',
        'https://support.google.',
        'https://maps.google.',
    )

    response = get_source("https://www.google.co.uk/search?q=" + quote_plus(query))
    if response is None:
        # get_source() has already reported the request error.
        return []

    # str.startswith accepts a tuple, so one call tests all prefixes.
    return [
        link
        for link in response.html.absolute_links
        if not link.startswith(google_domains)
    ]
|||
|
|||
# Module-level invocation: runs a search for the query 'inurl:.si' every time
# this file is executed or imported. The returned list of links is neither
# stored nor printed here.
scrape_google('inurl:.si')
Write
Preview
Loading…
Cancel
Save
Reference in new issue