diff --git a/.gitignore b/.gitignore index b096d08..63cc484 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ -__pycache__ gfonts.json +__pycache__ +*/__pycache__ diff --git a/README.md b/README.md index 16ccdf6..83a6be2 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html ## Usage + pip3 install -e . scrapy startproject ger_gfonts cd ger_gfonts scrapy crawl gfonts -O gfonts.json ## TODO +!Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider + Start checking for google analytics for all eu websites. +- eu countries tlds: https://www.whois365.com/en/listtld/europe + +### meta pixel + + + + + + ## IDEAS Make it into browserextension that would notify you. diff --git a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc index 912c08d..4b31e16 100644 Binary files a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc and b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc differ diff --git a/ger_gfonts/ger_gfonts/settings.py b/ger_gfonts/ger_gfonts/settings.py index facd1d3..0442b11 100644 --- a/ger_gfonts/ger_gfonts/settings.py +++ b/ger_gfonts/ger_gfonts/settings.py @@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'ger_gfonts.spiders' #USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)' # Obey robots.txt rules -ROBOTSTXT_OBEY = True +ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc index 1dc051d..69a306d 100644 Binary files a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc and b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc differ diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc index 76470eb..ca557f2 100644 Binary files a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc and b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc differ diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py index 14be6ba..1bbed47 100644 --- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py +++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py @@ -5,7 +5,8 @@ import re import json from urllib.request import urlopen from urllib.parse import urlparse -from utility.countries import * +from utility.countries import isEuropean +import validators class GFontsSpider(Spider): name = "gfonts" @@ -21,88 +22,73 @@ class GFontsSpider(Spider): # check current url - german or eu (todo) # check if api.gfonts.com is found # @todo: check if google analytics is found - if self.isEuropean(response.url): - self.writeTrackers(response) - parsed = urlparse(response.url) - self.eu_domains.append(parsed.hostname) - self.logNewDomain(response.url) + parsed = urlparse(response.url) + if isEuropean(response.url): + print("URL EUROPEAN: " + response.url) + if parsed.hostname not in self.eu_domains: + self.eu_domains.append(parsed.hostname) + self.logNewDomain(response.url) + yield self.writeTrackers(response) else: print("NOT EUROPEAN: " + response.url) - self.parseOn(response) - - def getCountryOfUrl(self, url): - ip = socket.gethostbyname(url) - api_url = 'https://ipinfo.io/' + ip + '/json' - response = urlopen(api_url) - data = json.load(response) - return data['country'] - - def isCountryGerman(self, url): - return 'DE' == self.getCountryOfUrl(url) - - def isGermanTLD(self, url): - parts = urlparse(url) - tld = parts.hostname[-3:] - return tld == '.de' - - def isGerman(self, url): - if not self.isGermanTLD(url): - return self.isCountryGerman(url) - return True - def isEuropean(self, url): - eu_tlds = self.getEuTlds() - parts = urlparse(url) - tld = parts.hostname[-3:] - if tld in eu_tlds: - return eu_tlds[tld] - country = self.getCountryOfUrl(url) - if country in eu_tlds.values(): - return country - return False + self.checked_domains.append(parsed.hostname) + for link in self.parseOn(response): + yield scrapy.Request(link, callback=self.parse) def findGFonts(self, response): - for links in response.css('head link'): - return 'fonts.googleapis.com' in links.attrib['href'] + for link in response.css('head link'): + try: + href = link.attrib['href'] + if 'fonts.googleapis.com' in href: + return True + except: + continue + return False + def findGTrackers(self, response): trackers = { 'ga' : 'www.google-analytics.com', 'gt' : 'www.googletagmanager.com'} - result = {'ga':0, 'gt':0} + result = {'ga':False, 'gt':False} for script in response.css('script::text').getall(): - if script.find(trackers['ga']) > 0: - result['ga'] = 1 - if script.find(trackers['gt']) > 0: - result['gt'] = 1 + if script.find(trackers['ga']) > 0: result['ga'] = True + if script.find(trackers['gt']) > 0: result['gt'] = True return result + def findMetaPixel(self, response): for img in response.css('img'): - if img.attrib['src'].find('www.facebook.com/tr?id='): - return TRUE - return FALSE + try: + if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True + except: + continue + return False + def writeTrackers(self,response): gtrackers = self.findGTrackers(response) - yield { + return { 'domain': urlparse(response.url).netloc, - 'country': self.isEuropean(response.url), + 'country': isEuropean(response.url), 'gf': self.findGFonts(response), 'ga': gtrackers['ga'], - 'gt': gtrackers['gm'], + 'gt': gtrackers['gt'], 'mp': self.findMetaPixel(response) } - def parseOn(self, response): links = response.css('a'); print('FOUND: ' + str(len(links)) + ' LINKS') + next_urls = [] for link in links: - url = link.attrib['href'] - # parse valid urls - found = urlparse(url) - if validators.url(url) and bool(found.netloc): - current = urlparse(response.url) - if current.hostname != found.hostname: - yield response.follow(url, callback=self.parse) - else: - print("NOT FOLLOWING: " + url) + try: + url = link.attrib['href'] + found = urlparse(url) + if validators.url(url) and bool(found.netloc): + current = urlparse(response.url) + if current.hostname != found.hostname and found.hostname not in self.checked_domains: + next_urls.append(url) + except: + continue + print('FOLLOW: ' + str(len(next_urls)) + ' LINKS') + return next_urls def getUrls(self): with open('sites.txt') as sites_file: diff --git a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py index dc0028f..062cccf 100644 --- a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py +++ b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py @@ -3,9 +3,13 @@ import scrapy from scrapy.linkextractors import LinkExtractor #from utility.countries import getEuTlds from utility import countries +from urllib.parse import urlencode, urlparse, parse_qs -class firstSpider(scrapy.Spider): +class startUrls(scrapy.Spider): name = "start_urls" + custom_settings = { + 'ROBOTSTXT_OBEY': False + } def __init__(self): eu_tlds = countries.getEuTlds() diff --git a/ger_gfonts/utility/.countries.py.swp b/ger_gfonts/utility/.countries.py.swp deleted file mode 100644 index 6aec163..0000000 Binary files a/ger_gfonts/utility/.countries.py.swp and /dev/null differ diff --git a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc index c102c3d..2576808 100644 Binary files a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc and b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc differ diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py index 7f64de3..17634f0 100644 --- a/ger_gfonts/utility/countries.py +++ b/ger_gfonts/utility/countries.py @@ -15,13 +15,14 @@ def getEuTlds(): '.bg':'BG', '.cy':'CY', '.mt':'MT'} def getCountryOfUrl(url): - ip = socket.gethostbyname(url) + parsed = urlparse(url) + ip = socket.gethostbyname(parsed.hostname) api_url = 'https://ipinfo.io/' + ip + '/json' response = urlopen(api_url) data = json.load(response) return data['country'] -def isCountryGerman(self, url): +def isCountryGerman(url): return 'DE' == getCountryOfUrl(url) def isGermanTLD(url): @@ -29,7 +30,7 @@ def isGermanTLD(url): tld = parts.hostname[-3:] return tld == '.de' -def isGerman(self, url): +def isGerman(url): if not isGermanTLD(url): return isCountryGerman(url) return True diff --git a/ger_gfonts/utility/google_scrapy.py b/ger_gfonts/utility/google_scrapy.py new file mode 100644 index 0000000..fa5604b --- /dev/null +++ b/ger_gfonts/utility/google_scrapy.py @@ -0,0 +1,45 @@ +import requests +import urllib +import pandas as pd +from requests_html import HTML +from requests_html import HTMLSession + +def get_source(url): + """Return the source code for the provided URL. + + Args: + url (string): URL of the page to scrape. + + Returns: + response (object): HTTP response object from requests_html. + """ + + try: + session = HTMLSession() + response = session.get(url) + return response + + except requests.exceptions.RequestException as e: + print(e) + +def scrape_google(query): + + query = urllib.parse.quote_plus(query) + response = get_source("https://www.google.co.uk/search?q=" + query) + + links = list(response.html.absolute_links) + google_domains = ('https://www.google.', + 'https://google.', + 'https://webcache.googleusercontent.', + 'http://webcache.googleusercontent.', + 'https://policies.google.', + 'https://support.google.', + 'https://maps.google.') + + for url in links[:]: + if url.startswith(google_domains): + links.remove(url) + + return links + +scrape_google('inurl:.si')