diff --git a/ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..08d15f7
Binary files /dev/null and b/ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc
new file mode 100644
index 0000000..912c08d
Binary files /dev/null and b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/__init__.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..5e21c62
Binary files /dev/null and b/ger_gfonts/ger_gfonts/spiders/__pycache__/__init__.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc
new file mode 100644
index 0000000..1dc051d
Binary files /dev/null and b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc
new file mode 100644
index 0000000..76470eb
Binary files /dev/null and b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
index a30aa9c..14be6ba 100644
--- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
+++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
@@ -1,14 +1,17 @@
 import scrapy
+from scrapy.spiders import Spider, CrawlSpider, Rule
 import socket
 import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
+from utility.countries import *
 
-class GFontsSpider(scrapy.Spider):
+class GFontsSpider(Spider):
     name = "gfonts"
     #start_urls = self.getUrls()
     checked_domains = []
+    eu_domains = []
     custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
 
     def __init__(self):
@@ -18,12 +21,14 @@ class GFontsSpider(scrapy.Spider):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        parsed = urlparse(response.url)
-        self.checked_domains.append(parsed.hostname)
-        self.logNewDomain(response.url)
-        if self.isGerman(response.url):
-            self.findGFonts(response)
-        self.parseOn(response)
+        if self.isEuropean(response.url):
+            yield from self.writeTrackers(response)
+            parsed = urlparse(response.url)
+            self.eu_domains.append(parsed.hostname)
+            self.logNewDomain(response.url)
+        else:
+            print("NOT EUROPEAN: " + response.url)
+        yield from self.parseOn(response)
 
     def getCountryOfUrl(self, url):
         ip = socket.gethostbyname(url)
@@ -44,24 +49,60 @@ class GFontsSpider(scrapy.Spider):
         if not self.isGermanTLD(url):
             return self.isCountryGerman(url)
         return True
+    def isEuropean(self, url):
+        eu_tlds = self.getEuTlds()
+        parts = urlparse(url)
+        tld = parts.hostname[-3:]
+        if tld in eu_tlds:
+            return eu_tlds[tld]
+        country = self.getCountryOfUrl(url)
+        if country in eu_tlds.values():
+            return country
+        return False
     def findGFonts(self, response):
         for links in response.css('head link'):
-            if 'fonts.googleapis.com' in links.attrib['href']:
-                yield {
-                    'url': response.url,
-                    'gfonts': True,
+            if 'fonts.googleapis.com' in links.attrib.get('href', ''):
+                return True
+        return False
+    def findGTrackers(self, response):
+        trackers = {'ga': 'www.google-analytics.com',
+            'gt': 'www.googletagmanager.com'}
+        result = {'ga': 0, 'gt': 0}
+        for script in response.css('script::text').getall():
+            if trackers['ga'] in script:
+                result['ga'] = 1
+            if trackers['gt'] in script:
+                result['gt'] = 1
+        return result
+    def findMetaPixel(self, response):
+        for img in response.css('img'):
+            if 'www.facebook.com/tr?id=' in img.attrib.get('src', ''):
+                return True
+        return False
+    def writeTrackers(self, response):
+        gtrackers = self.findGTrackers(response)
+        yield {
+            'domain': urlparse(response.url).netloc,
+            'country': self.isEuropean(response.url),
+            'gf': self.findGFonts(response),
+            'ga': gtrackers['ga'],
+            'gt': gtrackers['gt'],
+            'mp': self.findMetaPixel(response)
         }
+
     def parseOn(self, response):
-        for links in response.css('a'):
-            url = links.attrib['href']
+        links = response.css('a')
+        print('FOUND: ' + str(len(links)) + ' LINKS')
+        for link in links:
+            url = link.attrib['href']
             # parse valid urls
-            if validators.url(url) and bool(urlparse(url).netloc):
+            found = urlparse(url)
+            if validators.url(url) and bool(found.netloc):
                 current = urlparse(response.url)
-                found = urlparse(url)
-                if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+                if current.hostname != found.hostname:
                     yield response.follow(url, callback=self.parse)
+                else:
+                    print("NOT FOLLOWING: " + url)
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
@@ -72,3 +113,11 @@ class GFontsSpider(scrapy.Spider):
         print('############################################')
         print('###### ' + url + ' #######')
         print('############################################')
+
+    def getEuTlds(self):
+        return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ',
+            '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI',
+            '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT',
+            '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO',
+            '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK',
+            '.bg':'BG', '.cy':'CY', '.mt':'MT'}
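The new detection methods only need a response object with a working `.css()` selector, so they can be sanity-checked offline before starting a crawl. The snippet below is a minimal sketch along those lines: the HTML body is invented for illustration, the import path assumes the snippet runs from the Scrapy project root (so that `utility` is importable), and the methods are called through the class to sidestep the spider's constructor.

```python
# Offline check of the font/tracker detection selectors against a canned response.
from scrapy.http import HtmlResponse
from ger_gfonts.spiders.gfonts_spider import GFontsSpider

# Hypothetical page containing a Google Fonts stylesheet, a Tag Manager
# snippet and a Meta pixel image.
html = b"""
<html>
  <head>
    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto">
    <script>var gtm = 'https://www.googletagmanager.com/gtm.js';</script>
  </head>
  <body>
    <img src="https://www.facebook.com/tr?id=123456&ev=PageView">
  </body>
</html>
"""
response = HtmlResponse(url='https://example.de/', body=html, encoding='utf-8')

print(GFontsSpider.findGFonts(None, response))     # True  (Google Fonts stylesheet found)
print(GFontsSpider.findGTrackers(None, response))  # {'ga': 0, 'gt': 1}
print(GFontsSpider.findMetaPixel(None, response))  # True  (Meta pixel image found)
```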
diff --git a/ger_gfonts/ger_gfonts/spiders/sites.txt b/ger_gfonts/ger_gfonts/spiders/sites.txt
index e98e4f3..bcaa116 100644
--- a/ger_gfonts/ger_gfonts/spiders/sites.txt
+++ b/ger_gfonts/ger_gfonts/spiders/sites.txt
@@ -75,7 +75,7 @@ kik.de
 wetterzentrale.de
 service.bund.de
 katholisch.de
-homboldt-foundation.de
+humboldt-foundation.de
 deginvest.de
 comdirect.de
 standaard.be
diff --git a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
new file mode 100644
index 0000000..dc0028f
--- /dev/null
+++ b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
@@ -0,0 +1,17 @@
+import scrapy
+#import pandas
+from scrapy.linkextractors import LinkExtractor
+#from utility.countries import getEuTlds
+from utility import countries
+
+class firstSpider(scrapy.Spider):
+    name = "start_urls"
+
+    def __init__(self):
+        eu_tlds = countries.getEuTlds()
+        self.start_urls = ['https://www.google.com/search?q=inurl%3A' + tld for tld in eu_tlds]
+
+    def parse(self, response):
+        xlink = LinkExtractor()
+        for link in xlink.extract_links(response):
+            print(link)
diff --git a/ger_gfonts/utility/.countries.py.swp b/ger_gfonts/utility/.countries.py.swp
new file mode 100644
index 0000000..6aec163
Binary files /dev/null and b/ger_gfonts/utility/.countries.py.swp differ
diff --git a/ger_gfonts/utility/__init__.py b/ger_gfonts/utility/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc
new file mode 100644
index 0000000..c102c3d
Binary files /dev/null and b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc differ
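firstSpider derives its start URLs from the TLD map at construction time. A quick sketch of what that produces (assuming it runs from the directory that contains the `utility/` package):

```python
from utility import countries

eu_tlds = countries.getEuTlds()
start_urls = ['https://www.google.com/search?q=inurl%3A' + tld for tld in eu_tlds]

print(len(start_urls))  # 32 TLD-scoped queries, one per entry in getEuTlds()
print(start_urls[0])    # https://www.google.com/search?q=inurl%3A.ad (first key in the map)
```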
diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py
new file mode 100644
index 0000000..7f64de3
--- /dev/null
+++ b/ger_gfonts/utility/countries.py
@@ -0,0 +1,47 @@
+# utility functions for countries
+# import pycountry
+import socket
+from urllib.request import urlopen
+import json
+from urllib.parse import urlparse
+
+def getEuTlds():
+    # map tld to alpha_2
+    return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ',
+        '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI',
+        '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT',
+        '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO',
+        '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK',
+        '.bg':'BG', '.cy':'CY', '.mt':'MT'}
+
+def getCountryOfUrl(url):
+    # resolve the host's IP and look up its country via the ipinfo.io API
+    ip = socket.gethostbyname(urlparse(url).hostname or url)
+    api_url = 'https://ipinfo.io/' + ip + '/json'
+    response = urlopen(api_url)
+    data = json.load(response)
+    return data['country']
+
+def isCountryGerman(url):
+    return 'DE' == getCountryOfUrl(url)
+
+def isGermanTLD(url):
+    parts = urlparse(url)
+    tld = parts.hostname[-3:]
+    return tld == '.de'
+
+def isGerman(url):
+    if not isGermanTLD(url):
+        return isCountryGerman(url)
+    return True
+
+def isEuropean(url):
+    eu_tlds = getEuTlds()
+    parts = urlparse(url)
+    tld = parts.hostname[-3:]
+    if tld in eu_tlds:
+        return eu_tlds[tld]
+    country = getCountryOfUrl(url)
+    if country in eu_tlds.values():
+        return country
+    return False
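A rough usage sketch of the countries helpers (again assuming the working directory contains `utility/`): a `.de` URL is answered purely from the TLD map, while a non-EU TLD falls back to a DNS resolution plus an ipinfo.io request, so that call needs network access.

```python
from utility import countries

# TLD hit: answered from the getEuTlds() map, no network access needed
print(countries.isEuropean('https://www.humboldt-foundation.de/'))  # 'DE'
print(countries.isGerman('https://www.humboldt-foundation.de/'))    # True

# No EU TLD: falls back to a GeoIP lookup via ipinfo.io and returns either
# a country code from the map or False
print(countries.isEuropean('https://example.com/'))
```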