import scrapy
from scrapy.spiders import Spider, CrawlSpider, Rule
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
from utility.countries import *


class GFontsSpider(Spider):
    """Crawl the sites listed in sites.txt and, for European-hosted domains,
    emit one item per page recording Google Fonts / Google tracker /
    Meta-pixel usage."""

    name = "gfonts"
    # Class-level (shared) bookkeeping of domains seen so far.
    checked_domains = []
    eu_domains = []
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        # Forward scrapy's spider-init arguments so the framework
        # contract holds (the original dropped them entirely).
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        # check current url - german or eu (todo)
        # check if api.gfonts.com is found
        # @todo: check if google analytics is found
        if self.isEuropean(response.url):
            # BUG FIX: writeTrackers is a generator; without `yield from`
            # the scraped item was silently discarded.
            yield from self.writeTrackers(response)
            parsed = urlparse(response.url)
            self.eu_domains.append(parsed.hostname)
            self.logNewDomain(response.url)
        else:
            print("NOT EUROPEAN: " + response.url)
        # BUG FIX: parseOn is also a generator (yields follow requests);
        # it too must be yielded from or nothing is ever crawled onward.
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Resolve the URL's host and return its ISO country code via ipinfo.io."""
        # BUG FIX: gethostbyname() needs a bare hostname, not a full URL.
        hostname = urlparse(url).hostname or url
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        """True when the GeoIP lookup places the host in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """True when the hostname ends in '.de'."""
        parts = urlparse(url)
        tld = parts.hostname[-3:]
        return tld == '.de'

    def isGerman(self, url):
        """Cheap TLD check first; fall back to the GeoIP lookup."""
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def isEuropean(self, url):
        """Return the country code when the URL is European, else False.

        Checks the ccTLD suffix first (no network), then falls back to a
        GeoIP country lookup.
        """
        eu_tlds = self.getEuTlds()
        parts = urlparse(url)
        tld = parts.hostname[-3:]
        if tld in eu_tlds:
            return eu_tlds[tld]
        country = self.getCountryOfUrl(url)
        if country in eu_tlds.values():
            return country
        return False

    def findGFonts(self, response):
        """True when any <head> <link> references fonts.googleapis.com."""
        # BUG FIX: the original returned after inspecting only the FIRST
        # link, and raised KeyError for <link> tags without an href.
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                return True
        return False

    def findGTrackers(self, response):
        """Flag inline scripts mentioning Google Analytics / Tag Manager."""
        trackers = {
            'ga': 'www.google-analytics.com',
            'gt': 'www.googletagmanager.com'}
        result = {'ga': 0, 'gt': 0}
        for script in response.css('script::text').getall():
            # BUG FIX: use `in` — `find(...) > 0` missed a match at offset 0.
            if trackers['ga'] in script:
                result['ga'] = 1
            if trackers['gt'] in script:
                result['gt'] = 1
        return result

    def findMetaPixel(self, response):
        """True when any <img> src points at the Facebook tracking pixel."""
        # BUG FIX: TRUE/FALSE are not Python constants, and `find()`
        # returns -1 (truthy) when the substring is absent.
        for img in response.css('img'):
            if 'www.facebook.com/tr?id=' in img.attrib.get('src', ''):
                return True
        return False

    def writeTrackers(self, response):
        """Yield the scraped item summarizing trackers found on this page."""
        gtrackers = self.findGTrackers(response)
        yield {
            'domain': urlparse(response.url).netloc,
            'country': self.isEuropean(response.url),
            'gf': self.findGFonts(response),
            'ga': gtrackers['ga'],
            # BUG FIX: the key is 'gt', not 'gm' (was a guaranteed KeyError).
            'gt': gtrackers['gt'],
            'mp': self.findMetaPixel(response)
        }

    def parseOn(self, response):
        """Yield follow-requests for every absolute off-site link on the page."""
        links = response.css('a')
        print('FOUND: ' + str(len(links)) + ' LINKS')
        for link in links:
            url = link.attrib.get('href', '')
            # BUG FIX: `validators` was never imported (NameError at runtime);
            # an absolute http(s) URL check via urlparse covers the same intent.
            found = urlparse(url)
            if found.scheme in ('http', 'https') and bool(found.netloc):
                current = urlparse(response.url)
                # Only follow links that leave the current host.
                if current.hostname != found.hostname:
                    yield response.follow(url, callback=self.parse)
            else:
                print("NOT FOLLOWING: " + url)

    def getUrls(self):
        """Read sites.txt and return one https:// URL per non-empty line."""
        with open('sites.txt') as sites_file:
            sites = sites_file.readlines()
        # BUG FIX: strip the trailing newline readlines() keeps — the original
        # produced URLs like 'https://example.com\n'. A list (not a one-shot
        # map object) survives repeated iteration by the framework.
        return ['https://' + s.strip() for s in sites if s.strip()]

    def logNewDomain(self, url):
        """Loudly mark the discovery of a new European domain in the log."""
        print('############################################')
        print('###### ' + url + ' #######')
        print('############################################')

    def getEuTlds(self):
        """Map European ccTLD suffixes ('.xx') to ISO-3166 country codes."""
        return {
            '.ad': 'AD', '.at': 'AT', '.be': 'BE', '.ch': 'CH',
            '.cz': 'CZ', '.de': 'DE', '.dk': 'DK', '.ee': 'EE',
            '.es': 'ES', '.eu': 'EU', '.fi': 'FI', '.fr': 'FR',
            '.gr': 'GR', '.hr': 'HR', '.hu': 'HU', '.ie': 'IE',
            '.it': 'IT', '.li': 'LI', '.lt': 'LT', '.lu': 'LU',
            '.lv': 'LV', '.nl': 'NL', '.no': 'NO', '.pl': 'PL',
            '.pt': 'PT', '.ro': 'RO', '.se': 'SE', '.si': 'SI',
            '.sk': 'SK', '.bg': 'BG', '.cy': 'CY', '.mt': 'MT'}