import json
import socket
from urllib.parse import urlparse
from urllib.request import urlopen

import scrapy
import validators


class GFontsSpider(scrapy.Spider):
    name = "gfonts"
    # Shared across requests of this spider: domains we have already visited.
    checked_domains = []
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        # Check whether the current URL is German (.de TLD or DE geo-IP),
        # then look for fonts.googleapis.com references in the page head.
        # @todo: check if Google Analytics is found
        parsed = urlparse(response.url)
        self.checked_domains.append(parsed.hostname)
        self.logNewDomain(response.url)
        if self.isGerman(response.url):
            # 'yield from' is required so Scrapy actually consumes the
            # items and requests these generator methods produce.
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        # Resolve the hostname (not the full URL) to an IP address,
        # then look up its country via the ipinfo.io API.
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        parts = urlparse(url)
        return parts.hostname.endswith('.de')

    def isGerman(self, url):
        # A .de domain counts as German right away; anything else is
        # checked against the geo-IP country.
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        # The element is 'link', not 'links'; guard against missing href.
        for link in response.css('head link'):
            href = link.attrib.get('href', '')
            if 'fonts.googleapis.com' in href:
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        for link in response.css('a'):
            url = link.attrib.get('href', '')
            # Follow only valid absolute URLs pointing to unvisited domains.
            if validators.url(url) and bool(urlparse(url).netloc):
                current = urlparse(response.url)
                found = urlparse(url)
                if (found.hostname != current.hostname
                        and found.hostname not in self.checked_domains):
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        # Strip newlines before prefixing the scheme, otherwise every
        # start URL ends in '\n' and the requests fail.
        with open('sites.txt') as sites_file:
            return ['https://' + line.strip()
                    for line in sites_file if line.strip()]

    def logNewDomain(self, url):
        print('############################################')
        print('###### ' + url + ' #######')
        print('############################################')
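
# A minimal sketch of running this spider standalone, assuming this file is
# executed directly rather than via `scrapy crawl gfonts` inside a Scrapy
# project. The output path 'results.json' is an illustrative choice, not part
# of the original code. sites.txt is expected to hold one bare domain per
# line, e.g.:
#
#   example.de
#   example.com
#
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Write every yielded item to a JSON feed.
        'FEEDS': {'results.json': {'format': 'json'}},
    })
    process.crawl(GFontsSpider)
    process.start()  # blocks until the crawl finishes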