diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py index 5dad952..109f4ab 100644 --- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py +++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py @@ -8,6 +8,7 @@ from urllib.parse import urlparse class GFontsSpider(scrapy.Spider): name = "gfonts" #start_urls = self.getUrls() + checked_domains = [] custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000} def __init__(self): @@ -16,8 +17,11 @@ class GFontsSpider(scrapy.Spider): def parse(self, response): # check current url - german or eu (todo) # check if api.gfonts.com is found - # todo: check if google analytics is found - if (self.isGerman(response.url)): + # @todo: check if google analytics is found + parsed = urlparse(response.url) + self.checked_domains.append(parsed.hostname) + self.logNewDomain(response.url) + if self.isGerman(response.url): self.findGFonts(response) self.parseOn(response) @@ -37,13 +41,13 @@ class GFontsSpider(scrapy.Spider): return tld == '.de' def isGerman(self, url): - if (not self.isGermanTLD(url)): + if not self.isGermanTLD(url): return self.isCountryGerman(url) return True def findGFonts(self, response): for links in response.css('head links'): - if ('fonts.googleapis.com' in links.attrib['href']): + if 'fonts.googleapis.com' in links.attrib['href']: yield { 'url': response.url, 'gfonts': True, @@ -52,13 +56,19 @@ class GFontsSpider(scrapy.Spider): def parseOn(self, response): for links in response.css('a'): url = links.attrib['href'] - if (bool(urlparse(url).netloc)): + # parse valid urls + if validators.url(url) and bool(urlparse(url).netloc): current = urlparse(response.url) found = urlparse(url) - if (current.hostname != found.hostname): + if current.hostname != found.hostname and found.hostname not in self.checked_domains: yield response.follow(url, callback=self.parse) def getUrls(self): with open('sites.txt') as sites_file: sites = sites_file.readlines() return map(lambda s: 'https://' + s,sites) + + def logNewDomain(self, url): + print('############################################') + print('###### ' + url + ' #######') + print('############################################') diff --git a/requirements.txt b/requirements.txt index 268b0db..e6bcda8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ scrapy +validators