diff --git a/README.md b/README.md
index de9d4b5..16ccdf6 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
 
 ## Usage
 
+    scrapy startproject ger_gfonts
     cd ger_gfonts
     scrapy crawl gfonts -O gfonts.json
-
 ## TODO
 
 Start checking for google analytics for all eu websites.
 
diff --git a/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/spiders/gfonts_spider.py
deleted file mode 100644
index 68be344..0000000
--- a/ger_gfonts/spiders/gfonts_spider.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import scrapy
-import socket
-import re
-import json
-from urllib.request import urlopen
-from urllib.parse import urlparse
-
-class GFontsSpider(scrapy.Spider):
-    name = "gfonts"
-    start_urls = self.getUrls()
-    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
-
-    def parse(self, response):
-        # check current url - german or eu (todo)
-        # check if api.gfonts.com is found
-        # todo: check if google analytics is found
-        if (self.isGerman(response.url)):
-            self.findGFonts(response)
-        self.parseOn(response)
-
-    def getCountryOfUrl(url):
-        ip = socket.gethostbyname(url)
-        api_url = 'https://ipinfo.io/' + ip + '/json'
-        response = urlopen(api_url)
-        data = json.load(response)
-        return data['country']
-
-    def isCountryGerman(url):
-        return 'DE' === self.getCountryOfUrl(url)
-
-    def isGermanTLD(url):
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        return tld === '.de'
-
-    def isGerman(url):
-        if (!self.isGermanTLD(url)):
-            return self.isCountryGerman(url)
-        return TRUE
-
-    def findGFonts(response):
-        for links in response.css('head links'):
-            if ('fonts.googleapis.com' in links.attrib['href']):
-                yield {
-                    'url': response.url,
-                    'gfonts': TRUE,
-                }
-
-    def parseOn(response):
-        for links in response.css('a'):
-            url = links.attrib['href']
-            if (bool(urlparse(url).netloc)):
-                current = urlparse(response.url)
-                found = urlparse(url)
-                if (current.hostname != found.hostname):
-                    yield response.follow(url, callback=self.parse)
-
-    def getUrls():
-        with open('../sites.txt') as sites_file:
-            sites = sites_file.readlines()
-            return sites
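
Note: the spider removed above mixed PHP-style syntax into Python (`===`, `!`, `TRUE`), referenced `self` in a class-level attribute, omitted `self` from method signatures, and discarded the items its generator helpers yielded. The sketch below shows what a runnable equivalent of the same logic could look like; it assumes the same `../sites.txt` seed list and ipinfo.io country lookup used in the deleted file, while the snake_case method names and the `start_requests` approach are editorial choices rather than part of the original code.

```python
import json
import socket
from urllib.parse import urlparse
from urllib.request import urlopen

import scrapy


class GFontsSpider(scrapy.Spider):
    name = "gfonts"
    custom_settings = {"CLOSESPIDER_PAGECOUNT": 2000}

    def start_requests(self):
        # Read seed URLs at crawl time instead of at class-definition time.
        with open("../sites.txt") as sites_file:
            for line in sites_file:
                url = line.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Record German pages that load Google Fonts, then follow external links.
        if self.is_german(response.url):
            yield from self.find_gfonts(response)
        yield from self.parse_on(response)

    def get_country_of_url(self, url):
        # Resolve the hostname and ask ipinfo.io which country the IP sits in.
        ip = socket.gethostbyname(urlparse(url).hostname)
        with urlopen("https://ipinfo.io/" + ip + "/json") as api_response:
            data = json.load(api_response)
        return data.get("country")

    def is_german(self, url):
        # A .de TLD is enough; otherwise fall back to the IP geolocation check.
        hostname = urlparse(url).hostname or ""
        if hostname.endswith(".de"):
            return True
        return self.get_country_of_url(url) == "DE"

    def find_gfonts(self, response):
        # Report pages whose <head> pulls anything from fonts.googleapis.com.
        for link in response.css("head link"):
            href = link.attrib.get("href", "")
            if "fonts.googleapis.com" in href:
                yield {"url": response.url, "gfonts": True}

    def parse_on(self, response):
        # Follow only absolute links that point at a different host.
        for link in response.css("a"):
            url = link.attrib.get("href")
            if url and urlparse(url).netloc:
                if urlparse(response.url).hostname != urlparse(url).hostname:
                    yield response.follow(url, callback=self.parse)
```

The synchronous `urlopen` call blocks Scrapy's event loop, just as it did in the deleted file; a longer-running crawl would likely want a non-blocking lookup or a cache of already-resolved hosts.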