import json
import socket
from urllib.parse import urlparse
from urllib.request import urlopen

import scrapy


class GFontsSpider(scrapy.Spider):
    """Crawl outward from seed pages and report German sites that embed
    Google-hosted fonts (fonts.googleapis.com) in their <head>.

    A page counts as "German" when its hostname ends in ``.de`` or, failing
    that, when a GeoIP lookup via ipinfo.io places the server in Germany.
    """

    name = "gfonts"
    # Seed URLs for the crawl. The original called an undefined get_urls();
    # replace or extend this list with your own seed set.
    start_urls = ["https://www.bundesregierung.de/"]
    # Stop after 2000 pages so an unbounded link-following crawl terminates.
    custom_settings = {"CLOSESPIDER_PAGECOUNT": 2000}

    def parse(self, response):
        """Emit Google-Fonts findings for German pages, then follow
        external links to continue the crawl.

        TODO: also detect Google Analytics (see README).
        """
        if self.isGerman(response.url):
            # 'yield from' is required: the helpers are generators, and
            # calling them without it silently discards their output.
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the ISO country code of *url*'s server via ipinfo.io.

        Resolves the hostname (not the full URL) to an IP first;
        gethostbyname would fail on a full URL string.
        """
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = "https://ipinfo.io/" + ip + "/json"
        # Close the HTTP response deterministically.
        with urlopen(api_url) as api_response:
            data = json.load(api_response)
        return data.get("country")

    def isCountryGerman(self, url):
        """True when the GeoIP lookup says the server is in Germany."""
        return self.getCountryOfUrl(url) == "DE"

    def isGermanTLD(self, url):
        """True when the URL's hostname ends in '.de'."""
        # hostname is None for scheme-less/odd URLs; treat as not German.
        hostname = urlparse(url).hostname or ""
        return hostname.endswith(".de")

    def isGerman(self, url):
        """Cheap TLD check first; fall back to the GeoIP lookup."""
        if self.isGermanTLD(url):
            return True
        return self.isCountryGerman(url)

    def findGFonts(self, response):
        """Yield one item per <link> in <head> that points at Google Fonts."""
        # Element selector is 'link' (the HTML tag), not 'links'.
        for link in response.css("head link"):
            href = link.attrib.get("href", "")
            if "fonts.googleapis.com" in href:
                yield {
                    "url": response.url,
                    "gfonts": True,
                }

    def parseOn(self, response):
        """Yield follow-up requests for absolute links to OTHER hosts.

        Relative links (no netloc) and same-host links are skipped, so the
        crawl fans out across sites rather than exhausting one domain.
        """
        for anchor in response.css("a"):
            url = anchor.attrib.get("href")
            # Only absolute URLs have a netloc; guard against missing href.
            if url and urlparse(url).netloc:
                current = urlparse(response.url)
                found = urlparse(url)
                if current.hostname != found.hostname:
                    yield response.follow(url, callback=self.parse)