From 9c176dcbe6b90577090c2fe1bff9e887c13fb648 Mon Sep 17 00:00:00 2001
From: Lio Novelli
Date: Wed, 2 Feb 2022 19:35:03 +0100
Subject: [PATCH] proper file structure

---
 .gitignore                                    |   2 +
 .../ger_gfonts/spiders/gfonts_spider.py       | 102 +++++++++++++++++
 ger_gfonts/ger_gfonts/spiders/sites.txt       | 106 ++++++++++++++++++
 3 files changed, 210 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
 create mode 100644 ger_gfonts/ger_gfonts/spiders/sites.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2c5cee7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+gfonts.txt
diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
new file mode 100644
index 0000000..5dad952
--- /dev/null
+++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
@@ -0,0 +1,102 @@
+import scrapy
+import socket
+import re
+import json
+from pathlib import Path
+from urllib.request import urlopen
+from urllib.parse import urlparse
+
+
+class GFontsSpider(scrapy.Spider):
+    """Crawl German websites and report pages that load Google Fonts.
+
+    Seed URLs come from ``sites.txt`` next to this module. A page counts as
+    German when its host ends in ``.de`` or, failing that, when ipinfo.io
+    geolocates the host's IP address to Germany.
+    """
+
+    name = "gfonts"
+    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
+
+    def __init__(self, *args, **kwargs):
+        # Scrapy instantiates spiders with crawler-supplied arguments, so
+        # accept and forward them instead of bypassing Spider.__init__.
+        super().__init__(*args, **kwargs)
+        self.start_urls = self.getUrls()
+
+    def parse(self, response):
+        """Yield a gfonts item and follow-up requests for German pages."""
+        # todo: check whether the site is in the EU, not only in Germany
+        # todo: check if google analytics is found
+        if self.isGerman(response.url):
+            # Both helpers are generators; they must be re-yielded or Scrapy
+            # never sees their items and requests.
+            yield from self.findGFonts(response)
+            yield from self.parseOn(response)
+
+    def getCountryOfUrl(self, url):
+        """Return the country code of the host serving *url*, or None.
+
+        The hostname is resolved to an IP address and looked up on ipinfo.io.
+        None is returned when the URL has no host, DNS resolution fails, or
+        the lookup fails or reports no country.
+
+        NOTE: both network calls block while they run.
+        """
+        host = urlparse(url).hostname
+        if not host:
+            return None
+        try:
+            ip = socket.gethostbyname(host)
+            api_url = 'https://ipinfo.io/' + ip + '/json'
+            with urlopen(api_url, timeout=10) as response:
+                data = json.load(response)
+        except (OSError, ValueError):
+            # OSError covers DNS failures (socket.gaierror) and urllib's
+            # URLError; ValueError covers a malformed JSON reply.
+            return None
+        return data.get('country')
+
+    def isCountryGerman(self, url):
+        """Return True when the host of *url* is geolocated in Germany."""
+        return self.getCountryOfUrl(url) == 'DE'
+
+    def isGermanTLD(self, url):
+        """Return True when the host of *url* is under the ``.de`` TLD."""
+        host = urlparse(url).hostname
+        # hostname is None for relative or malformed URLs.
+        return bool(host) and host.endswith('.de')
+
+    def isGerman(self, url):
+        """Return True when *url* is on a ``.de`` host or hosted in Germany."""
+        # Check the TLD first: it is free, the geolocation needs the network.
+        return self.isGermanTLD(url) or self.isCountryGerman(url)
+
+    def findGFonts(self, response):
+        """Yield one item if the page head links to fonts.googleapis.com."""
+        for href in response.css('head link::attr(href)').getall():
+            if 'fonts.googleapis.com' in href:
+                yield {
+                    'url': response.url,
+                    'gfonts': True,
+                }
+                # One item per page is enough, however many fonts it loads.
+                return
+
+    def parseOn(self, response):
+        """Yield requests for every link that leaves the current host."""
+        current_host = urlparse(response.url).hostname
+        for href in response.css('a::attr(href)').getall():
+            found = urlparse(href)
+            # Relative links have no netloc and stay on the current host.
+            if found.netloc and found.hostname != current_host:
+                yield response.follow(href, callback=self.parse)
+
+    def getUrls(self):
+        """Return one ``https://`` seed URL per host listed in sites.txt."""
+        # Resolve relative to this module so the spider runs from any cwd.
+        sites_path = Path(__file__).with_name('sites.txt')
+        with open(sites_path, encoding='utf-8') as sites_file:
+            hosts = (line.strip() for line in sites_file)
+            # Skip blank lines; sites.txt ends with several of them.
+            return ['https://' + host for host in hosts if host]
diff --git a/ger_gfonts/ger_gfonts/spiders/sites.txt b/ger_gfonts/ger_gfonts/spiders/sites.txt
new file mode 100644
index 0000000..e98e4f3
--- /dev/null
+++ b/ger_gfonts/ger_gfonts/spiders/sites.txt
@@ -0,0 +1,106 @@
+porsche.com
+kenhub.com
+trademachines.com
+canyon.com
+pinterest.de
+buecher.de
+conrad.com
+bosch-home.com
+kaercher.com
+mobile.de
+deutschland.de
+google.de
+amazon.de
+einreiseanmeldung.de
+bmw.de
+daad.de
+volkswagen.de
+mtu.de
+aldi.com
+degruyter.com
+web.de
+uni-assist.de
+mercedes-benz.de
+embl.de
+denic.de
+bahn.de
+dedon.de
+giz.de
+berlin.de
+giz.de
+bundestag.de
+deutsche-rentenversicherung.de
+bild.de
+tum.de
+rewe.de
+commerzbank.de
+goethe.de
+spiegel.de
+pangaea.de
+tagesshau.de
+fussball.de
+adac.de
+boys-day.de
+audi.de
+lidl.de
+ndr.de
+visitberlin.de
+gesetze-im-internet.de
+dresden.de
+baden-wuttemberg.de
+hessen.de
+sparkasse.de
+museum-folkwang.de
+posteo.de
+wetter.de
+stern.de
+biontech.de
+mediamarkt.de
+rki.de
+deutsche-bank.de
+duden.de
+magro-aktuell.de
+notebooksbilliger.de
+idealo.de
+wg-gesucht.de
+bundesregierung.de
+ebay.de
+rlp.de
+taz.de
+antolin.westermann.de
+electricbrands.de
+suedzucker.de
+wetteronline.de
+kik.de
+wetterzentrale.de
+service.bund.de
+katholisch.de
+homboldt-foundation.de
+deginvest.de
+comdirect.de
+standaard.be
+jocobs-university.de
+naspa.de
+uni-bonn.de
+zalando.de
+ausbildung.de
+uni-hamburg.de
+auswaertiges-amt.de
+jab.de
+de.dwa.de
+mdc-berlin.de
+dlr.de
+rwth-aachen.de
+studip.uni-goettingen.de
+bzga.de
+gtai.de
+bunte.de
+decathlon.de
+denbi.de
+hannover.de
+ewe.de
+impfdashboard.de
+
+
+
+