proper file structure

master
Lio Novelli 2022-02-02 19:35:03 +01:00
parent 376c308638
commit 9c176dcbe6
3 changed files with 172 additions and 0 deletions

2
.gitignore vendored 100644
View File

@ -0,0 +1,2 @@
__pycache__
gfonts.txt

View File

@ -0,0 +1,64 @@
import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
class GFontsSpider(scrapy.Spider):
    """Crawl German websites and report pages that load Google Fonts.

    Seed hosts are read from ``sites.txt`` (one host per line). A page counts
    as German when its hostname ends in ``.de`` or, failing that, when the
    ipinfo.io lookup of its IP address reports the country ``DE``.
    """

    name = "gfonts"
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and load the seed URLs."""
        # The base initialiser must run so that spider arguments passed by
        # Scrapy are accepted and applied.
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Yield Google Fonts findings and follow-up requests for a page.

        Only German pages are inspected and used as a source of new links.
        """
        # TODO: check if Google Analytics is found as well.
        if self.isGerman(response.url):
            # Both helpers are generators: their output has to be re-yielded,
            # otherwise calling them produces nothing at all.
            yield from self.findGFonts(response)
            yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the country code ipinfo.io reports for the host of *url*.

        *url* may be a full URL or a bare hostname.

        Raises:
            OSError: if the hostname cannot be resolved or the lookup
                request fails.
            KeyError: if the lookup response contains no ``country`` field.
        """
        # gethostbyname() expects a hostname, not a URL with scheme and path.
        host = urlparse(url).hostname or url
        ip = socket.gethostbyname(host)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        # NOTE: this is a blocking network call made from a spider callback.
        with urlopen(api_url) as lookup:
            data = json.load(lookup)
        return data['country']

    def isCountryGerman(self, url):
        """Return True when the host of *url* is located in Germany."""
        return self.getCountryOfUrl(url) == 'DE'

    def isGermanTLD(self, url):
        """Return True when the hostname of *url* ends in ``.de``."""
        # hostname is None for URLs without a network location.
        hostname = urlparse(url).hostname or ''
        return hostname.endswith('.de')

    def isGerman(self, url):
        """Return True for a ``.de`` hostname or a host located in Germany."""
        # The TLD test is local, so the network lookup only runs as fallback.
        return self.isGermanTLD(url) or self.isCountryGerman(url)

    def findGFonts(self, response):
        """Yield one item if the page head links to fonts.googleapis.com."""
        for link in response.css('head link'):
            # <link> elements without an href attribute are skipped.
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }
                # One finding per page is enough; further matches would only
                # repeat the same item.
                return

    def parseOn(self, response):
        """Yield requests for links that point to a different host."""
        current_host = urlparse(response.url).hostname
        for anchor in response.css('a'):
            href = anchor.attrib.get('href')
            if not href:
                continue
            try:
                found = urlparse(href)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): ignore it.
                continue
            # Relative links have no netloc and stay on the current host.
            if found.netloc and found.hostname != current_host:
                yield response.follow(href, callback=self.parse)

    def getUrls(self):
        """Return the seed URLs built from the hosts listed in sites.txt."""
        with open('sites.txt') as sites_file:
            hosts = [line.strip() for line in sites_file]
        # Blank lines are dropped; stripping removes the trailing newline that
        # would otherwise end up inside every URL.
        return ['https://' + host for host in hosts if host]

View File

@ -0,0 +1,106 @@
porsche.com
kenhub.com
trademachines.com
canyon.com
pinterest.de
buecher.de
conrad.com
bosch-home.com
kaercher.com
mobile.de
deutschland.de
google.de
amazon.de
einreiseanmeldung.de
bmw.de
daad.de
volkswagen.de
mtu.de
aldi.com
degruyter.com
web.de
uni-assist.de
mercedes-benz.de
embl.de
denic.de
bahn.de
dedon.de
giz.de
berlin.de
giz.de
bundestag.de
deutsche-rentenversicherung.de
bild.de
tum.de
rewe.de
commerzbank.de
goethe.de
spiegel.de
pangaea.de
tagesschau.de
fussball.de
adac.de
boys-day.de
audi.de
lidl.de
ndr.de
visitberlin.de
gesetze-im-internet.de
dresden.de
baden-wuerttemberg.de
hessen.de
sparkasse.de
museum-folkwang.de
posteo.de
wetter.de
stern.de
biontech.de
mediamarkt.de
rki.de
deutsche-bank.de
duden.de
magro-aktuell.de
notebooksbilliger.de
idealo.de
wg-gesucht.de
bundesregierung.de
ebay.de
rlp.de
taz.de
antolin.westermann.de
electricbrands.de
suedzucker.de
wetteronline.de
kik.de
wetterzentrale.de
service.bund.de
katholisch.de
humboldt-foundation.de
deginvest.de
comdirect.de
standaard.be
jacobs-university.de
naspa.de
uni-bonn.de
zalando.de
ausbildung.de
uni-hamburg.de
auswaertiges-amt.de
jab.de
de.dwa.de
mdc-berlin.de
dlr.de
rwth-aachen.de
studip.uni-goettingen.de
bzga.de
gtai.de
bunte.de
decathlon.de
denbi.de
hannover.de
ewe.de
impfdashboard.de