proper file structure
parent
376c308638
commit
9c176dcbe6
|
@ -0,0 +1,2 @@
|
|||
__pycache__
|
||||
gfonts.txt
|
|
@ -0,0 +1,64 @@
|
|||
import scrapy
|
||||
import socket
|
||||
import re
|
||||
import json
|
||||
from urllib.request import urlopen
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class GFontsSpider(scrapy.Spider):
    """Crawl German websites and report pages that load Google Fonts.

    Seed hosts are read from ``sites.txt`` (one host per line). A page is
    treated as German when its host ends in ``.de`` or when the host's IP
    address is reported as ``DE`` by ipinfo.io. For every German page the
    spider yields one item if a ``<link>`` in ``<head>`` points at
    ``fonts.googleapis.com`` and then follows links to other hosts.
    """

    name = "gfonts"
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then load the seed URLs.

        ``*args``/``**kwargs`` are forwarded unchanged so that spider
        arguments supplied by Scrapy keep working.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Yield Google Fonts findings and follow-up requests for a page.

        Pages that are not considered German produce nothing.
        """
        # todo: check current url - german or eu
        # todo: check if google analytics is found
        if self.isGerman(response.url):
            # Both helpers are generators: they must be iterated, otherwise
            # no item or request ever reaches the Scrapy engine.
            yield from self.findGFonts(response)
            # NOTE(review): the original indentation was lost; link
            # following is assumed to be limited to German pages - confirm.
            yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the country code ipinfo.io reports for the host of *url*.

        *url* may be a full URL or a bare host name. Returns ``None`` when
        the ipinfo.io answer carries no ``country`` field.

        Raises:
            OSError: if the host cannot be resolved or the ipinfo.io
                request fails or times out.
        """
        # gethostbyname() needs a host name, not a complete URL; fall back
        # to the raw argument so bare host names are still accepted.
        host = urlparse(url).hostname or url
        ip = socket.gethostbyname(host)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        # NOTE: this is a blocking call made from inside a Scrapy callback;
        # the timeout keeps a stalled lookup from hanging the crawl forever.
        with urlopen(api_url, timeout=10) as api_response:
            data = json.load(api_response)
        return data.get('country')

    def isCountryGerman(self, url):
        """Return True if the host of *url* is located in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """Return True if the host name of *url* ends in ``.de``."""
        hostname = urlparse(url).hostname
        return hostname is not None and hostname.endswith('.de')

    def isGerman(self, url):
        """Return True if *url* has a ``.de`` host or is hosted in Germany.

        The network lookup only happens when the cheap TLD check fails.
        """
        return self.isGermanTLD(url) or self.isCountryGerman(url)

    def findGFonts(self, response):
        """Yield one item if the page head links to ``fonts.googleapis.com``."""
        for link in response.css('head link'):
            # <link> elements without an href are skipped instead of
            # raising KeyError.
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }
                # One finding per page is enough; further matching links
                # would only produce duplicate items.
                return

    def parseOn(self, response):
        """Yield follow-up requests for absolute links to other hosts."""
        current_host = urlparse(response.url).hostname
        for anchor in response.css('a'):
            href = anchor.attrib.get('href')
            if not href:
                # Anchors without a target (e.g. named anchors).
                continue
            found = urlparse(href)
            # Only links that carry their own host and leave the current
            # site are followed.
            if found.netloc and found.hostname != current_host:
                yield response.follow(href, callback=self.parse)

    def getUrls(self):
        """Return the seed URLs built from the hosts listed in ``sites.txt``.

        Surrounding whitespace (including the line break) is stripped and
        blank lines are ignored; every host is prefixed with ``https://``.
        """
        with open('sites.txt', encoding='utf-8') as sites_file:
            hosts = (line.strip() for line in sites_file)
            return ['https://' + host for host in hosts if host]
|
|
@ -0,0 +1,106 @@
|
|||
porsche.com
|
||||
kenhub.com
|
||||
trademachines.com
|
||||
canyon.com
|
||||
pinterest.de
|
||||
buecher.de
|
||||
conrad.com
|
||||
bosch-home.com
|
||||
kaercher.com
|
||||
mobile.de
|
||||
deutschland.de
|
||||
google.de
|
||||
amazon.de
|
||||
einreiseanmeldung.de
|
||||
bmw.de
|
||||
daad.de
|
||||
volkswagen.de
|
||||
mtu.de
|
||||
aldi.com
|
||||
degruyter.com
|
||||
web.de
|
||||
uni-assist.de
|
||||
mercedes-benz.de
|
||||
embl.de
|
||||
denic.de
|
||||
bahn.de
|
||||
dedon.de
|
||||
giz.de
|
||||
berlin.de
|
||||
giz.de
|
||||
bundestag.de
|
||||
deutsche-rentenversicherung.de
|
||||
bild.de
|
||||
tum.de
|
||||
rewe.de
|
||||
commerzbank.de
|
||||
goethe.de
|
||||
spiegel.de
|
||||
pangaea.de
|
||||
tagesschau.de
|
||||
fussball.de
|
||||
adac.de
|
||||
boys-day.de
|
||||
audi.de
|
||||
lidl.de
|
||||
ndr.de
|
||||
visitberlin.de
|
||||
gesetze-im-internet.de
|
||||
dresden.de
|
||||
baden-wuerttemberg.de
|
||||
hessen.de
|
||||
sparkasse.de
|
||||
museum-folkwang.de
|
||||
posteo.de
|
||||
wetter.de
|
||||
stern.de
|
||||
biontech.de
|
||||
mediamarkt.de
|
||||
rki.de
|
||||
deutsche-bank.de
|
||||
duden.de
|
||||
magro-aktuell.de
|
||||
notebooksbilliger.de
|
||||
idealo.de
|
||||
wg-gesucht.de
|
||||
bundesregierung.de
|
||||
ebay.de
|
||||
rlp.de
|
||||
taz.de
|
||||
antolin.westermann.de
|
||||
electricbrands.de
|
||||
suedzucker.de
|
||||
wetteronline.de
|
||||
kik.de
|
||||
wetterzentrale.de
|
||||
service.bund.de
|
||||
katholisch.de
|
||||
humboldt-foundation.de
|
||||
deginvest.de
|
||||
comdirect.de
|
||||
standaard.be
|
||||
jacobs-university.de
|
||||
naspa.de
|
||||
uni-bonn.de
|
||||
zalando.de
|
||||
ausbildung.de
|
||||
uni-hamburg.de
|
||||
auswaertiges-amt.de
|
||||
jab.de
|
||||
de.dwa.de
|
||||
mdc-berlin.de
|
||||
dlr.de
|
||||
rwth-aachen.de
|
||||
studip.uni-goettingen.de
|
||||
bzga.de
|
||||
gtai.de
|
||||
bunte.de
|
||||
decathlon.de
|
||||
denbi.de
|
||||
hannover.de
|
||||
ewe.de
|
||||
impfdashboard.de
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue