# gfonts/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py

import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
class GFontsSpider(scrapy.Spider):
    """Crawl seed sites and report pages hosted in Germany that embed
    Google Fonts via fonts.googleapis.com <link> tags.

    Start URLs are read from 'sites.txt' (one hostname per line).
    The crawl stops after CLOSESPIDER_PAGECOUNT pages.
    """
    name = "gfonts"
    # Hostnames already visited; used to avoid re-crawling a domain.
    checked_domains = []
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self):
        super().__init__()  # let scrapy.Spider do its own setup
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Record the domain, scan German pages for Google Fonts links,
        and follow external links to not-yet-visited domains."""
        parsed = urlparse(response.url)
        self.checked_domains.append(parsed.hostname)
        self.logNewDomain(response.url)
        if self.isGerman(response.url):
            # BUG FIX: the generator methods were called but their results
            # discarded; their items/requests must be yielded to Scrapy.
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the ISO country code of the host serving *url*, as
        reported by ipinfo.io (empty string when not reported)."""
        # BUG FIX: gethostbyname() needs a bare hostname, not a full URL.
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        # Close the HTTP response deterministically.
        with urlopen(api_url) as response:
            data = json.load(response)
        return data.get('country', '')

    def isCountryGerman(self, url):
        """True when the host serving *url* geolocates to Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """True when the URL's hostname ends with the .de TLD."""
        hostname = urlparse(url).hostname
        # Guard against URLs without a hostname (original would raise).
        return bool(hostname) and hostname.endswith('.de')

    def isGerman(self, url):
        """Treat a URL as German if it has a .de TLD, otherwise fall back
        to a geolocation lookup of its server."""
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        """Yield one item per <link> in <head> referencing
        fonts.googleapis.com."""
        # BUG FIX: selector was 'head links' (no such element) and
        # attrib['href'] raised KeyError for <link> tags without href.
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        """Follow absolute http(s) links that point to hostnames we have
        not visited yet."""
        current = urlparse(response.url)
        for link in response.css('a'):
            url = link.attrib.get('href')
            if not url:
                continue
            found = urlparse(url)
            # BUG FIX: the original called validators.url() without ever
            # importing 'validators' (NameError at runtime); a stdlib
            # scheme + netloc check covers the same "absolute URL" intent.
            if found.scheme in ('http', 'https') and found.netloc:
                if (current.hostname != found.hostname
                        and found.hostname not in self.checked_domains):
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        """Read hostnames from sites.txt (one per line) and return them as
        https:// URLs, skipping blank lines."""
        # BUG FIX: readlines() keeps trailing newlines, which produced
        # broken URLs like 'https://example.com\n'; strip each line.
        with open('sites.txt') as sites_file:
            return ['https://' + line.strip()
                    for line in sites_file if line.strip()]

    def logNewDomain(self, url):
        """Print a loud banner so newly visited domains stand out in the
        crawl log."""
        print('############################################')
        print('###### ' + url + ' #######')
        print('############################################')