Add logging and url validations

master
Lio Novelli 2022-02-02 19:57:31 +01:00
parent d3c8b6e45c
commit 74e2d03a00
2 changed files with 17 additions and 6 deletions

View File

@@ -8,6 +8,7 @@ from urllib.parse import urlparse
 class GFontsSpider(scrapy.Spider):
     name = "gfonts"
     #start_urls = self.getUrls()
+    checked_domains = []
     custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

     def __init__(self):
@@ -16,8 +17,11 @@ class GFontsSpider(scrapy.Spider):
     def parse(self, response):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
-        # todo: check if google analytics is found
-        if (self.isGerman(response.url)):
+        # @todo: check if google analytics is found
+        parsed = urlparse(response.url)
+        self.checked_domains.append(parsed.hostname)
+        self.logNewDomain(response.url)
+        if self.isGerman(response.url):
             self.findGFonts(response)
         self.parseOn(response)

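The three statements added to parse() record every crawled page's hostname and log it before the German check runs, which is what lets parseOn() skip links into already-visited domains further down. urlparse does the hostname extraction; a quick illustration (the URL is an example, not from the commit):

    from urllib.parse import urlparse

    parsed = urlparse('https://example.de/kontakt?lang=de')
    print(parsed.hostname)  # example.de -> this is what gets appended to checked_domains
    print(parsed.netloc)    # example.de (would also include a port, e.g. example.de:8080)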
@@ -37,13 +41,13 @@ class GFontsSpider(scrapy.Spider):
         return tld == '.de'

     def isGerman(self, url):
-        if (not self.isGermanTLD(url)):
+        if not self.isGermanTLD(url):
             return self.isCountryGerman(url)
         return True

     def findGFonts(self, response):
         for links in response.css('head links'):
-            if ('fonts.googleapis.com' in links.attrib['href']):
+            if 'fonts.googleapis.com' in links.attrib['href']:
                 yield {
                     'url': response.url,
                     'gfonts': True,
@@ -52,13 +56,19 @@ class GFontsSpider(scrapy.Spider):
     def parseOn(self, response):
         for links in response.css('a'):
             url = links.attrib['href']
-            if (bool(urlparse(url).netloc)):
+            # parse valid urls
+            if validators.url(url) and bool(urlparse(url).netloc):
                 current = urlparse(response.url)
                 found = urlparse(url)
-                if (current.hostname != found.hostname):
+                if current.hostname != found.hostname and found.hostname not in self.checked_domains:
                     yield response.follow(url, callback=self.parse)

     def getUrls(self):
         with open('sites.txt') as sites_file:
             sites = sites_file.readlines()
             return map(lambda s: 'https://' + s,sites)
+
+    def logNewDomain(self, url):
+        print('############################################')
+        print('###### ' + url + ' #######')
+        print('############################################')
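Together with the parse() changes, the reworked guard in parseOn() now drops malformed hrefs and domains that were already crawled. validators.url() returns True for a well-formed absolute URL and a falsy ValidationFailure object otherwise, so it chains cleanly with "and". Note that none of the hunks above adds an "import validators" line, so the sketch below assumes that import exists at the top of the spider module. A standalone sketch of the combined check (is_followable and the sample URLs are illustrative, not part of the commit):

    from urllib.parse import urlparse

    import validators

    def is_followable(url, current_hostname, checked_domains):
        # validators.url() is truthy only for well-formed absolute URLs,
        # mirroring the first half of the new condition in parseOn()
        if not (validators.url(url) and bool(urlparse(url).netloc)):
            return False
        found = urlparse(url)
        # skip same-site links and domains the spider already visited
        return (found.hostname != current_hostname
                and found.hostname not in checked_domains)

    print(is_followable('https://example.de/', 'example.com', []))              # True
    print(is_followable('/impressum', 'example.com', []))                       # False: relative href
    print(is_followable('https://example.com/a', 'example.com', []))            # False: same host
    print(is_followable('https://example.de/', 'example.com', ['example.de']))  # False: already checked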

View File

@@ -1 +1,2 @@
 scrapy
+validators