Add logging and url validations
parent d3c8b6e45c
commit 74e2d03a00
@@ -8,6 +8,7 @@ from urllib.parse import urlparse
 class GFontsSpider(scrapy.Spider):
     name = "gfonts"
     #start_urls = self.getUrls()
+    checked_domains = []
     custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

     def __init__(self):
@@ -16,8 +17,11 @@ class GFontsSpider(scrapy.Spider):
     def parse(self, response):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
-        # todo: check if google analytics is found
-        if (self.isGerman(response.url)):
+        # @todo: check if google analytics is found
+        parsed = urlparse(response.url)
+        self.checked_domains.append(parsed.hostname)
+        self.logNewDomain(response.url)
+        if self.isGerman(response.url):
             self.findGFonts(response)
         self.parseOn(response)

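Note while touching parse(): findGFonts() and parseOn() (in the hunks below) are generators, and calling them without yielding from them discards the items and follow-up requests they produce. A minimal sketch of the likely intent, under that assumption (not part of this commit):

    def parse(self, response):
        parsed = urlparse(response.url)
        self.checked_domains.append(parsed.hostname)
        self.logNewDomain(response.url)
        if self.isGerman(response.url):
            # forward the scraped items from the generator to the item pipeline
            yield from self.findGFonts(response)
        # forward the follow-up requests so the crawl continues past the start URLs
        yield from self.parseOn(response)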
@@ -37,13 +41,13 @@ class GFontsSpider(scrapy.Spider):
         return tld == '.de'

     def isGerman(self, url):
-        if (not self.isGermanTLD(url)):
+        if not self.isGermanTLD(url):
             return self.isCountryGerman(url)
         return True

     def findGFonts(self, response):
         for links in response.css('head links'):
-            if ('fonts.googleapis.com' in links.attrib['href']):
+            if 'fonts.googleapis.com' in links.attrib['href']:
                 yield {
                     'url': response.url,
                     'gfonts': True,
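A side note on the context lines above: response.css('head links') selects <links> elements, which is not a standard HTML tag; Google Fonts stylesheets are referenced via <link href="https://fonts.googleapis.com/...">, and links.attrib['href'] raises a KeyError for tags without an href. A hypothetical tighter selector (a sketch, not part of this commit):

    def findGFonts(self, response):
        # ::attr(href) yields only the href values, so <link> tags without one are skipped
        for href in response.css('head link::attr(href)').getall():
            if 'fonts.googleapis.com' in href:
                yield {'url': response.url, 'gfonts': True}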
@@ -52,13 +56,19 @@ class GFontsSpider(scrapy.Spider):
     def parseOn(self, response):
         for links in response.css('a'):
             url = links.attrib['href']
-            if (bool(urlparse(url).netloc)):
+            # parse valid urls
+            if validators.url(url) and bool(urlparse(url).netloc):
                 current = urlparse(response.url)
                 found = urlparse(url)
-                if (current.hostname != found.hostname):
+                if current.hostname != found.hostname and found.hostname not in self.checked_domains:
                     yield response.follow(url, callback=self.parse)

     def getUrls(self):
         with open('sites.txt') as sites_file:
             sites = sites_file.readlines()
         return map(lambda s: 'https://' + s,sites)
+
+    def logNewDomain(self, url):
+        print('############################################')
+        print('###### ' + url + ' #######')
+        print('############################################')
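The new guard in parseOn() calls validators.url(), but none of the hunks above adds an import for it, so the module presumably needs one next to the existing imports. A sketch of the assumed import block (only the urlparse line is confirmed by the hunk context):

    import scrapy
    import validators  # new dependency backing validators.url() in parseOn()
    from urllib.parse import urlparse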
@@ -1 +1,2 @@
 scrapy
+validators
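For a quick sanity check of the new filter, a standalone sketch assuming the validators package's url() semantics (truthy for well-formed absolute URLs, a falsy failure object otherwise):

    import validators
    from urllib.parse import urlparse

    def is_followable(url):
        # mirrors the condition added to parseOn(): syntactically valid and absolute
        return bool(validators.url(url)) and bool(urlparse(url).netloc)

    print(is_followable('https://example.de/impressum'))   # True
    print(is_followable('/kontakt'))                       # False: relative link, no netloc
    print(is_followable('mailto:info@example.de'))         # False: no netloc either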