Add logging and URL validations
parent
d3c8b6e45c
commit
74e2d03a00
|
@ -8,6 +8,7 @@ from urllib.parse import urlparse
|
|||
class GFontsSpider(scrapy.Spider):
|
||||
name = "gfonts"
|
||||
#start_urls = self.getUrls()
|
||||
checked_domains = []
|
||||
custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
|
||||
|
||||
def __init__(self):
|
||||
|
@ -16,8 +17,11 @@ class GFontsSpider(scrapy.Spider):
|
|||
def parse(self, response):
    """Scrapy callback: record the visited domain and dispatch scans.

    Appends the page's hostname to ``self.checked_domains``, logs a
    banner for the new domain, scans German pages for Google Fonts
    usage, and follows outgoing links.

    FIX: ``findGFonts`` and ``parseOn`` are generator methods (they
    ``yield`` items and requests). The previous version called them as
    plain statements, which merely created generator objects and threw
    them away — Scrapy never received a single item or follow-request.
    They must be re-yielded with ``yield from``.
    """
    # todo: check if google analytics is found
    parsed = urlparse(response.url)
    self.checked_domains.append(parsed.hostname)
    self.logNewDomain(response.url)
    if self.isGerman(response.url):
        # Only German sites are scanned for Google Fonts.
        yield from self.findGFonts(response)
    # NOTE(review): the diff's indentation is ambiguous — this assumes
    # link-following happens for every page, not only German ones.
    # Confirm against the original file.
    yield from self.parseOn(response)
|
||||
|
||||
|
@ -37,13 +41,13 @@ class GFontsSpider(scrapy.Spider):
|
|||
return tld == '.de'
|
||||
|
||||
def isGerman(self, url):
    """Return True when *url* is judged German.

    Accepts immediately on a German TLD; otherwise defers to the
    country check (``isCountryGerman``).
    """
    # Short-circuit: isGermanTLD returns a bool, so `or` yields True on
    # a .de TLD and the isCountryGerman result otherwise — exactly the
    # original guard-clause behavior.
    return self.isGermanTLD(url) or self.isCountryGerman(url)
|
||||
|
||||
def findGFonts(self, response):
|
||||
for links in response.css('head links'):
|
||||
if ('fonts.googleapis.com' in links.attrib['href']):
|
||||
if 'fonts.googleapis.com' in links.attrib['href']:
|
||||
yield {
|
||||
'url': response.url,
|
||||
'gfonts': True,
|
||||
|
@ -52,13 +56,19 @@ class GFontsSpider(scrapy.Spider):
|
|||
def parseOn(self, response):
    """Yield follow-up requests for external, not-yet-checked domains.

    Iterates every ``<a>`` element on the page and follows links that
    (a) are syntactically valid absolute URLs and (b) point to a host
    different from the current page's host that is not already in
    ``self.checked_domains``.

    FIX: ``link.attrib['href']`` raises ``KeyError`` for anchor tags
    without an href attribute (e.g. named anchors); use ``.get()`` and
    skip such links instead of crashing the callback.
    """
    for link in response.css('a'):
        url = link.attrib.get('href')
        if not url:
            # <a> without href (or empty href) — nothing to follow.
            continue
        # parse valid urls — validators.url() rejects malformed input,
        # netloc check keeps only absolute URLs.
        if validators.url(url) and bool(urlparse(url).netloc):
            current = urlparse(response.url)
            found = urlparse(url)
            if current.hostname != found.hostname and found.hostname not in self.checked_domains:
                yield response.follow(url, callback=self.parse)
|
||||
|
||||
def getUrls(self):
    """Read seed domains from ``sites.txt`` and return them as https URLs.

    Returns an iterable of ``'https://<domain>'`` strings.

    FIX: the previous version kept each line's trailing newline
    (``readlines()`` does not strip it), producing invalid URLs such as
    ``'https://example.com\\n'``, and turned blank lines into the bogus
    URL ``'https://'``. Lines are now stripped and empties skipped.
    """
    with open('sites.txt') as sites_file:
        sites = sites_file.readlines()
    # Strip whitespace/newlines and drop blank lines before prefixing.
    return ['https://' + s.strip() for s in sites if s.strip()]
|
||||
|
||||
def logNewDomain(self, url):
    """Print a highly visible console banner for a newly visited URL."""
    # 44-char border, identical bytes to the original literal.
    border = '#' * 44
    print(border)
    print('###### ' + url + ' #######')
    print(border)
|
||||
|
|
|
@ -1 +1,2 @@
|
|||
scrapy
|
||||
validators
|
||||
|
|
Loading…
Reference in New Issue