# gfonts/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py

import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
class GFontsSpider(scrapy.Spider):
    """Crawl seed sites and report pages hosted in Germany that embed
    Google Fonts via fonts.googleapis.com <link> tags.

    Start URLs are read from 'sites.txt' (one hostname per line).
    The crawl stops after CLOSESPIDER_PAGECOUNT pages.
    """
    name = "gfonts"
    # Hostnames already visited; used to avoid re-crawling a domain.
    checked_domains = []
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self):
        super().__init__()  # let scrapy.Spider do its own setup
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Record the domain, scan German pages for Google Fonts links,
        and follow external links to not-yet-visited domains."""
        parsed = urlparse(response.url)
        self.checked_domains.append(parsed.hostname)
        self.logNewDomain(response.url)
        if self.isGerman(response.url):
            # BUG FIX: the generator methods were called but their results
            # discarded; their items/requests must be yielded to Scrapy.
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the ISO country code of the host serving *url*, as
        reported by ipinfo.io (empty string when not reported)."""
        # BUG FIX: gethostbyname() needs a bare hostname, not a full URL.
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        # Close the HTTP response deterministically.
        with urlopen(api_url) as response:
            data = json.load(response)
        return data.get('country', '')

    def isCountryGerman(self, url):
        """True when the host serving *url* geolocates to Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """True when the URL's hostname ends with the .de TLD."""
        hostname = urlparse(url).hostname
        # Guard against URLs without a hostname (original would raise).
        return bool(hostname) and hostname.endswith('.de')

    def isGerman(self, url):
        """Treat a URL as German if it has a .de TLD, otherwise fall back
        to a geolocation lookup of its server."""
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        """Yield one item per <link> in <head> referencing
        fonts.googleapis.com."""
        # BUG FIX: selector was 'head links' (no such element) and
        # attrib['href'] raised KeyError for <link> tags without href.
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        """Follow absolute http(s) links that point to hostnames we have
        not visited yet."""
        current = urlparse(response.url)
        for link in response.css('a'):
            url = link.attrib.get('href')
            if not url:
                continue
            found = urlparse(url)
            # BUG FIX: the original called validators.url() without ever
            # importing 'validators' (NameError at runtime); a stdlib
            # scheme + netloc check covers the same "absolute URL" intent.
            if found.scheme in ('http', 'https') and found.netloc:
                if (current.hostname != found.hostname
                        and found.hostname not in self.checked_domains):
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        """Read hostnames from sites.txt (one per line) and return them as
        https:// URLs, skipping blank lines."""
        # BUG FIX: readlines() keeps trailing newlines, which produced
        # broken URLs like 'https://example.com\n'; strip each line.
        with open('sites.txt') as sites_file:
            return ['https://' + line.strip()
                    for line in sites_file if line.strip()]

    def logNewDomain(self, url):
        """Print a loud banner so newly visited domains stand out in the
        crawl log."""
        print('############################################')
        print('###### ' + url + ' #######')
        print('############################################')