# gfonts/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py

import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse

class GFontsSpider(scrapy.Spider):
    """Crawl German websites and report pages that load Google Fonts.

    Start URLs are read from ``sites.txt`` (one host per line) in the current
    working directory. A page counts as German when its hostname ends in
    ``.de`` or when ipinfo.io reports ``DE`` for the host's IP address. For
    German pages the spider emits an item if the ``<head>`` links to
    ``fonts.googleapis.com`` and then follows links to other hosts.
    """

    name = "gfonts"
    # Stop the crawl after 2000 crawled responses.
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        """Initialise the spider and load the start URLs from ``sites.txt``.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Yield a Google Fonts item and follow-up requests for German pages.

        Non-German pages produce nothing and are not crawled further.
        """
        # check current url - german or eu (todo)
        # todo: check if google analytics is found
        if self.isGerman(response.url):
            # Both helpers are generators; their output has to be re-yielded,
            # otherwise the items and requests are silently dropped.
            yield from self.findGFonts(response)
            yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the country code ipinfo.io reports for the host of *url*.

        *url* may be a full URL or a bare hostname. Returns ``None`` when the
        host cannot be resolved, the ipinfo.io request fails, or the reply
        carries no ``country`` field.

        NOTE(review): DNS resolution and the HTTP request are synchronous
        calls made from within the crawl callback - confirm this is
        acceptable for the crawl's throughput.
        """
        # gethostbyname() needs a hostname, not a URL; fall back to the raw
        # argument so callers passing a bare hostname keep working.
        host = urlparse(url).hostname or url
        try:
            ip = socket.gethostbyname(host)
            api_url = 'https://ipinfo.io/' + ip + '/json'
            with urlopen(api_url, timeout=10) as api_response:
                data = json.load(api_response)
        except (OSError, ValueError) as err:
            # OSError covers socket.gaierror, URLError and timeouts;
            # ValueError covers malformed JSON in the reply.
            self.logger.warning('Country lookup failed for %s: %s', host, err)
            return None
        if not isinstance(data, dict):
            return None
        return data.get('country')

    def isCountryGerman(self, url):
        """Return True if the host of *url* is geolocated in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """Return True if the hostname of *url* ends with ``.de``."""
        # hostname is None for URLs without a network location.
        hostname = urlparse(url).hostname or ''
        return hostname.endswith('.de')

    def isGerman(self, url):
        """Return True if *url* has a ``.de`` hostname or a German IP.

        The TLD check runs first so the network lookup is only made when
        the hostname alone is inconclusive.
        """
        return self.isGermanTLD(url) or self.isCountryGerman(url)

    def findGFonts(self, response):
        """Yield one item if the page head links to fonts.googleapis.com.

        Yields:
            ``{'url': <page url>, 'gfonts': True}`` at most once per page.
        """
        # '::attr(href)' only selects <link> elements that have an href.
        for href in response.css('head link::attr(href)').getall():
            if 'fonts.googleapis.com' in href:
                yield {
                    'url': response.url,
                    'gfonts': True,
                }
                # One item per page is enough; further matches would only
                # produce identical duplicates.
                return

    def parseOn(self, response):
        """Yield requests for absolute links that point to a different host.

        Relative links, same-host links and non-HTTP(S) schemes are skipped.
        """
        current_host = urlparse(response.url).hostname
        for href in response.css('a::attr(href)').getall():
            try:
                found = urlparse(href)
            except ValueError:
                # Malformed href (e.g. broken IPv6 literal); ignore it.
                continue
            if not found.netloc:
                continue
            if found.scheme not in ('', 'http', 'https'):
                continue
            if found.hostname != current_host:
                yield response.follow(href, callback=self.parse)

    def getUrls(self):
        """Return the start URLs built from the hosts listed in ``sites.txt``.

        Each non-blank line is stripped of surrounding whitespace and
        prefixed with ``https://``.
        """
        with open('sites.txt', encoding='utf-8') as sites_file:
            hosts = (line.strip() for line in sites_file)
            return ['https://' + host for host in hosts if host]