# gfonts/ger_gfonts/spiders/gfonts_spider.py

import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse

class GFontsSpider(scrapy.Spider):
    """Crawl German websites and report pages that load Google Fonts.

    A page counts as German when its hostname ends in ``.de`` or, failing
    that, when the IP address of its host is located in Germany according
    to the ipinfo.io API.  German pages are scanned for stylesheet links to
    ``fonts.googleapis.com`` and their links to other hosts are followed.
    """

    name = "gfonts"
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        """Initialise the spider and load the start URLs from disk.

        The URLs are loaded here rather than in the class body because
        ``self`` does not exist while the class body is being executed.
        """
        super().__init__(*args, **kwargs)
        # Host name -> country code (or None); avoids repeating the DNS and
        # ipinfo.io lookups for every page of the same site.
        self._country_cache = {}
        if not getattr(self, 'start_urls', None):
            self.start_urls = self.getUrls()

    def parse(self, response):
        """Yield a Google Fonts item and follow-up requests for German pages.

        Pages that are not considered German are neither scanned nor used
        as a source of further links.
        """
        # check current url - german or eu (todo)
        # check if fonts.googleapis.com is found
        # todo: check if google analytics is found
        if self.isGerman(response.url):
            # Both helpers are generators; their output must be re-yielded,
            # otherwise Scrapy never sees the items and requests.
            yield from self.findGFonts(response)
            yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the country code ipinfo.io reports for the host of *url*.

        *url* may be a full URL or a bare host name.  Returns ``None`` when
        the host cannot be resolved, the API call fails, or the reply has
        no ``country`` field.

        NOTE: the DNS lookup and the HTTP call are blocking.
        """
        # gethostbyname() needs a host name, not a complete URL.
        host = urlparse(url).hostname or url
        if host in self._country_cache:
            return self._country_cache[host]

        country = None
        try:
            ip = socket.gethostbyname(host)
            api_url = 'https://ipinfo.io/' + ip + '/json'
            with urlopen(api_url, timeout=10) as api_response:
                country = json.load(api_response).get('country')
        except (OSError, ValueError) as err:
            # OSError covers DNS failures, URLError and timeouts;
            # ValueError covers malformed JSON and un-encodable host names.
            self.logger.warning(
                'Country lookup failed for %s: %s', host, err)

        self._country_cache[host] = country
        return country

    def isCountryGerman(self, url):
        """Return True when the host of *url* is located in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """Return True when the hostname of *url* ends in ``.de``."""
        # hostname is None for URLs without a network location.
        hostname = urlparse(url).hostname or ''
        return hostname.endswith('.de')

    def isGerman(self, url):
        """Return True for a ``.de`` host or a host located in Germany."""
        # The cheap TLD check runs first so the network lookup is only
        # needed for non-.de hosts.
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        """Yield one item when the page head links to Google Fonts."""
        for href in response.css('head link::attr(href)').getall():
            if 'fonts.googleapis.com' in href:
                yield {
                    'url': response.url,
                    'gfonts': True,
                }
                # Further matches would only produce identical items.
                return

    def parseOn(self, response):
        """Yield requests for every absolute link that leaves this host."""
        current_host = urlparse(response.url).hostname
        for href in response.css('a::attr(href)').getall():
            found = urlparse(href.strip())
            # Relative links have no netloc and stay on the current host.
            if found.netloc and found.hostname != current_host:
                yield response.follow(href.strip(), callback=self.parse)

    def getUrls(self):
        """Return the start URLs listed in ``../sites.txt``, one per line.

        Surrounding whitespace (including the line break) is removed and
        blank lines are skipped.  The path is relative to the current
        working directory.
        """
        with open('../sites.txt') as sites_file:
            return [line.strip() for line in sites_file if line.strip()]