# gfonts/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
# (124 lines, 4.3 KiB, Python)
import scrapy
from scrapy.spiders import Spider,CrawlSpider, Rule
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
from utility.countries import *
class GFontsSpider(Spider):
    """Crawl the sites listed in ``sites.txt`` and record, for European
    domains, whether they embed Google Fonts, Google Analytics,
    Google Tag Manager, or the Meta (Facebook) pixel.
    """

    name = "gfonts"
    checked_domains = []
    eu_domains = []
    # Stop after 2000 pages to bound the crawl.
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        # BUG FIX: Scrapy spiders must forward construction args to the
        # base class (it sets up name, custom kwargs, etc.).
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Emit a tracker item for European pages, then follow external links."""
        # check current url - german or eu (todo)
        # check if api.gfonts.com is found
        # @todo: check if google analytics is found
        if self.isEuropean(response.url):
            # BUG FIX: writeTrackers() is a generator; calling it without
            # iterating discards the item. Yield it through.
            yield from self.writeTrackers(response)
            parsed = urlparse(response.url)
            self.eu_domains.append(parsed.hostname)
            self.logNewDomain(response.url)
        else:
            print("NOT EUROPEAN: " + response.url)
        # BUG FIX: parseOn() is also a generator (of follow-requests);
        # it must be yielded from or the crawl never continues.
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Resolve *url*'s host and return its ISO country code via ipinfo.io.

        NOTE(review): performs a blocking DNS + HTTP lookup per call.
        """
        # BUG FIX: gethostbyname() expects a hostname, not a full URL.
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        """True when the geo lookup places *url* in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """True when the hostname ends in the '.de' ccTLD."""
        parts = urlparse(url)
        tld = parts.hostname[-3:]
        return tld == '.de'

    def isGerman(self, url):
        """Cheap TLD check first; fall back to the (network) geo lookup."""
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def isEuropean(self, url):
        """Return the two-letter country code if *url* is European, else False."""
        eu_tlds = self.getEuTlds()
        parts = urlparse(url)
        # All keys in getEuTlds() are 3 characters ('.xx').
        tld = parts.hostname[-3:]
        if tld in eu_tlds:
            return eu_tlds[tld]
        country = self.getCountryOfUrl(url)
        if country in eu_tlds.values():
            return country
        return False

    def findGFonts(self, response):
        """True when any <head> <link> references fonts.googleapis.com."""
        # BUG FIX: the original returned after inspecting only the FIRST
        # link, and raised KeyError on <link> tags without an href.
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                return True
        return False

    def findGTrackers(self, response):
        """Return {'ga': 0|1, 'gt': 0|1} for Analytics / Tag Manager scripts."""
        trackers = {'ga': 'www.google-analytics.com',
                    'gt': 'www.googletagmanager.com'}
        result = {'ga': 0, 'gt': 0}
        for script in response.css('script::text').getall():
            # BUG FIX: str.find() returns 0 for a match at position 0,
            # which '> 0' missed; use 'in' for a plain containment test.
            if trackers['ga'] in script:
                result['ga'] = 1
            if trackers['gt'] in script:
                result['gt'] = 1
        return result

    def findMetaPixel(self, response):
        """True when an <img> points at the Meta (Facebook) tracking pixel."""
        for img in response.css('img'):
            # BUG FIXES: TRUE/FALSE were undefined names; str.find()
            # returns -1 (truthy) on a miss, so the test was inverted;
            # and 'src' may be absent from the tag.
            if 'www.facebook.com/tr?id=' in img.attrib.get('src', ''):
                return True
        return False

    def writeTrackers(self, response):
        """Yield one item summarizing the trackers found on this page."""
        gtrackers = self.findGTrackers(response)
        yield {
            'domain': urlparse(response.url).netloc,
            'country': self.isEuropean(response.url),
            'gf': self.findGFonts(response),
            'ga': gtrackers['ga'],
            # BUG FIX: key was misspelled 'gm', raising KeyError.
            'gt': gtrackers['gt'],
            'mp': self.findMetaPixel(response),
        }

    def parseOn(self, response):
        """Yield follow-requests for valid, absolute, external links."""
        links = response.css('a')
        print('FOUND: ' + str(len(links)) + ' LINKS')
        for link in links:
            url = link.attrib.get('href', '')
            # parse valid urls
            found = urlparse(url)
            # BUG FIX: 'validators' was referenced but never imported;
            # validate with urlparse instead: absolute http(s) URL with host.
            if found.scheme in ('http', 'https') and bool(found.netloc):
                current = urlparse(response.url)
                # Only follow links that leave the current host.
                if current.hostname != found.hostname:
                    yield response.follow(url, callback=self.parse)
            else:
                print("NOT FOLLOWING: " + url)

    def getUrls(self):
        """Read sites.txt and return a list of https:// start URLs."""
        with open('sites.txt') as sites_file:
            # BUG FIX: strip trailing newlines (readlines() keeps them,
            # corrupting the URLs) and skip blank lines; return a list
            # rather than a one-shot map object.
            return ['https://' + line.strip()
                    for line in sites_file if line.strip()]

    def logNewDomain(self, url):
        """Print a loud banner when a new European domain is recorded."""
        print('############################################')
        print('###### ' + url + ' #######')
        print('############################################')

    def getEuTlds(self):
        """Map of (mostly) European ccTLDs to ISO 3166-1 country codes."""
        return {'.ad': 'AD', '.at': 'AT', '.be': 'BE', '.ch': 'CH', '.cz': 'CZ',
                '.de': 'DE', '.dk': 'DK', '.ee': 'EE', '.es': 'ES', '.eu': 'EU',
                '.fi': 'FI', '.fr': 'FR', '.gr': 'GR', '.hr': 'HR', '.hu': 'HU',
                '.ie': 'IE', '.it': 'IT', '.li': 'LI', '.lt': 'LT', '.lu': 'LU',
                '.lv': 'LV', '.nl': 'NL', '.no': 'NO', '.pl': 'PL', '.pt': 'PT',
                '.ro': 'RO', '.se': 'SE', '.si': 'SI', '.sk': 'SK', '.bg': 'BG',
                '.cy': 'CY', '.mt': 'MT'}