add domain logging

master
Lio Novelli 2022-02-02 20:07:56 +01:00
parent cf5ac35399
commit b69869a040
2 changed files with 1 addition and 62 deletions

View File

@@ -6,10 +6,10 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
## Usage
scrapy startproject ger_gfonts
cd ger_gfonts
scrapy crawl gfonts -O gfonts.json
## TODO
Start checking for Google Analytics on all EU websites.
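
This TODO is not implemented in the commit below. As a rough sketch only, a helper along the following lines could be added to the spider; the helper name and the script URLs it matches are assumptions, not anything in this repository.

# Hypothetical helper (not in this repository): report whether a crawled page
# appears to load Google Analytics, by matching common gtag.js / analytics.js URLs.
def hasGoogleAnalytics(response):
    for script in response.css('script'):
        src = script.attrib.get('src', '')
        if 'googletagmanager.com/gtag/js' in src or 'google-analytics.com/analytics.js' in src:
            return True
    return False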

View File

@@ -1,61 +0,0 @@
import scrapy
import socket
import json
from urllib.request import urlopen
from urllib.parse import urlparse


class GFontsSpider(scrapy.Spider):
    name = "gfonts"
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def start_requests(self):
        # seed the crawl from the URL list in sites.txt
        for url in self.getUrls():
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # check current url - german or eu (todo)
        # check if fonts.googleapis.com is referenced
        # todo: check if google analytics is found
        if self.isGerman(response.url):
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        # resolve the host and look up its country via ipinfo.io
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        return self.getCountryOfUrl(url) == 'DE'

    def isGermanTLD(self, url):
        hostname = urlparse(url).hostname
        return hostname is not None and hostname.endswith('.de')

    def isGerman(self, url):
        # a .de domain counts as German; otherwise fall back to the IP lookup
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        # follow only absolute links that point to a different host
        for link in response.css('a'):
            url = link.attrib.get('href', '')
            if bool(urlparse(url).netloc):
                current = urlparse(response.url)
                found = urlparse(url)
                if current.hostname != found.hostname:
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        with open('../sites.txt') as sites_file:
            return [line.strip() for line in sites_file if line.strip()]
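
As a quick sanity check on the crawl output (a minimal sketch, not part of this commit): `scrapy crawl gfonts -O gfonts.json` writes the yielded items as a single JSON array, so the sites that were found referencing fonts.googleapis.com can be listed like this.

import json

# Illustrative only: gfonts.json is the feed produced by `scrapy crawl gfonts -O gfonts.json`
with open('gfonts.json') as f:
    records = json.load(f)  # the JSON feed export is one array of items

for record in records:
    if record.get('gfonts'):
        print(record['url'])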