add domain logging
parent
cf5ac35399
commit
b69869a040
@@ -6,10 +6,10 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
## Usage

scrapy startproject ger_gfonts
cd ger_gfonts
scrapy crawl gfonts -O gfonts.json
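The crawl writes gfonts.json as a JSON array, one record per match, in the shape yielded by findGFonts in the spider below. A minimal sketch for inspecting the result (assumes the crawl above has already run):

```python
import json

# -O with a .json extension makes Scrapy export a single JSON array
with open('gfonts.json') as f:
    records = json.load(f)

# each record has the shape {'url': ..., 'gfonts': True}
print(len(records), 'pages loading Google Fonts')
```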
## TODO

Start checking for Google Analytics on all EU websites.
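As a rough starting point for that TODO, a hypothetical helper in the same style as findGFonts below; findGAnalytics is not part of this commit, and the matched hostnames are an assumption about how Analytics is usually embedded:

```python
def findGAnalytics(self, response):
    # hypothetical method for GFontsSpider: flag pages that load
    # Google Analytics or gtag.js via an external script tag
    for script in response.css('script'):
        src = script.attrib.get('src', '')
        if 'google-analytics.com' in src or 'googletagmanager.com' in src:
            yield {'url': response.url, 'ganalytics': True}
            break
```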

@@ -1,61 +0,0 @@
import scrapy
import socket
import json
from urllib.request import urlopen
from urllib.parse import urlparse

class GFontsSpider(scrapy.Spider):
    name = "gfonts"
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def start_requests(self):
        # seed the crawl from the site list; class attributes cannot
        # call instance methods, so start_urls is built here instead
        for url in self.getUrls():
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # check current url - german or eu (todo)
        # check if fonts.googleapis.com is referenced
        # todo: check if google analytics is found
        if self.isGerman(response.url):
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        # resolve the hostname and geolocate its IP via ipinfo.io
        ip = socket.gethostbyname(urlparse(url).hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        return self.getCountryOfUrl(url) == 'DE'

    def isGermanTLD(self, url):
        parts = urlparse(url)
        tld = parts.hostname[-3:]
        return tld == '.de'

    def isGerman(self, url):
        # a .de domain counts immediately; otherwise fall back to geo-IP
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        # report pages that pull stylesheets from Google Fonts
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        # follow only absolute links that lead to other hosts
        for link in response.css('a'):
            url = link.attrib.get('href', '')
            if bool(urlparse(url).netloc):
                current = urlparse(response.url)
                found = urlparse(url)
                if current.hostname != found.hostname:
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        # one start URL per line; strip newlines so requests are valid
        with open('../sites.txt') as sites_file:
            return [line.strip() for line in sites_file if line.strip()]
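The geo-IP lookup can also be exercised on its own against the same ipinfo.io endpoint the spider uses. A quick standalone check (requires network access; example.de is a placeholder hostname, and anonymous ipinfo.io requests are rate-limited):

```python
import json
import socket
from urllib.request import urlopen

ip = socket.gethostbyname('example.de')  # placeholder hostname
with urlopen('https://ipinfo.io/' + ip + '/json') as response:
    data = json.load(response)

print(data.get('country'))  # 'DE' for a German-hosted site
```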