Initial commit of searching for google fonts on german websites.
commit
1224833a1e
|
@ -0,0 +1,17 @@
|
||||||
|
# German Google Fonts pages
|
||||||
|
|
||||||
|
A spider that looks for German pages that load Google Fonts hosted on Google's servers.
|
||||||
|
|
||||||
|
It should also look for Google Analytics on each website.
|
||||||
|
|
||||||
|
## Checking website origin:
|
||||||
|
|
||||||
|
https://ipinfo.io/
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
|
||||||
|
Start checking for Google Analytics.
|
||||||
|
|
||||||
|
## IDEAS
|
||||||
|
|
||||||
|
Turn it into a browser extension that notifies you.
|
|
@ -0,0 +1 @@
|
||||||
|
scrapy
|
|
@ -0,0 +1,55 @@
|
||||||
|
import scrapy
|
||||||
|
import socket
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from urllib.request import urlopen
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
class GFontsSpider(scrapy.Spider):
    """Spider that reports German pages loading fonts from fonts.googleapis.com."""

    name = "gfonts"
    # NOTE(review): get_urls is not defined in the visible part of this file;
    # it has to be provided before this class body is executed.
    start_urls = get_urls()
    # Scrapy's CloseSpider extension stops the crawl after this many responses.
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def parse(self, response):
        """Check one downloaded page and continue the crawl from it.

        Yields:
            The items produced by ``findGFonts`` when the page is considered
            German, followed by the follow-up requests produced by
            ``parseOn``.
        """
        # check current url - german or eu (todo)
        # check if api.gfonts.com is found
        # todo: check if google analytics is found
        if isGerman(response.url):
            # findGFonts is a generator: it must be iterated, otherwise its
            # body never runs and no item reaches the Scrapy pipeline.
            yield from findGFonts(response)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether link following was meant to be limited to German pages.
        # Here every page is used to discover further hosts - confirm.
        yield from parseOn(response)
|
||||||
|
|
||||||
|
def getCountryOfUrl(url):
    """Return the country code ipinfo.io reports for the host behind ``url``.

    Args:
        url: A full URL (``https://example.com/page``) or a bare host name.

    Returns:
        The value of the ``country`` field of the ipinfo.io JSON reply, or
        ``None`` when the reply has no such field.

    Raises:
        socket.gaierror: If the host name cannot be resolved.
        urllib.error.URLError: If the ipinfo.io request fails.
    """
    # gethostbyname() only accepts a host name, but callers hand in complete
    # URLs; fall back to the raw argument when it carries no scheme/netloc.
    host = urlparse(url).hostname or url
    ip = socket.gethostbyname(host)
    api_url = 'https://ipinfo.io/' + ip + '/json'
    # The context manager closes the connection; the timeout keeps a stalled
    # lookup from blocking the caller indefinitely.
    with urlopen(api_url, timeout=10) as response:
        data = json.load(response)
    return data.get('country')
|
||||||
|
|
||||||
|
def isCountryGerman(url):
    """Return True when ``getCountryOfUrl`` reports ``'DE'`` for ``url``."""
    # Python compares with ``==``; ``===`` is a syntax error.
    return getCountryOfUrl(url) == 'DE'
|
||||||
|
|
||||||
|
def isGermanTLD(url):
    """Return True when the host name of ``url`` ends in ``.de``.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` for hosts under the ``.de`` top-level domain, ``False``
        otherwise, including URLs that carry no host name at all.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        # Relative or scheme-less URLs have no host to inspect.
        return False
    # Ignore the trailing dot of a fully-qualified name ("example.de.").
    return hostname.rstrip('.').endswith('.de')
|
||||||
|
|
||||||
|
def isGerman(url):
    """Return True when ``url`` has a ``.de`` host or resolves to Germany.

    The TLD check runs first; ``isCountryGerman`` is only called when the
    host is not under ``.de``.
    """
    return isGermanTLD(url) or isCountryGerman(url)
|
||||||
|
|
||||||
|
def findGFonts(response):
    """Yield an item when the page loads a stylesheet from fonts.googleapis.com.

    Args:
        response: The page response; its ``css`` method and ``url``
            attribute are used.

    Yields:
        At most one dict ``{'url': <page url>, 'gfonts': True}`` per page,
        no matter how many matching ``<link>`` elements the head contains.
    """
    # The HTML element is <link>; the former selector 'head links' matched
    # nothing.
    for link in response.css('head link'):
        # Not every <link> carries an href attribute.
        href = link.attrib.get('href', '')
        if 'fonts.googleapis.com' in href:
            yield {
                'url': response.url,
                'gfonts': True,
            }
            # One hit is enough to flag the page; avoid duplicate items.
            return
|
||||||
|
def parseOn(response, callback=None):
    """Yield follow-up requests for links that point to a different host.

    Args:
        response: The page response; its ``css`` and ``follow`` methods and
            ``url`` attribute are used.
        callback: Passed through to ``response.follow``. With the default
            ``None`` Scrapy hands the downloaded page to the spider's
            ``parse`` method, so no ``self`` is needed in this module-level
            function.

    Yields:
        One ``response.follow(...)`` result per absolute link whose host name
        differs from the host name of the current page.
    """
    current_host = urlparse(response.url).hostname
    for anchor in response.css('a'):
        # Anchors without an href (e.g. named anchors) cannot be followed.
        href = anchor.attrib.get('href')
        if not href:
            continue
        try:
            target = urlparse(href)
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) are skipped.
            continue
        # Only absolute links carry a netloc; relative ones stay on this host.
        if target.netloc and target.hostname != current_host:
            yield response.follow(href, callback=callback)
|
Loading…
Reference in New Issue