Improve gfonts spider; start working on start_urls from search result spider

master
Lio Novelli 2022-02-06 14:39:12 +01:00
parent 1124d81489
commit a8944a4442
12 changed files with 130 additions and 17 deletions

View File

@ -1,14 +1,17 @@
import scrapy import scrapy
from scrapy.spiders import Spider,CrawlSpider, Rule
import socket import socket
import re import re
import json import json
from urllib.request import urlopen from urllib.request import urlopen
from urllib.parse import urlparse from urllib.parse import urlparse
from utility.countries import *
class GFontsSpider(scrapy.Spider): class GFontsSpider(Spider):
name = "gfonts" name = "gfonts"
#start_urls = self.getUrls() #start_urls = self.getUrls()
checked_domains = [] checked_domains = []
eu_domains = []
custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000} custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
def __init__(self): def __init__(self):
@ -18,12 +21,14 @@ class GFontsSpider(scrapy.Spider):
# check current url - german or eu (todo) # check current url - german or eu (todo)
# check if api.gfonts.com is found # check if api.gfonts.com is found
# @todo: check if google analytics is found # @todo: check if google analytics is found
parsed = urlparse(response.url) if self.isEuropean(response.url):
self.checked_domains.append(parsed.hostname) self.writeTrackers(response)
self.logNewDomain(response.url) parsed = urlparse(response.url)
if self.isGerman(response.url): self.eu_domains.append(parsed.hostname)
self.findGFonts(response) self.logNewDomain(response.url)
self.parseOn(response) else:
print("NOT EUROPEAN: " + response.url)
self.parseOn(response)
def getCountryOfUrl(self, url): def getCountryOfUrl(self, url):
ip = socket.gethostbyname(url) ip = socket.gethostbyname(url)
@ -44,24 +49,60 @@ class GFontsSpider(scrapy.Spider):
if not self.isGermanTLD(url): if not self.isGermanTLD(url):
return self.isCountryGerman(url) return self.isCountryGerman(url)
return True return True
def isEuropean(self, url):
    """Return the country code for *url* if it counts as European, else False.

    The host's last three characters are first matched against the TLD table
    from ``getEuTlds()``. If that fails, the host is geolocated through
    ``getCountryOfUrl()`` and the result is accepted when it is one of the
    table's country codes.

    Args:
        url: An absolute URL, or a bare host name.

    Returns:
        The two-letter code (str) on a match, otherwise ``False``.
    """
    tlds = self.getEuTlds()
    # urlparse() yields hostname=None when there is no scheme; fall back to
    # the raw input so bare hosts such as "kik.de" do not raise TypeError.
    host = (urlparse(url).hostname or url).rstrip('.')
    code = tlds.get(host[-3:])
    if code is not None:
        return code
    # Hand over the host, not the full URL: the lookup resolves its argument
    # through DNS, which cannot handle a scheme or a path.
    country = self.getCountryOfUrl(host)
    if country in tlds.values():
        return country
    return False
def findGFonts(self, response):
    """Report whether the page head links to Google Fonts.

    Args:
        response: The page response; only ``response.css('head link')`` is used.

    Returns:
        True if any ``<link>`` in ``<head>`` has an ``href`` containing
        ``fonts.googleapis.com``, otherwise False.
    """
    # Check every <link>, not just the first one, and tolerate elements that
    # carry no href attribute at all.
    return any(
        'fonts.googleapis.com' in link.attrib.get('href', '')
        for link in response.css('head link')
    )
def findGTrackers(self, response):
    """Detect Google tracker host names inside inline script text.

    Args:
        response: The page response; only ``response.css('script::text')``
            is read.

    Returns:
        A dict with keys ``'ga'`` (google-analytics) and ``'gt'``
        (googletagmanager), each 1 if the host name occurs in any inline
        script and 0 otherwise.
    """
    trackers = {'ga': 'www.google-analytics.com',
                'gt': 'www.googletagmanager.com'}
    result = dict.fromkeys(trackers, 0)
    for script in response.css('script::text').getall():
        for key, host in trackers.items():
            # Substring test instead of `find(...) > 0`, which skipped a
            # match sitting at the very start of the script text.
            if host in script:
                result[key] = 1
    return result
def findMetaPixel(self, response):
    """Report whether the page embeds a Facebook tracking-pixel image.

    Args:
        response: The page response; only ``response.css('img')`` is used.

    Returns:
        True if any ``<img>`` has a ``src`` containing
        ``www.facebook.com/tr?id=``, otherwise False.
    """
    # `str.find` returns -1 (truthy) on a miss and 0 (falsy) for a hit at
    # the start, so test membership instead; also skip images without src.
    return any(
        'www.facebook.com/tr?id=' in img.attrib.get('src', '')
        for img in response.css('img')
    )
def writeTrackers(self, response):
    """Yield one item summarising the trackers found on *response*.

    This is a generator: nothing is produced unless the caller iterates it
    (for example with ``yield from``).

    Args:
        response: The page response being analysed.

    Yields:
        A dict with the keys ``domain``, ``country``, ``gf`` (Google Fonts),
        ``ga`` (Google Analytics), ``gt`` (Google Tag Manager) and ``mp``
        (Meta pixel).
    """
    gtrackers = self.findGTrackers(response)
    yield {
        'domain': urlparse(response.url).netloc,
        'country': self.isEuropean(response.url),
        'gf': self.findGFonts(response),
        'ga': gtrackers['ga'],
        # findGTrackers() reports Tag Manager under 'gt'; the former 'gm'
        # lookup raised KeyError.
        'gt': gtrackers['gt'],
        'mp': self.findMetaPixel(response),
    }
def parseOn(self, response): def parseOn(self, response):
for links in response.css('a'): links = response.css('a');
url = links.attrib['href'] print('FOUND: ' + str(len(links)) + ' LINKS')
for link in links:
url = link.attrib['href']
# parse valid urls # parse valid urls
if validators.url(url) and bool(urlparse(url).netloc): found = urlparse(url)
if validators.url(url) and bool(found.netloc):
current = urlparse(response.url) current = urlparse(response.url)
found = urlparse(url) if current.hostname != found.hostname:
if current.hostname != found.hostname and found.hostname not in self.checked_domains:
yield response.follow(url, callback=self.parse) yield response.follow(url, callback=self.parse)
else:
print("NOT FOLLOWING: " + url)
def getUrls(self): def getUrls(self):
with open('sites.txt') as sites_file: with open('sites.txt') as sites_file:
@ -72,3 +113,11 @@ class GFontsSpider(scrapy.Spider):
print('############################################') print('############################################')
print('###### ' + url + ' #######') print('###### ' + url + ' #######')
print('############################################') print('############################################')
def getEuTlds(self):
    """Return the table of TLDs this spider treats as European.

    Returns:
        A new dict mapping each dotted, lower-case TLD (e.g. ``'.de'``) to
        its upper-case two-letter code (e.g. ``'DE'``).
    """
    codes = (
        'AD', 'AT', 'BE', 'CH', 'CZ',
        'DE', 'DK', 'EE', 'ES', 'EU', 'FI',
        'FR', 'GR', 'HR', 'HU', 'IE', 'IT',
        'LI', 'LT', 'LU', 'LV', 'NL', 'NO',
        'PL', 'PT', 'RO', 'SE', 'SI', 'SK',
        'BG', 'CY', 'MT',
    )
    # Every key is simply the dotted, lower-cased form of its code.
    return {'.' + code.lower(): code for code in codes}

View File

@ -75,7 +75,7 @@ kik.de
wetterzentrale.de wetterzentrale.de
service.bund.de service.bund.de
katholisch.de katholisch.de
homboldt-foundation.de humboldt-foundation.de
deginvest.de deginvest.de
comdirect.de comdirect.de
standaard.be standaard.be

View File

@ -0,0 +1,17 @@
import scrapy
#import pandas
from scrapy.linkextractors import LinkExtractor
#from utility.countries import getEuTlds
from utility import countries
class firstSpider(scrapy.Spider):
    """Spider seeded with one Google ``inurl:`` search per European TLD.

    For every link extracted from a search result page, the link is printed.
    """
    name = "start_urls"

    def __init__(self, *args, **kwargs):
        """Build ``start_urls`` from the TLD table in ``utility.countries``.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider``.
        """
        # Run the base initializer so Scrapy's own spider setup happens and
        # spider arguments passed at construction time are accepted.
        super().__init__(*args, **kwargs)
        # A real list instead of a `map` object, which is exhausted after a
        # single pass and cannot be inspected or re-iterated.
        self.start_urls = [
            'https://www.google.com/search?q=inurl%3A' + tld
            for tld in countries.getEuTlds()
        ]

    def parse(self, response):
        """Print every link the default LinkExtractor finds in *response*."""
        for link in LinkExtractor().extract_links(response):
            print(link)

Binary file not shown.

View File

View File

@ -0,0 +1,47 @@
# utility functions for countries
# import pycountry
import socket
from urllib.request import urlopen
import json
from urllib.parse import urlparse
def getEuTlds():
    """Return the mapping from dotted TLD to two-letter (alpha-2) code.

    Returns:
        A new dict such as ``{'.ad': 'AD', '.at': 'AT', ...}`` covering the
        TLDs this project treats as European.
    """
    # map tld to alpha_2
    pairs = (
        ('.ad', 'AD'), ('.at', 'AT'), ('.be', 'BE'), ('.ch', 'CH'),
        ('.cz', 'CZ'), ('.de', 'DE'), ('.dk', 'DK'), ('.ee', 'EE'),
        ('.es', 'ES'), ('.eu', 'EU'), ('.fi', 'FI'), ('.fr', 'FR'),
        ('.gr', 'GR'), ('.hr', 'HR'), ('.hu', 'HU'), ('.ie', 'IE'),
        ('.it', 'IT'), ('.li', 'LI'), ('.lt', 'LT'), ('.lu', 'LU'),
        ('.lv', 'LV'), ('.nl', 'NL'), ('.no', 'NO'), ('.pl', 'PL'),
        ('.pt', 'PT'), ('.ro', 'RO'), ('.se', 'SE'), ('.si', 'SI'),
        ('.sk', 'SK'), ('.bg', 'BG'), ('.cy', 'CY'), ('.mt', 'MT'),
    )
    return dict(pairs)
def getCountryOfUrl(url):
    """Return the country code ipinfo.io reports for the host behind *url*.

    The host is resolved to an IP address, then
    ``https://ipinfo.io/<ip>/json`` is queried and its ``country`` field is
    returned.

    Args:
        url: An absolute URL or a bare host name.

    Returns:
        The value of the ``country`` field in the ipinfo.io JSON answer.

    Raises:
        socket.gaierror: If the host cannot be resolved.
        urllib.error.URLError: If the ipinfo.io request fails.
        KeyError: If the answer carries no ``country`` field.
    """
    # gethostbyname() needs a plain host name; callers hand in full URLs, so
    # strip scheme/path first and fall back to the raw value for bare hosts.
    host = urlparse(url).hostname or url
    ip = socket.gethostbyname(host)
    api_url = 'https://ipinfo.io/' + ip + '/json'
    # Context manager so the HTTP response is closed even if decoding fails.
    with urlopen(api_url) as response:
        data = json.load(response)
    return data['country']
def isCountryGerman(self, url=None):
    """Return True if the host behind the URL is geolocated in Germany.

    This is a module-level function; the ``self`` parameter is kept only so
    existing two-argument calls keep working. Both call shapes are accepted:
    ``isCountryGerman(url)`` and ``isCountryGerman(anything, url)``.

    Args:
        self: The URL when called with one argument; ignored otherwise.
        url: The URL when called with two arguments.

    Returns:
        True if ``getCountryOfUrl`` reports ``'DE'``, otherwise False.
    """
    if url is None:
        # One-argument call: the URL arrived in the first slot.
        url = self
    return getCountryOfUrl(url) == 'DE'
def isGermanTLD(url):
    """Return True if the host of *url* ends in the ``.de`` TLD.

    Args:
        url: An absolute URL or a bare host name.

    Returns:
        True when the host name ends with ``.de``, otherwise False.
    """
    # urlparse() gives hostname=None without a scheme; use the raw input then
    # instead of failing on None[-3:]. A trailing root dot is ignored.
    host = (urlparse(url).hostname or url).rstrip('.')
    return host.endswith('.de')
def isGerman(self, url=None):
    """Return True if the URL has a ``.de`` TLD or is hosted in Germany.

    This is a module-level function; the ``self`` parameter is kept only so
    existing two-argument calls keep working. Both ``isGerman(url)`` and
    ``isGerman(anything, url)`` are accepted.

    Args:
        self: The URL when called with one argument; ignored otherwise.
        url: The URL when called with two arguments.

    Returns:
        True for a ``.de`` host; otherwise the result of the country lookup.
    """
    if url is None:
        # One-argument call: the URL arrived in the first slot.
        url = self
    if isGermanTLD(url):
        return True
    # Pass a placeholder first argument: isCountryGerman is declared with a
    # leading `self` slot, and the former one-argument call raised TypeError.
    return isCountryGerman(None, url)
def isEuropean(url):
    """Return the country code for *url* if it counts as European, else False.

    The host's last three characters are matched against the table from
    ``getEuTlds()``. Failing that, the host is geolocated with
    ``getCountryOfUrl()`` and accepted when the result is one of the table's
    country codes.

    Args:
        url: An absolute URL or a bare host name.

    Returns:
        The two-letter code (str) on a match, otherwise ``False``.
    """
    eu_tlds = getEuTlds()
    # urlparse() gives hostname=None without a scheme; fall back to the raw
    # input so bare hosts do not raise TypeError on slicing.
    host = (urlparse(url).hostname or url).rstrip('.')
    code = eu_tlds.get(host[-3:])
    if code is not None:
        return code
    # Pass the host, not the full URL, to the DNS-based country lookup.
    country = getCountryOfUrl(host)
    if country in eu_tlds.values():
        return country
    return False