Improve gfonts spider; start work on start_urls from search result spider
parent 1124d81489
commit a8944a4442
5 binary files not shown.
@@ -1,14 +1,17 @@
 import scrapy
+from scrapy.spiders import Spider, CrawlSpider, Rule
 import socket
 import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
+from utility.countries import *
 
-class GFontsSpider(scrapy.Spider):
+class GFontsSpider(Spider):
     name = "gfonts"
     #start_urls = self.getUrls()
     checked_domains = []
+    eu_domains = []
     custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
 
     def __init__(self):
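Note: parseOn (further down) calls validators.url(), presumably the PyPI "validators" package, but no import for it is visible in these hunks, and the wildcard import from utility.countries makes such gaps easy to miss. A more explicit variant of the new imports, as a sketch (the named imports are assumptions based on the utility module added in this commit):

    import validators  # PyPI package 'validators', used by parseOn below
    from utility.countries import getEuTlds, getCountryOfUrl  # explicit instead of wildcard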
@@ -18,12 +21,14 @@ class GFontsSpider(scrapy.Spider):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        parsed = urlparse(response.url)
-        self.checked_domains.append(parsed.hostname)
-        self.logNewDomain(response.url)
-        if self.isGerman(response.url):
-            self.findGFonts(response)
-        self.parseOn(response)
+        if self.isEuropean(response.url):
+            self.writeTrackers(response)
+            parsed = urlparse(response.url)
+            self.eu_domains.append(parsed.hostname)
+            self.logNewDomain(response.url)
+        else:
+            print("NOT EUROPEAN: " + response.url)
+        self.parseOn(response)
 
     def getCountryOfUrl(self, url):
         ip = socket.gethostbyname(url)
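Note: writeTrackers (added below) contains a yield, so it is a generator; calling self.writeTrackers(response) without iterating it silently discards the scraped item. A minimal sketch of a parse that forwards both items and follow-up requests, assuming this commit's method names:

    def parse(self, response):
        if self.isEuropean(response.url):
            parsed = urlparse(response.url)
            self.eu_domains.append(parsed.hostname)
            self.logNewDomain(response.url)
            yield from self.writeTrackers(response)  # actually emit the item
        else:
            print("NOT EUROPEAN: " + response.url)
        yield from self.parseOn(response)  # parseOn yields response.follow() requests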
@@ -44,24 +49,60 @@ class GFontsSpider(scrapy.Spider):
         if not self.isGermanTLD(url):
             return self.isCountryGerman(url)
         return True
 
+    def isEuropean(self, url):
+        eu_tlds = self.getEuTlds()
+        parts = urlparse(url)
+        tld = parts.hostname[-3:]
+        if tld in eu_tlds:
+            return eu_tlds[tld]
+        country = self.getCountryOfUrl(url)
+        if country in eu_tlds.values():
+            return country
+        return False
+
     def findGFonts(self, response):
         for links in response.css('head link'):
-            if 'fonts.googleapis.com' in links.attrib['href']:
-                yield {
-                    'url': response.url,
-                    'gfonts': True,
-                }
+            return 'fonts.googleapis.com' in links.attrib['href']
+
+    def findGTrackers(self, response):
+        trackers = {'ga': 'www.google-analytics.com',
+                    'gt': 'www.googletagmanager.com'}
+        result = {'ga': 0, 'gt': 0}
+        for script in response.css('script::text').getall():
+            if script.find(trackers['ga']) > 0:
+                result['ga'] = 1
+            if script.find(trackers['gt']) > 0:
+                result['gt'] = 1
+        return result
+
+    def findMetaPixel(self, response):
+        for img in response.css('img'):
+            if 'www.facebook.com/tr?id=' in img.attrib.get('src', ''):
+                return True
+        return False
+
+    def writeTrackers(self, response):
+        gtrackers = self.findGTrackers(response)
+        yield {
+            'domain': urlparse(response.url).netloc,
+            'country': self.isEuropean(response.url),
+            'gf': self.findGFonts(response),
+            'ga': gtrackers['ga'],
+            'gt': gtrackers['gt'],
+            'mp': self.findMetaPixel(response)
+        }
 
     def parseOn(self, response):
-        for links in response.css('a'):
-            url = links.attrib['href']
+        links = response.css('a')
+        print('FOUND: ' + str(len(links)) + ' LINKS')
+        for link in links:
+            url = link.attrib['href']
             # parse valid urls
-            if validators.url(url) and bool(urlparse(url).netloc):
+            found = urlparse(url)
+            if validators.url(url) and bool(found.netloc):
                 current = urlparse(response.url)
-                found = urlparse(url)
-                if current.hostname != found.hostname:
+                if current.hostname != found.hostname and found.hostname not in self.checked_domains:
                     yield response.follow(url, callback=self.parse)
+                else:
+                    print("NOT FOLLOWING: " + url)
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
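Note: the new findGFonts returns on the first <link> element in <head>, so a Google Fonts stylesheet that is not the very first link goes undetected, and findGTrackers' script.find(...) > 0 misses a match at index 0 (find returns the index, or -1 when absent). A sketch that checks every link, keeping the same method name:

    def findGFonts(self, response):
        # inspect every <link> in <head>; attrib.get avoids KeyError on href-less links
        for link in response.css('head link'):
            if 'fonts.googleapis.com' in link.attrib.get('href', ''):
                return True
        return False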
@@ -72,3 +113,11 @@ class GFontsSpider(scrapy.Spider):
         print('############################################')
         print('###### ' + url + ' #######')
         print('############################################')
+
+    def getEuTlds(self):
+        return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ',
+            '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI',
+            '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT',
+            '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO',
+            '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK',
+            '.bg':'BG', '.cy':'CY', '.mt':'MT'}
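Note: both copies of the TLD logic slice parts.hostname[-3:], which matches the two-letter ccTLDs in getEuTlds() but raises TypeError when hostname is None (as for relative links) and cannot match longer suffixes. A hedged helper, not part of the commit:

    from urllib.parse import urlparse

    def tld_of(url):
        # everything from the last dot of the hostname; '' when there is no usable host
        host = urlparse(url).hostname or ''
        dot = host.rfind('.')
        return host[dot:] if dot >= 0 else ''

    # tld_of('https://example.de/x') -> '.de'; tld_of('/relative/path') -> ''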
@@ -75,7 +75,7 @@ kik.de
 wetterzentrale.de
 service.bund.de
 katholisch.de
-homboldt-foundation.de
+humboldt-foundation.de
 deginvest.de
 comdirect.de
 standaard.be
@@ -0,0 +1,17 @@
+import scrapy
+#import pandas
+from scrapy.linkextractors import LinkExtractor
+#from utility.countries import getEuTlds
+from utility import countries
+
+class firstSpider(scrapy.Spider):
+    name = "start_urls"
+
+    def __init__(self):
+        eu_tlds = countries.getEuTlds()
+        self.start_urls = map(lambda t: 'https://www.google.com/search?q=inurl%3A' + t, eu_tlds.keys())
+
+    def parse(self, response):
+        xlink = LinkExtractor()
+        for link in xlink.extract_links(response):
+            print(link)
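Note: map() yields a lazy, single-use iterator; Scrapy's start_requests will consume it once, but a list is easier to inspect, and spiders normally call super().__init__(). A sketch of the constructor under those assumptions, producing the same URLs as the map() version:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        eu_tlds = countries.getEuTlds()
        self.start_urls = ['https://www.google.com/search?q=inurl%3A' + tld
                           for tld in eu_tlds]

Google search pages are also heavily rate-limited and often served behind consent interstitials, so this spider may see few organic result links in practice.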
2 binary files not shown.
@@ -0,0 +1,47 @@
+# utility functions for countries
+# import pycountry
+import socket
+from urllib.request import urlopen
+import json
+from urllib.parse import urlparse
+
+def getEuTlds():
+    # map tld to alpha_2
+    return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ',
+        '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI',
+        '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT',
+        '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO',
+        '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK',
+        '.bg':'BG', '.cy':'CY', '.mt':'MT'}
+
+def getCountryOfUrl(url):
+    ip = socket.gethostbyname(url)
+    api_url = 'https://ipinfo.io/' + ip + '/json'
+    response = urlopen(api_url)
+    data = json.load(response)
+    return data['country']
+
+def isCountryGerman(url):
+    return 'DE' == getCountryOfUrl(url)
+
+def isGermanTLD(url):
+    parts = urlparse(url)
+    tld = parts.hostname[-3:]
+    return tld == '.de'
+
+def isGerman(url):
+    if not isGermanTLD(url):
+        return isCountryGerman(url)
+    return True
+
+def isEuropean(url):
+    eu_tlds = getEuTlds()
+    parts = urlparse(url)
+    tld = parts.hostname[-3:]
+    if tld in eu_tlds:
+        return eu_tlds[tld]
+    country = getCountryOfUrl(url)
+    if country in eu_tlds.values():
+        return country
+    return False
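A usage sketch for the new utility module, assuming it is importable as utility.countries the way the spiders above import it. One caveat: getCountryOfUrl passes the full URL to socket.gethostbyname, which expects a bare hostname, so the GeoIP fallback will raise for inputs like 'https://example.com'; the TLD path below avoids that call entirely:

    from utility import countries

    print(countries.getEuTlds()['.de'])                      # -> DE
    print(countries.isGermanTLD('https://example.de/page'))  # -> True
    code = countries.isEuropean('https://example.fr')        # -> 'FR' via TLD, no network call
    if code:
        print('European site, country code:', code)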