Fix return errors.

master
Lio Novelli 2022-02-06 19:09:10 +01:00
parent a8944a4442
commit 35d4439d28
12 changed files with 128 additions and 66 deletions

.gitignore

@@ -1,2 +1,3 @@
-__pycache__
 gfonts.json
+__pycache__
+*/__pycache__

README.md

@@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
 ## Usage
+pip3 install -e .
 scrapy startproject ger_gfonts
 cd ger_gfonts
 scrapy crawl gfonts -O gfonts.json
 ## TODO
+Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
 Start checking for google analytics for all eu websites.
+- eu countries tlds: https://www.whois365.com/en/listtld/europe
+### meta pixel
+<!-- Meta Pixel Code -->
+<script>
+!function(f,b,e,v,n,t,s)
+{if(f.fbq)return;n=f.fbq=function(){n.callMethod?
+n.callMethod.apply(n,arguments):n.queue.push(arguments)};
+if(!f._fbq)f._fbq=n;n.push=n;n.loaded=!0;n.version='2.0';
+n.queue=[];t=b.createElement(e);t.async=!0;
+t.src=v;s=b.getElementsByTagName(e)[0];
+s.parentNode.insertBefore(t,s)}(window, document,'script',
+'https://connect.facebook.net/en_US/fbevents.js');
+fbq('init', '898263220867925');
+fbq('track', 'PageView');
+</script>
+<noscript><img height="1" width="1" style="display:none"
+src="https://www.facebook.com/tr?id=898263220867925&ev=PageView&noscript=1"
+/></noscript>
+<!-- End Meta Pixel Code -->
 ## IDEAS
 Make it into a browser extension that would notify you.
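The new TODO points at Scrapy's CrawlSpider. A minimal sketch of what that refactor could look like; the class name, start URL, and rules are illustrative, none of this is in the commit:

# Hypothetical CrawlSpider sketch for the TODO above; names are illustrative.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GFontsCrawlSpider(CrawlSpider):
    name = "gfonts_crawl"
    start_urls = ["https://example.si"]
    # Follow every extracted link and hand each fetched page to parse_item.
    rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)

    def parse_item(self, response):
        # The same <head> check the gfonts spider performs by hand.
        hrefs = response.css("head link::attr(href)").getall()
        yield {
            "url": response.url,
            "gf": any("fonts.googleapis.com" in h for h in hrefs),
        }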

settings.py

@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'ger_gfonts.spiders'
 #USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32

View File

@@ -5,7 +5,8 @@ import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
-from utility.countries import *
+from utility.countries import isEuropean
+import validators
 
 class GFontsSpider(Spider):
     name = "gfonts"
@@ -21,88 +22,73 @@ class GFontsSpider(Spider):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        if self.isEuropean(response.url):
-            self.writeTrackers(response)
-            parsed = urlparse(response.url)
-            self.eu_domains.append(parsed.hostname)
-            self.logNewDomain(response.url)
+        parsed = urlparse(response.url)
+        if isEuropean(response.url):
+            print("URL EUROPEAN: " + response.url)
+            if parsed.hostname not in self.eu_domains:
+                self.eu_domains.append(parsed.hostname)
+                self.logNewDomain(response.url)
+            yield self.writeTrackers(response)
         else:
             print("NOT EUROPEAN: " + response.url)
-        self.parseOn(response)
-
-    def getCountryOfUrl(self, url):
-        ip = socket.gethostbyname(url)
-        api_url = 'https://ipinfo.io/' + ip + '/json'
-        response = urlopen(api_url)
-        data = json.load(response)
-        return data['country']
-
-    def isCountryGerman(self, url):
-        return 'DE' == self.getCountryOfUrl(url)
-
-    def isGermanTLD(self, url):
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        return tld == '.de'
-
-    def isGerman(self, url):
-        if not self.isGermanTLD(url):
-            return self.isCountryGerman(url)
-        return True
-
-    def isEuropean(self, url):
-        eu_tlds = self.getEuTlds()
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        if tld in eu_tlds:
-            return eu_tlds[tld]
-        country = self.getCountryOfUrl(url)
-        if country in eu_tlds.values():
-            return country
-        return False
+        self.checked_domains.append(parsed.hostname)
+        for link in self.parseOn(response):
+            yield scrapy.Request(link, callback=self.parse)
 
     def findGFonts(self, response):
-        for links in response.css('head link'):
-            return 'fonts.googleapis.com' in links.attrib['href']
+        for link in response.css('head link'):
+            try:
+                href = link.attrib['href']
+                if 'fonts.googleapis.com' in href:
+                    return True
+            except:
+                continue
+        return False
 
     def findGTrackers(self, response):
         trackers = { 'ga' : 'www.google-analytics.com',
                      'gt' : 'www.googletagmanager.com'}
-        result = {'ga':0, 'gt':0}
+        result = {'ga':False, 'gt':False}
         for script in response.css('script::text').getall():
-            if script.find(trackers['ga']) > 0:
-                result['ga'] = 1
-            if script.find(trackers['gt']) > 0:
-                result['gt'] = 1
+            if script.find(trackers['ga']) > 0: result['ga'] = True
+            if script.find(trackers['gt']) > 0: result['gt'] = True
         return result
 
     def findMetaPixel(self, response):
         for img in response.css('img'):
-            if img.attrib['src'].find('www.facebook.com/tr?id='):
-                return TRUE
-        return FALSE
+            try:
+                if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True
+            except:
+                continue
+        return False
 
     def writeTrackers(self,response):
         gtrackers = self.findGTrackers(response)
-        yield {
+        return {
             'domain': urlparse(response.url).netloc,
-            'country': self.isEuropean(response.url),
+            'country': isEuropean(response.url),
             'gf': self.findGFonts(response),
             'ga': gtrackers['ga'],
-            'gt': gtrackers['gm'],
+            'gt': gtrackers['gt'],
             'mp': self.findMetaPixel(response)
         }
 
     def parseOn(self, response):
         links = response.css('a');
         print('FOUND: ' + str(len(links)) + ' LINKS')
+        next_urls = []
         for link in links:
-            url = link.attrib['href']
-            # parse valid urls
-            found = urlparse(url)
-            if validators.url(url) and bool(found.netloc):
-                current = urlparse(response.url)
-                if current.hostname != found.hostname:
-                    yield response.follow(url, callback=self.parse)
-                else:
-                    print("NOT FOLLOWING: " + url)
+            try:
+                url = link.attrib['href']
+                found = urlparse(url)
+                if validators.url(url) and bool(found.netloc):
+                    current = urlparse(response.url)
+                    if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+                        next_urls.append(url)
+            except:
+                continue
+        print('FOLLOW: ' + str(len(next_urls)) + ' LINKS')
+        return next_urls
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
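With the return errors fixed, writeTrackers returns a plain dict (its stray yield had made it a generator), so parse yields one tracker record per European page plus one scrapy.Request per unvisited external link. A record in gfonts.json should come out roughly like this; the values below are illustrative, not real output:

# Illustrative record shape only; every value here is made up.
{
    'domain': 'example.si',   # netloc of the crawled page
    'country': 'SI',          # isEuropean() returns a country code, or False
    'gf': True,               # a <link href> in <head> contains fonts.googleapis.com
    'ga': False,              # www.google-analytics.com not found in inline scripts
    'gt': True,               # www.googletagmanager.com found in an inline script
    'mp': False               # no facebook.com/tr?id= pixel image
}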

View File

@@ -3,9 +3,13 @@ import scrapy
 from scrapy.linkextractors import LinkExtractor
 #from utility.countries import getEuTlds
 from utility import countries
+from urllib.parse import urlencode, urlparse, parse_qs
 
-class firstSpider(scrapy.Spider):
+class startUrls(scrapy.Spider):
     name = "start_urls"
+    custom_settings = {
+        'ROBOTSTXT_OBEY': False
+    }
 
     def __init__(self):
         eu_tlds = countries.getEuTlds()
Binary file not shown.

utility/countries.py

@@ -15,13 +15,14 @@ def getEuTlds():
             '.bg':'BG', '.cy':'CY', '.mt':'MT'}
 
 def getCountryOfUrl(url):
-    ip = socket.gethostbyname(url)
+    parsed = urlparse(url)
+    ip = socket.gethostbyname(parsed.hostname)
     api_url = 'https://ipinfo.io/' + ip + '/json'
     response = urlopen(api_url)
     data = json.load(response)
     return data['country']
 
-def isCountryGerman(self, url):
+def isCountryGerman(url):
     return 'DE' == getCountryOfUrl(url)
 
 def isGermanTLD(url):

@@ -29,7 +30,7 @@ def isGermanTLD(url):
     tld = parts.hostname[-3:]
     return tld == '.de'
 
-def isGerman(self, url):
+def isGerman(url):
     if not isGermanTLD(url):
         return isCountryGerman(url)
     return True
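The spider now does `from utility.countries import isEuropean`, but the diff never shows isEuropean itself. A plausible reconstruction of the module-level version, based on the method deleted from GFontsSpider above (an inference, not part of this commit):

# Reconstructed from the deleted spider method; assumed to live in utility/countries.py.
def isEuropean(url):
    eu_tlds = getEuTlds()
    parts = urlparse(url)
    tld = parts.hostname[-3:]
    if tld in eu_tlds:
        return eu_tlds[tld]        # EU TLD, e.g. '.si' -> 'SI'
    country = getCountryOfUrl(url)
    if country in eu_tlds.values():
        return country             # EU-hosted site on a generic TLD
    return False                   # not European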

View File

@@ -0,0 +1,45 @@
+import requests
+import urllib
+import pandas as pd
+from requests_html import HTML
+from requests_html import HTMLSession
+
+def get_source(url):
+    """Return the source code for the provided URL.
+
+    Args:
+        url (string): URL of the page to scrape.
+
+    Returns:
+        response (object): HTTP response object from requests_html.
+    """
+    try:
+        session = HTMLSession()
+        response = session.get(url)
+        return response
+    except requests.exceptions.RequestException as e:
+        print(e)
+
+def scrape_google(query):
+    query = urllib.parse.quote_plus(query)
+    response = get_source("https://www.google.co.uk/search?q=" + query)
+    links = list(response.html.absolute_links)
+    google_domains = ('https://www.google.',
+                      'https://google.',
+                      'https://webcache.googleusercontent.',
+                      'http://webcache.googleusercontent.',
+                      'https://policies.google.',
+                      'https://support.google.',
+                      'https://maps.google.')
+    for url in links[:]:
+        if url.startswith(google_domains):
+            links.remove(url)
+    return links
+
+scrape_google('inurl:.si')
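The trailing scrape_google('inurl:.si') call searches Google for .si URLs and filters out Google's own domains, but its result list is currently discarded. A hypothetical way to wire it into the spider, whose getUrls() reads sites.txt (this glue is not in the commit):

# Hypothetical glue: persist the scraped links for GFontsSpider.getUrls() to read.
if __name__ == '__main__':
    links = scrape_google('inurl:.si')
    with open('sites.txt', 'w') as f:
        f.write('\n'.join(links))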