Fix return errors.
parent a8944a4442
commit 35d4439d28
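The core fix: writeTrackers() used yield, so calling it only produced a generator object that parse() never iterated, and no items reached the feed export. It now returns a plain dict that parse() yields itself. A minimal sketch of the bug class (illustrative names, not the spider code):

```python
# Hypothetical minimal example of the yield-vs-return mistake this commit fixes.
def items_via_yield():
    yield {'ga': True}      # 'yield' makes this a generator function

def items_via_return():
    return {'ga': True}     # returns the dict directly

item = items_via_yield()    # a generator object; discarding it loses the item
item = items_via_return()   # the dict itself, ready for the caller to yield
```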
.gitignore
@@ -1,2 +1,3 @@
-__pycache__
 gfonts.json
+__pycache__
+*/__pycache__
README.md
@@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
 ## Usage
 
+pip3 install -e .
 scrapy startproject ger_gfonts
 cd ger_gfonts
 scrapy crawl gfonts -O gfonts.json
 
 ## TODO
 
+Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
+
 Start checking for google analytics for all eu websites.
+
+- eu countries tlds: https://www.whois365.com/en/listtld/europe
+
+### meta pixel
+
+<!-- Meta Pixel Code -->
+<script>
+!function(f,b,e,v,n,t,s)
+{if(f.fbq)return;n=f.fbq=function(){n.callMethod?
+n.callMethod.apply(n,arguments):n.queue.push(arguments)};
+if(!f._fbq)f._fbq=n;n.push=n;n.loaded=!0;n.version='2.0';
+n.queue=[];t=b.createElement(e);t.async=!0;
+t.src=v;s=b.getElementsByTagName(e)[0];
+s.parentNode.insertBefore(t,s)}(window, document,'script',
+'https://connect.facebook.net/en_US/fbevents.js');
+fbq('init', '898263220867925');
+fbq('track', 'PageView');
+</script>
+<noscript><img height="1" width="1" style="display:none"
+src="https://www.facebook.com/tr?id=898263220867925&ev=PageView&noscript=1"
+/></noscript>
+<!-- End Meta Pixel Code -->
 
 ## IDEAS
 
 Make it into browserextension that would notify you.
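For orientation: given the fields writeTrackers() returns in the spider diff below, each record that `scrapy crawl gfonts -O gfonts.json` writes would look roughly like this (hypothetical domain and values):

```json
{"domain": "www.example.de", "country": "DE", "gf": true, "ga": true, "gt": false, "mp": false}
```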
ger_gfonts/settings.py
@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'ger_gfonts.spiders'
 #USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
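Note that this flips robots.txt handling off project-wide. The same effect can be scoped to one spider via custom_settings, which is exactly what the start_urls spider further down now does; a sketch with a hypothetical spider:

```python
import scrapy

class ExampleSpider(scrapy.Spider):   # hypothetical, for illustration only
    name = "example"
    # overrides the project-wide ROBOTSTXT_OBEY for this spider alone
    custom_settings = {'ROBOTSTXT_OBEY': False}
```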
ger_gfonts/spiders/gfonts.py
@@ -5,7 +5,8 @@ import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
-from utility.countries import *
+from utility.countries import isEuropean
+import validators
 
 class GFontsSpider(Spider):
     name = "gfonts"
@@ -21,88 +22,73 @@ class GFontsSpider(Spider):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        if self.isEuropean(response.url):
-            self.writeTrackers(response)
-            parsed = urlparse(response.url)
-            self.eu_domains.append(parsed.hostname)
-            self.logNewDomain(response.url)
+        parsed = urlparse(response.url)
+        if isEuropean(response.url):
+            print("URL EUROPEAN: " + response.url)
+            if parsed.hostname not in self.eu_domains:
+                self.eu_domains.append(parsed.hostname)
+                self.logNewDomain(response.url)
+            yield self.writeTrackers(response)
         else:
             print("NOT EUROPEAN: " + response.url)
-        self.parseOn(response)
-
-    def getCountryOfUrl(self, url):
-        ip = socket.gethostbyname(url)
-        api_url = 'https://ipinfo.io/' + ip + '/json'
-        response = urlopen(api_url)
-        data = json.load(response)
-        return data['country']
-
-    def isCountryGerman(self, url):
-        return 'DE' == self.getCountryOfUrl(url)
-
-    def isGermanTLD(self, url):
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        return tld == '.de'
-
-    def isGerman(self, url):
-        if not self.isGermanTLD(url):
-            return self.isCountryGerman(url)
-        return True
-
-    def isEuropean(self, url):
-        eu_tlds = self.getEuTlds()
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        if tld in eu_tlds:
-            return eu_tlds[tld]
-        country = self.getCountryOfUrl(url)
-        if country in eu_tlds.values():
-            return country
-        return False
+        self.checked_domains.append(parsed.hostname)
+        for link in self.parseOn(response):
+            yield scrapy.Request(link, callback=self.parse)
 
     def findGFonts(self, response):
-        for links in response.css('head link'):
-            return 'fonts.googleapis.com' in links.attrib['href']
+        for link in response.css('head link'):
+            try:
+                href = link.attrib['href']
+                if 'fonts.googleapis.com' in href:
+                    return True
+            except:
+                continue
+        return False
 
     def findGTrackers(self, response):
         trackers = { 'ga' : 'www.google-analytics.com',
                      'gt' : 'www.googletagmanager.com'}
-        result = {'ga':0, 'gt':0}
+        result = {'ga':False, 'gt':False}
         for script in response.css('script::text').getall():
-            if script.find(trackers['ga']) > 0:
-                result['ga'] = 1
-            if script.find(trackers['gt']) > 0:
-                result['gt'] = 1
+            if script.find(trackers['ga']) > 0: result['ga'] = True
+            if script.find(trackers['gt']) > 0: result['gt'] = True
         return result
 
     def findMetaPixel(self, response):
         for img in response.css('img'):
-            if img.attrib['src'].find('www.facebook.com/tr?id='):
-                return TRUE
-        return FALSE
+            try:
+                if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True
+            except:
+                continue
+        return False
 
     def writeTrackers(self,response):
         gtrackers = self.findGTrackers(response)
-        yield {
+        return {
             'domain': urlparse(response.url).netloc,
-            'country': self.isEuropean(response.url),
+            'country': isEuropean(response.url),
             'gf': self.findGFonts(response),
             'ga': gtrackers['ga'],
-            'gt': gtrackers['gm'],
+            'gt': gtrackers['gt'],
             'mp': self.findMetaPixel(response)
         }
 
     def parseOn(self, response):
         links = response.css('a');
         print('FOUND: ' + str(len(links)) + ' LINKS')
+        next_urls = []
         for link in links:
-            url = link.attrib['href']
-            # parse valid urls
-            found = urlparse(url)
-            if validators.url(url) and bool(found.netloc):
-                current = urlparse(response.url)
-                if current.hostname != found.hostname:
-                    yield response.follow(url, callback=self.parse)
-            else:
-                print("NOT FOLLOWING: " + url)
+            try:
+                url = link.attrib['href']
+                found = urlparse(url)
+                if validators.url(url) and bool(found.netloc):
+                    current = urlparse(response.url)
+                    if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+                        next_urls.append(url)
+            except:
+                continue
+        print('FOLLOW: ' + str(len(next_urls)) + ' LINKS')
+        return next_urls
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
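The rewritten helpers wrap the attrib['href'] and attrib['src'] lookups in try/except because not every link or img element carries that attribute. They can be exercised off-line against a fabricated response; a sketch (the repo itself ships no such test):

```python
from scrapy.http import HtmlResponse

html = (b'<html><head><link rel="stylesheet" '
        b'href="https://fonts.googleapis.com/css?family=Roboto"></head></html>')
resp = HtmlResponse(url='https://example.de/', body=html, encoding='utf-8')

# findGFonts() never touches self, so the unbound method can be called directly
assert GFontsSpider.findGFonts(None, resp) is True
```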
ger_gfonts/spiders/start_urls.py
@@ -3,9 +3,13 @@ import scrapy
 from scrapy.linkextractors import LinkExtractor
 #from utility.countries import getEuTlds
 from utility import countries
+from urllib.parse import urlencode, urlparse, parse_qs
 
-class firstSpider(scrapy.Spider):
+class startUrls(scrapy.Spider):
     name = "start_urls"
+    custom_settings = {
+        'ROBOTSTXT_OBEY': False
+    }
 
     def __init__(self):
         eu_tlds = countries.getEuTlds()
utility/countries.py
@@ -15,13 +15,14 @@ def getEuTlds():
         '.bg':'BG', '.cy':'CY', '.mt':'MT'}
 
 def getCountryOfUrl(url):
-    ip = socket.gethostbyname(url)
+    parsed = urlparse(url)
+    ip = socket.gethostbyname(parsed.hostname)
     api_url = 'https://ipinfo.io/' + ip + '/json'
     response = urlopen(api_url)
     data = json.load(response)
     return data['country']
 
-def isCountryGerman(self, url):
+def isCountryGerman(url):
     return 'DE' == getCountryOfUrl(url)
 
 def isGermanTLD(url):
@@ -29,7 +30,7 @@ def isGermanTLD(url):
     tld = parts.hostname[-3:]
     return tld == '.de'
 
-def isGerman(self, url):
+def isGerman(url):
     if not isGermanTLD(url):
         return isCountryGerman(url)
     return True
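The getCountryOfUrl() fix matters because socket.gethostbyname() resolves host names, not URLs; passing the full URL raised socket.gaierror. A quick illustration (example.com as a stand-in):

```python
import socket
from urllib.parse import urlparse

url = 'https://example.com/some/page'
# socket.gethostbyname(url)  # would raise socket.gaierror: the scheme/path confuse the resolver
ip = socket.gethostbyname(urlparse(url).hostname)   # resolves plain 'example.com'
print(ip)
```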
New file (path not shown in the capture):
@@ -0,0 +1,45 @@
+import requests
+import urllib
+import pandas as pd
+from requests_html import HTML
+from requests_html import HTMLSession
+
+
+def get_source(url):
+    """Return the source code for the provided URL.
+
+    Args:
+        url (string): URL of the page to scrape.
+
+    Returns:
+        response (object): HTTP response object from requests_html.
+    """
+    try:
+        session = HTMLSession()
+        response = session.get(url)
+        return response
+    except requests.exceptions.RequestException as e:
+        print(e)
+
+
+def scrape_google(query):
+    query = urllib.parse.quote_plus(query)
+    response = get_source("https://www.google.co.uk/search?q=" + query)
+
+    links = list(response.html.absolute_links)
+    google_domains = ('https://www.google.',
+                      'https://google.',
+                      'https://webcache.googleusercontent.',
+                      'http://webcache.googleusercontent.',
+                      'https://policies.google.',
+                      'https://support.google.',
+                      'https://maps.google.')
+
+    for url in links[:]:
+        if url.startswith(google_domains):
+            links.remove(url)
+
+    return links
+
+
+scrape_google('inurl:.si')
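Presumably this script feeds the spider's seed list: the inurl:.si query returns external result links that could be written out for getUrls() to read. A hedged usage sketch (the write to sites.txt is an assumption, not something this commit does):

```python
# Collect candidate .si sites and save them as crawl seeds (hypothetical glue code).
links = scrape_google('inurl:.si')
with open('sites.txt', 'w') as f:   # sites.txt is the file getUrls() reads
    f.write('\n'.join(links))
```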