diff --git a/.gitignore b/.gitignore
index b096d08..63cc484 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
-__pycache__
gfonts.json
+__pycache__
+*/__pycache__
diff --git a/README.md b/README.md
index 16ccdf6..83a6be2 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
## Usage
+ pip3 install -e .
scrapy startproject ger_gfonts
cd ger_gfonts
scrapy crawl gfonts -O gfonts.json
## TODO
+- Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
+
Start checking for google analytics for all eu websites.
+- eu countries tlds: https://www.whois365.com/en/listtld/europe
+
+### Meta Pixel
+
+
+
+
+
+
## IDEAS
Make it into browserextension that would notify you.
diff --git a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc
index 912c08d..4b31e16 100644
Binary files a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc and b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/settings.py b/ger_gfonts/ger_gfonts/settings.py
index facd1d3..0442b11 100644
--- a/ger_gfonts/ger_gfonts/settings.py
+++ b/ger_gfonts/ger_gfonts/settings.py
@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'ger_gfonts.spiders'
#USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc
index 1dc051d..69a306d 100644
Binary files a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc and b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc
index 76470eb..ca557f2 100644
Binary files a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc and b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc differ
diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
index 14be6ba..1bbed47 100644
--- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
+++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
@@ -5,7 +5,8 @@ import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
-from utility.countries import *
+from utility.countries import isEuropean
+import validators
class GFontsSpider(Spider):
name = "gfonts"
@@ -21,88 +22,73 @@ class GFontsSpider(Spider):
# check current url - german or eu (todo)
# check if api.gfonts.com is found
# @todo: check if google analytics is found
- if self.isEuropean(response.url):
- self.writeTrackers(response)
- parsed = urlparse(response.url)
- self.eu_domains.append(parsed.hostname)
- self.logNewDomain(response.url)
+ parsed = urlparse(response.url)
+ if isEuropean(response.url):
+ print("URL EUROPEAN: " + response.url)
+ if parsed.hostname not in self.eu_domains:
+ self.eu_domains.append(parsed.hostname)
+ self.logNewDomain(response.url)
+ yield self.writeTrackers(response)
else:
print("NOT EUROPEAN: " + response.url)
- self.parseOn(response)
-
- def getCountryOfUrl(self, url):
- ip = socket.gethostbyname(url)
- api_url = 'https://ipinfo.io/' + ip + '/json'
- response = urlopen(api_url)
- data = json.load(response)
- return data['country']
-
- def isCountryGerman(self, url):
- return 'DE' == self.getCountryOfUrl(url)
-
- def isGermanTLD(self, url):
- parts = urlparse(url)
- tld = parts.hostname[-3:]
- return tld == '.de'
-
- def isGerman(self, url):
- if not self.isGermanTLD(url):
- return self.isCountryGerman(url)
- return True
- def isEuropean(self, url):
- eu_tlds = self.getEuTlds()
- parts = urlparse(url)
- tld = parts.hostname[-3:]
- if tld in eu_tlds:
- return eu_tlds[tld]
- country = self.getCountryOfUrl(url)
- if country in eu_tlds.values():
- return country
- return False
+ self.checked_domains.append(parsed.hostname)
+ for link in self.parseOn(response):
+ yield scrapy.Request(link, callback=self.parse)
def findGFonts(self, response):
- for links in response.css('head link'):
- return 'fonts.googleapis.com' in links.attrib['href']
+ for link in response.css('head link'):
+ try:
+ href = link.attrib['href']
+ if 'fonts.googleapis.com' in href:
+ return True
+ except:
+ continue
+ return False
+
def findGTrackers(self, response):
trackers = { 'ga' : 'www.google-analytics.com',
'gt' : 'www.googletagmanager.com'}
- result = {'ga':0, 'gt':0}
+ result = {'ga':False, 'gt':False}
for script in response.css('script::text').getall():
- if script.find(trackers['ga']) > 0:
- result['ga'] = 1
- if script.find(trackers['gt']) > 0:
- result['gt'] = 1
+ if script.find(trackers['ga']) > 0: result['ga'] = True
+ if script.find(trackers['gt']) > 0: result['gt'] = True
return result
+
def findMetaPixel(self, response):
for img in response.css('img'):
- if img.attrib['src'].find('www.facebook.com/tr?id='):
- return TRUE
- return FALSE
+ try:
+ if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True
+ except:
+ continue
+ return False
+
def writeTrackers(self,response):
gtrackers = self.findGTrackers(response)
- yield {
+ return {
'domain': urlparse(response.url).netloc,
- 'country': self.isEuropean(response.url),
+ 'country': isEuropean(response.url),
'gf': self.findGFonts(response),
'ga': gtrackers['ga'],
- 'gt': gtrackers['gm'],
+ 'gt': gtrackers['gt'],
'mp': self.findMetaPixel(response)
}
-
def parseOn(self, response):
links = response.css('a');
print('FOUND: ' + str(len(links)) + ' LINKS')
+ next_urls = []
for link in links:
- url = link.attrib['href']
- # parse valid urls
- found = urlparse(url)
- if validators.url(url) and bool(found.netloc):
- current = urlparse(response.url)
- if current.hostname != found.hostname:
- yield response.follow(url, callback=self.parse)
- else:
- print("NOT FOLLOWING: " + url)
+ try:
+ url = link.attrib['href']
+ found = urlparse(url)
+ if validators.url(url) and bool(found.netloc):
+ current = urlparse(response.url)
+ if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+ next_urls.append(url)
+ except:
+ continue
+ print('FOLLOW: ' + str(len(next_urls)) + ' LINKS')
+ return next_urls
def getUrls(self):
with open('sites.txt') as sites_file:
diff --git a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
index dc0028f..062cccf 100644
--- a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
+++ b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
@@ -3,9 +3,13 @@ import scrapy
from scrapy.linkextractors import LinkExtractor
#from utility.countries import getEuTlds
from utility import countries
+from urllib.parse import urlencode, urlparse, parse_qs
-class firstSpider(scrapy.Spider):
+class startUrls(scrapy.Spider):
name = "start_urls"
+ custom_settings = {
+ 'ROBOTSTXT_OBEY': False
+ }
def __init__(self):
eu_tlds = countries.getEuTlds()
diff --git a/ger_gfonts/utility/.countries.py.swp b/ger_gfonts/utility/.countries.py.swp
deleted file mode 100644
index 6aec163..0000000
Binary files a/ger_gfonts/utility/.countries.py.swp and /dev/null differ
diff --git a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc
index c102c3d..2576808 100644
Binary files a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc and b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc differ
diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py
index 7f64de3..17634f0 100644
--- a/ger_gfonts/utility/countries.py
+++ b/ger_gfonts/utility/countries.py
@@ -15,13 +15,14 @@ def getEuTlds():
'.bg':'BG', '.cy':'CY', '.mt':'MT'}
def getCountryOfUrl(url):
- ip = socket.gethostbyname(url)
+ parsed = urlparse(url)
+ ip = socket.gethostbyname(parsed.hostname)
api_url = 'https://ipinfo.io/' + ip + '/json'
response = urlopen(api_url)
data = json.load(response)
return data['country']
-def isCountryGerman(self, url):
+def isCountryGerman(url):
return 'DE' == getCountryOfUrl(url)
def isGermanTLD(url):
@@ -29,7 +30,7 @@ def isGermanTLD(url):
tld = parts.hostname[-3:]
return tld == '.de'
-def isGerman(self, url):
+def isGerman(url):
if not isGermanTLD(url):
return isCountryGerman(url)
return True
diff --git a/ger_gfonts/utility/google_scrapy.py b/ger_gfonts/utility/google_scrapy.py
new file mode 100644
index 0000000..fa5604b
--- /dev/null
+++ b/ger_gfonts/utility/google_scrapy.py
@@ -0,0 +1,45 @@
+import requests
+import urllib
+import pandas as pd
+from requests_html import HTML
+from requests_html import HTMLSession
+
+def get_source(url):
+ """Return the source code for the provided URL.
+
+ Args:
+ url (string): URL of the page to scrape.
+
+ Returns:
+ response (object): HTTP response object from requests_html.
+ """
+
+ try:
+ session = HTMLSession()
+ response = session.get(url)
+ return response
+
+ except requests.exceptions.RequestException as e:
+ print(e)
+
+def scrape_google(query):
+
+ query = urllib.parse.quote_plus(query)
+ response = get_source("https://www.google.co.uk/search?q=" + query)
+
+ links = list(response.html.absolute_links)
+ google_domains = ('https://www.google.',
+ 'https://google.',
+ 'https://webcache.googleusercontent.',
+ 'http://webcache.googleusercontent.',
+ 'https://policies.google.',
+ 'https://support.google.',
+ 'https://maps.google.')
+
+ for url in links[:]:
+ if url.startswith(google_domains):
+ links.remove(url)
+
+ return links
+
+scrape_google('inurl:.si')