Fix return errors.

parent a8944a4442
commit 35d4439d28
@@ -1,2 +1,3 @@
-__pycache__
 gfonts.json
+__pycache__
+*/__pycache__
README.md
@@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
 ## Usage
 
 pip3 install -e .
 scrapy startproject ger_gfonts
 cd ger_gfonts
 scrapy crawl gfonts -O gfonts.json
 
 ## TODO
 
 Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
 
+Start checking for Google Analytics on all EU websites.
+
+- EU country TLDs: https://www.whois365.com/en/listtld/europe
+
+### Meta Pixel
+
+<!-- Meta Pixel Code -->
+<script>
+!function(f,b,e,v,n,t,s)
+{if(f.fbq)return;n=f.fbq=function(){n.callMethod?
+n.callMethod.apply(n,arguments):n.queue.push(arguments)};
+if(!f._fbq)f._fbq=n;n.push=n;n.loaded=!0;n.version='2.0';
+n.queue=[];t=b.createElement(e);t.async=!0;
+t.src=v;s=b.getElementsByTagName(e)[0];
+s.parentNode.insertBefore(t,s)}(window, document,'script',
+'https://connect.facebook.net/en_US/fbevents.js');
+fbq('init', '898263220867925');
+fbq('track', 'PageView');
+</script>
+<noscript><img height="1" width="1" style="display:none"
+src="https://www.facebook.com/tr?id=898263220867925&ev=PageView&noscript=1"
+/></noscript>
+<!-- End Meta Pixel Code -->
+
 ## IDEAS
 
 Make it into a browser extension that would notify you.
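The TODO above points to Scrapy's CrawlSpider. Below is a minimal sketch of that approach, not part of this commit; the class name, seed URL and emitted fields are placeholders only.

```python
# Hypothetical CrawlSpider variant of the gfonts crawler (illustrative only).
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GFontsCrawlSpider(CrawlSpider):
    name = "gfonts_crawl"                  # placeholder name
    start_urls = ["https://example.si/"]   # placeholder seed URL

    # Follow every extracted link and run parse_item on each downloaded page.
    rules = (
        Rule(LinkExtractor(), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Emit one record per page, in the spirit of writeTrackers() below.
        yield {
            "url": response.url,
            "gf": "fonts.googleapis.com" in response.text,
        }
```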
Binary file not shown.
@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'ger_gfonts.spiders'
 #USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
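Flipping ROBOTSTXT_OBEY here disables robots.txt handling for the whole project; the same commit also scopes it per spider (see custom_settings in the start_urls spider further down). A minimal sketch of that per-spider pattern, with a hypothetical spider name:

```python
# Per-spider override instead of the global settings.py switch (illustrative only).
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                  # hypothetical spider
    custom_settings = {
        "ROBOTSTXT_OBEY": False,      # ignore robots.txt for this spider only
    }
```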
Binary file not shown.
Binary file not shown.
@@ -5,7 +5,8 @@ import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
-from utility.countries import *
+from utility.countries import isEuropean
 import validators
 
 class GFontsSpider(Spider):
     name = "gfonts"
 
@@ -21,88 +22,73 @@
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        if self.isEuropean(response.url):
-            self.writeTrackers(response)
+        parsed = urlparse(response.url)
+        if isEuropean(response.url):
             print("URL EUROPEAN: " + response.url)
+            if parsed.hostname not in self.eu_domains:
+                self.eu_domains.append(parsed.hostname)
+                self.logNewDomain(response.url)
+            yield self.writeTrackers(response)
         else:
             print("NOT EUROPEAN: " + response.url)
-        self.parseOn(response)
-
-    def getCountryOfUrl(self, url):
-        ip = socket.gethostbyname(url)
-        api_url = 'https://ipinfo.io/' + ip + '/json'
-        response = urlopen(api_url)
-        data = json.load(response)
-        return data['country']
-
-    def isCountryGerman(self, url):
-        return 'DE' == self.getCountryOfUrl(url)
-
-    def isGermanTLD(self, url):
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        return tld == '.de'
-
-    def isGerman(self, url):
-        if not self.isGermanTLD(url):
-            return self.isCountryGerman(url)
-        return True
-    def isEuropean(self, url):
-        eu_tlds = self.getEuTlds()
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        if tld in eu_tlds:
-            return eu_tlds[tld]
-        country = self.getCountryOfUrl(url)
-        if country in eu_tlds.values():
-            return country
-        return False
+        self.checked_domains.append(parsed.hostname)
+        for link in self.parseOn(response):
+            yield scrapy.Request(link, callback=self.parse)
 
     def findGFonts(self, response):
-        for links in response.css('head link'):
-            return 'fonts.googleapis.com' in links.attrib['href']
+        for link in response.css('head link'):
+            try:
+                href = link.attrib['href']
+                if 'fonts.googleapis.com' in href:
+                    return True
+            except:
+                continue
+        return False
 
     def findGTrackers(self, response):
         trackers = { 'ga' : 'www.google-analytics.com',
                      'gt' : 'www.googletagmanager.com'}
-        result = {'ga':0, 'gt':0}
+        result = {'ga':False, 'gt':False}
         for script in response.css('script::text').getall():
-            if script.find(trackers['ga']) > 0:
-                result['ga'] = 1
-            if script.find(trackers['gt']) > 0:
-                result['gt'] = 1
+            if script.find(trackers['ga']) > 0: result['ga'] = True
+            if script.find(trackers['gt']) > 0: result['gt'] = True
         return result
 
     def findMetaPixel(self, response):
         for img in response.css('img'):
-            if img.attrib['src'].find('www.facebook.com/tr?id='):
-                return TRUE
-        return FALSE
+            try:
+                if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True
+            except:
+                continue
+        return False
 
     def writeTrackers(self,response):
         gtrackers = self.findGTrackers(response)
-        yield {
+        return {
             'domain': urlparse(response.url).netloc,
-            'country': self.isEuropean(response.url),
+            'country': isEuropean(response.url),
             'gf': self.findGFonts(response),
             'ga': gtrackers['ga'],
-            'gt': gtrackers['gm'],
+            'gt': gtrackers['gt'],
             'mp': self.findMetaPixel(response)
         }
 
     def parseOn(self, response):
         links = response.css('a');
         print('FOUND: ' + str(len(links)) + ' LINKS')
+        next_urls = []
         for link in links:
             try:
                 url = link.attrib['href']
                 # parse valid urls
                 found = urlparse(url)
                 if validators.url(url) and bool(found.netloc):
                     current = urlparse(response.url)
-                    if current.hostname != found.hostname:
-                        yield response.follow(url, callback=self.parse)
-                    else:
-                        print("NOT FOLLOWING: " + url)
+                    if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+                        next_urls.append(url)
             except:
                 continue
+        print('FOLLOW: ' + str(len(next_urls)) + ' LINKS')
+        return next_urls
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
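For reference, a hedged offline illustration of what the detection helpers look for, using a fabricated page and scrapy.http.HtmlResponse instead of a live crawl; the URL and markup below are made up:

```python
# Stand-alone illustration of the gf/ga/gt/mp checks (not part of the commit).
from scrapy.http import HtmlResponse

html = b"""
<html><head>
  <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto">
  <script>var src = 'https://www.googletagmanager.com/gtm.js';</script>
</head><body>
  <img src="https://www.facebook.com/tr?id=000&ev=PageView&noscript=1"/>
</body></html>
"""
response = HtmlResponse(url="https://example.de/", body=html, encoding="utf-8")

# The same substring checks findGFonts/findGTrackers/findMetaPixel perform:
gf = any('fonts.googleapis.com' in (l.attrib.get('href') or '') for l in response.css('head link'))
ga = any('www.google-analytics.com' in s for s in response.css('script::text').getall())
gt = any('www.googletagmanager.com' in s for s in response.css('script::text').getall())
mp = any('www.facebook.com/tr?id=' in (i.attrib.get('src') or '') for i in response.css('img'))
print({'gf': gf, 'ga': ga, 'gt': gt, 'mp': mp})   # {'gf': True, 'ga': False, 'gt': True, 'mp': True}
```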
@@ -3,9 +3,13 @@ import scrapy
 from scrapy.linkextractors import LinkExtractor
 #from utility.countries import getEuTlds
+from utility import countries
 from urllib.parse import urlencode, urlparse, parse_qs
 
-class firstSpider(scrapy.Spider):
+class startUrls(scrapy.Spider):
     name = "start_urls"
+    custom_settings = {
+        'ROBOTSTXT_OBEY': False
+    }
 
     def __init__(self):
         eu_tlds = countries.getEuTlds()
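The __init__ shown here only loads the EU TLD map; a hedged guess at how that map could be turned into start URLs follows. The search-URL pattern is illustrative and not taken from this commit:

```python
# Illustrative only: one search URL per EU ccTLD from the getEuTlds() map.
from urllib.parse import urlencode
from utility import countries

start_urls = [
    "https://www.google.com/search?" + urlencode({"q": "inurl:" + tld})
    for tld in countries.getEuTlds()
]
```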
Binary file not shown.
Binary file not shown.
@@ -15,13 +15,14 @@ def getEuTlds():
             '.bg':'BG', '.cy':'CY', '.mt':'MT'}
 
 def getCountryOfUrl(url):
-    ip = socket.gethostbyname(url)
+    parsed = urlparse(url)
+    ip = socket.gethostbyname(parsed.hostname)
     api_url = 'https://ipinfo.io/' + ip + '/json'
     response = urlopen(api_url)
     data = json.load(response)
     return data['country']
 
-def isCountryGerman(self, url):
+def isCountryGerman(url):
     return 'DE' == getCountryOfUrl(url)
 
 def isGermanTLD(url):
@@ -29,7 +30,7 @@ def isGermanTLD(url):
     tld = parts.hostname[-3:]
     return tld == '.de'
 
-def isGerman(self, url):
+def isGerman(url):
     if not isGermanTLD(url):
         return isCountryGerman(url)
     return True
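The change above resolves the hostname instead of the raw URL before the GeoIP lookup. A standalone sketch of the same flow; example.si is a placeholder and ipinfo.io may rate-limit unauthenticated requests:

```python
# What getCountryOfUrl() does after the fix, step by step (illustrative only).
import json
import socket
from urllib.parse import urlparse
from urllib.request import urlopen

url = "https://example.si/"
ip = socket.gethostbyname(urlparse(url).hostname)    # resolve the host, not the full URL
data = json.load(urlopen("https://ipinfo.io/" + ip + "/json"))
print(data.get("country"))                           # e.g. 'SI' for a Slovenian host
```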
@@ -0,0 +1,45 @@
+import requests
+import urllib
+import pandas as pd
+from requests_html import HTML
+from requests_html import HTMLSession
+
+def get_source(url):
+    """Return the source code for the provided URL.
+
+    Args:
+        url (string): URL of the page to scrape.
+
+    Returns:
+        response (object): HTTP response object from requests_html.
+    """
+
+    try:
+        session = HTMLSession()
+        response = session.get(url)
+        return response
+
+    except requests.exceptions.RequestException as e:
+        print(e)
+
+def scrape_google(query):
+
+    query = urllib.parse.quote_plus(query)
+    response = get_source("https://www.google.co.uk/search?q=" + query)
+
+    links = list(response.html.absolute_links)
+    google_domains = ('https://www.google.',
+                      'https://google.',
+                      'https://webcache.googleusercontent.',
+                      'http://webcache.googleusercontent.',
+                      'https://policies.google.',
+                      'https://support.google.',
+                      'https://maps.google.')
+
+    for url in links[:]:
+        if url.startswith(google_domains):
+            links.remove(url)
+
+    return links
+
+scrape_google('inurl:.si')
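A hedged sketch of how the result list might feed the crawler, assuming the intent is to seed sites.txt, the file getUrls() in the gfonts spider opens:

```python
# Illustrative only: write the scraped links to sites.txt, one URL per line.
links = scrape_google('inurl:.si')
with open('sites.txt', 'w') as sites_file:
    sites_file.write('\n'.join(links))
```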