Compare commits

...

4 Commits

Author SHA1 Message Date
Lio Novelli 74e2d03a00 Add logging and url validations 2022-02-02 19:57:31 +01:00
Lio Novelli d3c8b6e45c Small readme improvements. 2022-02-02 19:38:22 +01:00
Lio Novelli 9c176dcbe6 proper file structure 2022-02-02 19:35:03 +01:00
Lio Novelli 376c308638 scrapy start project 2022-02-02 17:23:41 +01:00
14 changed files with 547 additions and 15 deletions

2
.gitignore vendored 100644
View File

@ -0,0 +1,2 @@
__pycache__
gfonts.json

View File

@ -2,16 +2,22 @@
A spider that looks for German pages using Google Fonts hosted by Google.
It also looks for Google Analytics on each website.
Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
## Usage
scrapy crawl gfonts -O gfonts.json
## TODO
Start checking for google analytics for all eu websites.
## IDEAS
Make it into a browser extension that would notify you.
## Checking website origin:
https://ipinfo.io/
## TODO
Start checking for google analytics.
## IDEAS
Make it into a browser extension that would notify you.

View File

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class GerGfontsItem(scrapy.Item):
    """Container for scraped data.

    No fields are declared yet; add them as ``name = scrapy.Field()``
    once the spider starts emitting structured items.
    See https://docs.scrapy.org/en/latest/topics/items.html
    """
    pass

View File

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class GerGfontsSpiderMiddleware:
    """Spider middleware scaffold generated by ``scrapy startproject``.

    Scrapy treats any hook that is missing as a no-op, so every method
    below just reproduces the default pass-through behaviour.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; also subscribes to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Run for each response entering the spider; ``None`` means continue."""
        return None

    def process_spider_output(self, response, result, spider):
        """Forward every Request/item produced by the spider unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Ignore exceptions raised by the spider or earlier middlewares."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged (requests only —
        no items are allowed at this stage, which has no response yet)."""
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class GerGfontsDownloaderMiddleware:
    """Downloader middleware scaffold generated by ``scrapy startproject``.

    All hooks keep Scrapy's default behaviour: requests and responses
    travel through the chain unmodified.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy; also subscribes to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Returning None lets the request continue down the chain; the other
        # options are returning a Response/Request or raising IgnoreRequest.
        return None

    def process_response(self, request, response, spider):
        # Hand the downloaded response back unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None continues normal exception processing; returning a
        # Response or Request object would stop the process_exception() chain.
        return None

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

View File

@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class GerGfontsPipeline:
    """Default item pipeline: forwards every item untouched.

    Enable it via the ITEM_PIPELINES setting before adding real
    processing (cleaning, validation, persistence).
    """

    def process_item(self, item, spider):
        # No transformation yet — return the item so later pipelines see it.
        return item

View File

@ -0,0 +1,88 @@
# Scrapy settings for ger_gfonts project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# --- Active settings ---------------------------------------------------------
BOT_NAME = 'ger_gfonts'
SPIDER_MODULES = ['ger_gfonts.spiders']
NEWSPIDER_MODULE = 'ger_gfonts.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ger_gfonts (+http://www.yourdomain.com)'
# Obey robots.txt rules (the only non-default value set in this project;
# everything below is a commented-out Scrapy default kept for reference)
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ger_gfonts.middlewares.GerGfontsSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ger_gfonts.middlewares.GerGfontsDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ger_gfonts.pipelines.GerGfontsPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,74 @@
import scrapy
import socket
import re
import json
from urllib.request import urlopen
from urllib.parse import urlparse
class GFontsSpider(scrapy.Spider):
    """Crawl outward from a seed list and record German sites using Google Fonts.

    Seed URLs come from ``sites.txt`` (one hostname per line).  For every
    response the spider checks whether the site is German (``.de`` TLD, or
    geo-located in Germany via ipinfo.io as a fallback) and, if so, yields
    one item per ``<link>`` tag referencing fonts.googleapis.com.  It then
    follows links to external domains it has not visited yet.
    """

    name = "gfonts"
    # Domains already crawled; parseOn() uses this to avoid re-queuing them.
    checked_domains = []
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}

    def __init__(self, *args, **kwargs):
        # BUG FIX: call Spider.__init__ so scrapy's own setup still runs.
        super().__init__(*args, **kwargs)
        self.start_urls = self.getUrls()

    def parse(self, response):
        """Default callback: record the domain, emit items, follow links."""
        parsed = urlparse(response.url)
        self.checked_domains.append(parsed.hostname)
        self.logNewDomain(response.url)
        if self.isGerman(response.url):
            # BUG FIX: findGFonts/parseOn are generators — their results were
            # previously discarded, so Scrapy never received any items or
            # follow-up requests.  They must be yielded from here.
            yield from self.findGFonts(response)
        yield from self.parseOn(response)

    def getCountryOfUrl(self, url):
        """Return the ISO country code of *url*'s server via the ipinfo.io API."""
        # BUG FIX: gethostbyname() expects a bare hostname, not a full URL.
        hostname = urlparse(url).hostname or url
        ip = socket.gethostbyname(hostname)
        api_url = 'https://ipinfo.io/' + ip + '/json'
        response = urlopen(api_url)
        data = json.load(response)
        return data['country']

    def isCountryGerman(self, url):
        """True when the server behind *url* is geo-located in Germany."""
        return 'DE' == self.getCountryOfUrl(url)

    def isGermanTLD(self, url):
        """True when *url*'s hostname ends in '.de'."""
        # BUG FIX: hostname may be None for malformed URLs — the old
        # hostname[-3:] slice would raise TypeError in that case.
        hostname = urlparse(url).hostname
        return bool(hostname) and hostname.endswith('.de')

    def isGerman(self, url):
        """German = .de TLD, or (if not) server located in Germany."""
        # Cheap TLD check first; only fall back to the network lookup.
        if not self.isGermanTLD(url):
            return self.isCountryGerman(url)
        return True

    def findGFonts(self, response):
        """Yield an item for each <link> in <head> pointing at fonts.googleapis.com."""
        # BUG FIX: selector was 'head links' — 'links' is not an HTML element —
        # and attrib['href'] raised KeyError for <link> tags without href.
        for link in response.css('head link'):
            href = link.attrib.get('href', '')
            if 'fonts.googleapis.com' in href:
                yield {
                    'url': response.url,
                    'gfonts': True,
                }

    def parseOn(self, response):
        """Follow absolute http(s) links to external, not-yet-visited domains."""
        current = urlparse(response.url)
        for link in response.css('a'):
            url = link.attrib.get('href')
            if not url:
                continue
            # BUG FIX: the old code called validators.url() but 'validators'
            # was never imported (NameError).  Validate with urlparse instead:
            # an absolute http(s) URL with a network location.
            found = urlparse(url)
            if found.scheme in ('http', 'https') and found.netloc:
                if current.hostname != found.hostname and found.hostname not in self.checked_domains:
                    yield response.follow(url, callback=self.parse)

    def getUrls(self):
        """Read seed hostnames from sites.txt and return full https:// URLs."""
        # BUG FIX: readlines() keeps trailing newlines, producing broken URLs
        # like 'https://host\n'; also return a list instead of a one-shot map
        # object so start_urls can be iterated more than once.
        with open('sites.txt') as sites_file:
            return ['https://' + line.strip() for line in sites_file if line.strip()]

    def logNewDomain(self, url):
        # Use the spider's logger instead of print() so output respects
        # Scrapy's LOG_LEVEL configuration.
        self.logger.info('###### %s #######', url)

View File

@ -0,0 +1,106 @@
porsche.com
kenhub.com
trademachines.com
canyon.com
pinterest.de
buecher.de
conrad.com
bosch-home.com
kaercher.com
mobile.de
deutschland.de
google.de
amazon.de
einreiseanmeldung.de
bmw.de
daad.de
volkswagen.de
mtu.de
aldi.com
degruyter.com
web.de
uni-assist.de
mercedes-benz.de
embl.de
denic.de
bahn.de
dedon.de
giz.de
berlin.de
giz.de
bundestag.de
deutsche-rentenversicherung.de
bild.de
tum.de
rewe.de
commerzbank.de
goethe.de
spiegel.de
pangaea.de
tagesschau.de
fussball.de
adac.de
boys-day.de
audi.de
lidl.de
ndr.de
visitberlin.de
gesetze-im-internet.de
dresden.de
baden-wuerttemberg.de
hessen.de
sparkasse.de
museum-folkwang.de
posteo.de
wetter.de
stern.de
biontech.de
mediamarkt.de
rki.de
deutsche-bank.de
duden.de
magro-aktuell.de
notebooksbilliger.de
idealo.de
wg-gesucht.de
bundesregierung.de
ebay.de
rlp.de
taz.de
antolin.westermann.de
electricbrands.de
suedzucker.de
wetteronline.de
kik.de
wetterzentrale.de
service.bund.de
katholisch.de
humboldt-foundation.de
deginvest.de
comdirect.de
standaard.be
jacobs-university.de
naspa.de
uni-bonn.de
zalando.de
ausbildung.de
uni-hamburg.de
auswaertiges-amt.de
jab.de
de.dwa.de
mdc-berlin.de
dlr.de
rwth-aachen.de
studip.uni-goettingen.de
bzga.de
gtai.de
bunte.de
decathlon.de
denbi.de
hannover.de
ewe.de
impfdashboard.de

View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = ger_gfonts.settings
[deploy]
#url = http://localhost:6800/
project = ger_gfonts

View File

@ -0,0 +1,106 @@
porsche.com
kenhub.com
trademachines.com
canyon.com
pinterest.de
buecher.de
conrad.com
bosch-home.com
kaercher.com
mobile.de
deutschland.de
google.de
amazon.de
einreiseanmeldung.de
bmw.de
daad.de
volkswagen.de
mtu.de
aldi.com
degruyter.com
web.de
uni-assist.de
mercedes-benz.de
embl.de
denic.de
bahn.de
dedon.de
giz.de
berlin.de
giz.de
bundestag.de
deutsche-rentenversicherung.de
bild.de
tum.de
rewe.de
commerzbank.de
goethe.de
spiegel.de
pangaea.de
tagesschau.de
fussball.de
adac.de
boys-day.de
audi.de
lidl.de
ndr.de
visitberlin.de
gesetze-im-internet.de
dresden.de
baden-wuerttemberg.de
hessen.de
sparkasse.de
museum-folkwang.de
posteo.de
wetter.de
stern.de
biontech.de
mediamarkt.de
rki.de
deutsche-bank.de
duden.de
magro-aktuell.de
notebooksbilliger.de
idealo.de
wg-gesucht.de
bundesregierung.de
ebay.de
rlp.de
taz.de
antolin.westermann.de
electricbrands.de
suedzucker.de
wetteronline.de
kik.de
wetterzentrale.de
service.bund.de
katholisch.de
humboldt-foundation.de
deginvest.de
comdirect.de
standaard.be
jacobs-university.de
naspa.de
uni-bonn.de
zalando.de
ausbildung.de
uni-hamburg.de
auswaertiges-amt.de
jab.de
de.dwa.de
mdc-berlin.de
dlr.de
rwth-aachen.de
studip.uni-goettingen.de
bzga.de
gtai.de
bunte.de
decathlon.de
denbi.de
hannover.de
ewe.de
impfdashboard.de

View File

@ -7,16 +7,16 @@ from urllib.parse import urlparse
class GFontsSpider(scrapy.Spider):
name = "gfonts"
start_urls = get_urls()
start_urls = self.getUrls()
custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000}
def parse(self, response):
# check current url - german or eu (todo)
# check if api.gfonts.com is found
# todo: check if google analytics is found
if (isGerman(response.url)):
findGFonts(response)
parseOn(response)
if (self.isGerman(response.url)):
self.findGFonts(response)
self.parseOn(response)
def getCountryOfUrl(url):
ip = socket.gethostbyname(url)
@ -26,7 +26,7 @@ class GFontsSpider(scrapy.Spider):
return data['country']
def isCountryGerman(url):
return 'DE' === getCountryOfUrl(url)
return 'DE' === self.getCountryOfUrl(url)
def isGermanTLD(url):
parts = urlparse(url)
@ -34,8 +34,8 @@ class GFontsSpider(scrapy.Spider):
return tld === '.de'
def isGerman(url):
if (!isGermanTLD(url)):
return isCountryGerman(url)
if (!self.isGermanTLD(url)):
return self.isCountryGerman(url)
return TRUE
def findGFonts(response):
@ -45,6 +45,7 @@ class GFontsSpider(scrapy.Spider):
'url': response.url,
'gfonts': TRUE,
}
def parseOn(response):
for links in response.css('a'):
url = links.attrib['href']
@ -53,3 +54,8 @@ class GFontsSpider(scrapy.Spider):
found = urlparse(url)
if (current.hostname != found.hostname):
yield response.follow(url, callback=self.parse)
def getUrls():
with open('../sites.txt') as sites_file:
sites = sites_file.readlines()
return sites

View File

@ -1 +1,2 @@
scrapy
validators