From 35d4439d28f500fccd965df0035a7f215eaa9d58 Mon Sep 17 00:00:00 2001
From: Lio Novelli
Date: Sun, 6 Feb 2022 19:09:10 +0100
Subject: [PATCH] Fix return errors.

---
 .gitignore | 3 +-
 README.md | 25 +++++
 .../__pycache__/settings.cpython-37.pyc | Bin 271 -> 267 bytes
 ger_gfonts/ger_gfonts/settings.py | 2 +-
 .../__pycache__/gfonts_spider.cpython-37.pyc | Bin 4834 -> 4077 bytes
 .../start_urls_spider.cpython-37.pyc | Bin 1023 -> 1143 bytes
 .../ger_gfonts/spiders/gfonts_spider.py | 106 ++++++++----------
 .../ger_gfonts/spiders/start_urls_spider.py | 6 +-
 ger_gfonts/utility/.countries.py.swp | Bin 12288 -> 0 bytes
 .../__pycache__/countries.cpython-37.pyc | Bin 1673 -> 1692 bytes
 ger_gfonts/utility/countries.py | 7 +-
 ger_gfonts/utility/google_scrapy.py | 45 ++++++++
 12 files changed, 128 insertions(+), 66 deletions(-)
 delete mode 100644 ger_gfonts/utility/.countries.py.swp
 create mode 100644 ger_gfonts/utility/google_scrapy.py

diff --git a/.gitignore b/.gitignore
index b096d08..63cc484 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
-__pycache__
 gfonts.json
+__pycache__
+*/__pycache__

diff --git a/README.md b/README.md
index 16ccdf6..83a6be2 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,39 @@ Based on: https://docs.scrapy.org/en/latest/intro/tutorial.html
 
 ## Usage
 
+    pip3 install -e .
     scrapy startproject ger_gfonts
     cd ger_gfonts
     scrapy crawl gfonts -O gfonts.json
 
 ## TODO
 
+!Implement a crawling spider: https://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
+
 Start checking for google analytics for all eu websites.
+- eu countries tlds: https://www.whois365.com/en/listtld/europe
+
+### meta pixel
+
+
+
+
+
 
 ## IDEAS
 
 Make it into browserextension that would notify you.

diff --git a/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc
index 912c08d20354f4e966a7084fbd8d317a7a940164..4b31e1616d2a9723fdd85b302320c5409ee7017b 100644
GIT binary patch
diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc
GIT binary patch

diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc
index 76470eb6ca6a86826e2a80180cd910fbaa73a7a8..ca557f2c40e419953b97f074de81d46c4c9168c1 100644
GIT binary patch
diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
index 14be6ba..1bbed47 100644
--- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
+++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py
@@ -5,7 +5,8 @@ import re
 import json
 from urllib.request import urlopen
 from urllib.parse import urlparse
-from utility.countries import *
+from utility.countries import isEuropean
+import validators
 
 class GFontsSpider(Spider):
     name = "gfonts"
@@ -21,88 +22,73 @@ class GFontsSpider(Spider):
         # check current url - german or eu (todo)
         # check if api.gfonts.com is found
         # @todo: check if google analytics is found
-        if self.isEuropean(response.url):
-            self.writeTrackers(response)
-            parsed = urlparse(response.url)
-            self.eu_domains.append(parsed.hostname)
-            self.logNewDomain(response.url)
+        parsed = urlparse(response.url)
+        if isEuropean(response.url):
+            print("URL EUROPEAN: " + response.url)
+            if parsed.hostname not in self.eu_domains:
+                self.eu_domains.append(parsed.hostname)
+                self.logNewDomain(response.url)
+            yield self.writeTrackers(response)
         else:
             print("NOT EUROPEAN: " + response.url)
-        self.parseOn(response)
-
-    def getCountryOfUrl(self, url):
-        ip = socket.gethostbyname(url)
-        api_url = 'https://ipinfo.io/' + ip + '/json'
-        response = urlopen(api_url)
-        data = json.load(response)
-        return data['country']
-
-    def isCountryGerman(self, url):
-        return 'DE' == self.getCountryOfUrl(url)
-
-    def isGermanTLD(self, url):
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        return tld == '.de'
-
-    def isGerman(self, url):
-        if not self.isGermanTLD(url):
-            return self.isCountryGerman(url)
-        return True
-    def isEuropean(self, url):
-        eu_tlds = self.getEuTlds()
-        parts = urlparse(url)
-        tld = parts.hostname[-3:]
-        if tld in eu_tlds:
-            return eu_tlds[tld]
-        country = self.getCountryOfUrl(url)
-        if country in eu_tlds.values():
-            return country
-        return False
+        self.checked_domains.append(parsed.hostname)
+        for link in self.parseOn(response):
+            yield scrapy.Request(link, callback=self.parse)
 
     def findGFonts(self, response):
-        for links in response.css('head link'):
-            return 'fonts.googleapis.com' in links.attrib['href']
+        for link in response.css('head link'):
+            try:
+                href = link.attrib['href']
+                if 'fonts.googleapis.com' in href:
+                    return True
+            except:
+                continue
+        return False
+
     def findGTrackers(self, response):
         trackers = { 'ga' : 'www.google-analytics.com',
                      'gt' : 'www.googletagmanager.com'}
-        result = {'ga':0, 'gt':0}
+        result = {'ga':False, 'gt':False}
         for script in response.css('script::text').getall():
-            if script.find(trackers['ga']) > 0:
-                result['ga'] = 1
-            if script.find(trackers['gt']) > 0:
-                result['gt'] = 1
+            if script.find(trackers['ga']) > 0: result['ga'] = True
+            if script.find(trackers['gt']) > 0: result['gt'] = True
         return result
+
     def findMetaPixel(self, response):
         for img in response.css('img'):
-            if img.attrib['src'].find('www.facebook.com/tr?id='):
-                return TRUE
-            return FALSE
+            try:
+                if img.attrib['src'].find('www.facebook.com/tr?id=') > 0: return True
+            except:
+                continue
+        return False
+
     def writeTrackers(self,response):
         gtrackers = self.findGTrackers(response)
-        yield {
+        return {
             'domain': urlparse(response.url).netloc,
-            'country': self.isEuropean(response.url),
+            'country': isEuropean(response.url),
             'gf': self.findGFonts(response),
             'ga': gtrackers['ga'],
-            'gt': gtrackers['gm'],
+            'gt': gtrackers['gt'],
             'mp': self.findMetaPixel(response)
         }
-
     def parseOn(self, response):
         links = response.css('a');
         print('FOUND: ' + str(len(links)) + ' LINKS')
+        next_urls = []
         for link in links:
-            url = link.attrib['href']
-            # parse valid urls
-            found = urlparse(url)
-            if validators.url(url) and bool(found.netloc):
-                current = urlparse(response.url)
-                if current.hostname != found.hostname:
-                    yield response.follow(url, callback=self.parse)
-            else:
-                print("NOT FOLLOWING: " + url)
+            try:
+                url = link.attrib['href']
+                found = urlparse(url)
+                if validators.url(url) and bool(found.netloc):
+                    current = urlparse(response.url)
+                    if current.hostname != found.hostname and found.hostname not in self.checked_domains:
+                        next_urls.append(url)
+            except:
+                continue
+        print('FOLLOW: ' + str(len(next_urls)) + ' LINKS')
+        return next_urls
 
     def getUrls(self):
         with open('sites.txt') as sites_file:
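
Not part of the patch, but for context: the old `writeTrackers` used `yield`, which makes the method a generator, so calling it from `parse` only built a generator object that was never iterated and no item reached Scrapy; similarly, `parseOn` yielded requests that `parse` never re-yielded. Returning a plain dict (and a plain list of URLs) and yielding them from `parse` appears to be what the subject "Fix return errors." refers to. A minimal sketch of the pattern, with hypothetical function names:

    # Illustration only: why a yield inside a helper silently drops items.
    def write_trackers_old(url):
        yield {'domain': url}          # generator: nothing runs until it is iterated

    def write_trackers_new(url):
        return {'domain': url}         # plain value: the caller decides to yield it

    def parse(url):
        write_trackers_old(url)        # old behaviour: the item is lost
        yield write_trackers_new(url)  # new behaviour: the item reaches the engine

    print(list(parse('example.eu')))   # -> [{'domain': 'example.eu'}]
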
diff --git a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
index dc0028f..062cccf 100644
--- a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
+++ b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py
@@ -3,9 +3,13 @@ import scrapy
 from scrapy.linkextractors import LinkExtractor
 #from utility.countries import getEuTlds
 from utility import countries
+from urllib.parse import urlencode, urlparse, parse_qs
 
-class firstSpider(scrapy.Spider):
+class startUrls(scrapy.Spider):
     name = "start_urls"
+    custom_settings = {
+        'ROBOTSTXT_OBEY': False
+    }
 
     def __init__(self):
         eu_tlds = countries.getEuTlds()

diff --git a/ger_gfonts/utility/.countries.py.swp b/ger_gfonts/utility/.countries.py.swp
deleted file mode 100644
index 6aec163319535a0eb877bec5ac207a0e71dd9bf1..0000000000000000000000000000000000000000
GIT binary patch

diff --git a/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc b/ger_gfonts/utility/__pycache__/countries.cpython-37.pyc
GIT binary patch

diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py
index 7f64de3..17634f0 100644
--- a/ger_gfonts/utility/countries.py
+++ b/ger_gfonts/utility/countries.py
@@ -15,13 +15,14 @@ def getEuTlds():
             '.bg':'BG', '.cy':'CY', '.mt':'MT'}
 
 def getCountryOfUrl(url):
-    ip = socket.gethostbyname(url)
+    parsed = urlparse(url)
+    ip = socket.gethostbyname(parsed.hostname)
     api_url = 'https://ipinfo.io/' + ip + '/json'
     response = urlopen(api_url)
     data = json.load(response)
     return data['country']
 
-def isCountryGerman(self, url):
+def isCountryGerman(url):
     return 'DE' == getCountryOfUrl(url)
 
 def isGermanTLD(url):
@@ -29,7 +30,7 @@ def isGermanTLD(url):
     tld = parts.hostname[-3:]
     return tld == '.de'
 
-def isGerman(self, url):
+def isGerman(url):
     if not isGermanTLD(url):
         return isCountryGerman(url)
     return True
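
Not part of the patch: with the stray `self` parameters removed and the hostname extracted via `urlparse` before the DNS lookup, the helpers in `utility/countries.py` work as plain module-level functions again. A small usage sketch (the domains are placeholders, and `getCountryOfUrl`/`isEuropean` perform a live DNS lookup plus an ipinfo.io request, so results depend on the network):

    from utility.countries import isGerman, isEuropean, getCountryOfUrl

    print(isGerman('https://example.de/'))          # True via the '.de' TLD shortcut, no lookup needed
    print(isEuropean('https://example.fr/'))        # a country code such as 'FR', or False
    print(getCountryOfUrl('https://example.com/'))  # two-letter code resolved via ipinfo.io
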
z0kMg=2_}lm+>^!)HGpyJZBn&-QW0s4`QswI3i^XCLr(wFv$R+aIQLm6h*_>*6(Z?R`e%ZfU^E!i;&kI_vXwb5~;Ch$-uHc M=EaobP=`6}56^~T<^TWy delta 467 zcmY*VO-}+b5bdFwG-25MD&D z?;~}?_D^CtNJ0uMM6rcZZK^Hc(mTUA!thfF9grk|A@pd2IP_Rd;h8hJdvAL{gnFs_0(;)Vc{S(xpv0w(tM9!a)_~s!r&_FDF z>d^S2*ufs;+qmKUZzDIGNBTG40#Q>%{uSTy&FguwgNc`#PV4I2tRxZ^v$(~70hm=; A0ssI2 diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py index 7f64de3..17634f0 100644 --- a/ger_gfonts/utility/countries.py +++ b/ger_gfonts/utility/countries.py @@ -15,13 +15,14 @@ def getEuTlds(): '.bg':'BG', '.cy':'CY', '.mt':'MT'} def getCountryOfUrl(url): - ip = socket.gethostbyname(url) + parsed = urlparse(url) + ip = socket.gethostbyname(parsed.hostname) api_url = 'https://ipinfo.io/' + ip + '/json' response = urlopen(api_url) data = json.load(response) return data['country'] -def isCountryGerman(self, url): +def isCountryGerman(url): return 'DE' == getCountryOfUrl(url) def isGermanTLD(url): @@ -29,7 +30,7 @@ def isGermanTLD(url): tld = parts.hostname[-3:] return tld == '.de' -def isGerman(self, url): +def isGerman(url): if not isGermanTLD(url): return isCountryGerman(url) return True diff --git a/ger_gfonts/utility/google_scrapy.py b/ger_gfonts/utility/google_scrapy.py new file mode 100644 index 0000000..fa5604b --- /dev/null +++ b/ger_gfonts/utility/google_scrapy.py @@ -0,0 +1,45 @@ +import requests +import urllib +import pandas as pd +from requests_html import HTML +from requests_html import HTMLSession + +def get_source(url): + """Return the source code for the provided URL. + + Args: + url (string): URL of the page to scrape. + + Returns: + response (object): HTTP response object from requests_html. + """ + + try: + session = HTMLSession() + response = session.get(url) + return response + + except requests.exceptions.RequestException as e: + print(e) + +def scrape_google(query): + + query = urllib.parse.quote_plus(query) + response = get_source("https://www.google.co.uk/search?q=" + query) + + links = list(response.html.absolute_links) + google_domains = ('https://www.google.', + 'https://google.', + 'https://webcache.googleusercontent.', + 'http://webcache.googleusercontent.', + 'https://policies.google.', + 'https://support.google.', + 'https://maps.google.') + + for url in links[:]: + if url.startswith(google_domains): + links.remove(url) + + return links + +scrape_google('inurl:.si')