From a8944a44420e2f43a71fc963ebb8a06409846c55 Mon Sep 17 00:00:00 2001 From: Lio Novelli Date: Sun, 6 Feb 2022 14:39:12 +0100 Subject: [PATCH] Improve gfonts spider start working on start_urls from search result spider --- .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 157 bytes .../__pycache__/settings.cpython-37.pyc | Bin 0 -> 271 bytes .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 165 bytes .../__pycache__/gfonts_spider.cpython-37.pyc | Bin 0 -> 4834 bytes .../start_urls_spider.cpython-37.pyc | Bin 0 -> 1023 bytes .../ger_gfonts/spiders/gfonts_spider.py | 81 ++++++++++++++---- ger_gfonts/ger_gfonts/spiders/sites.txt | 2 +- .../ger_gfonts/spiders/start_urls_spider.py | 17 ++++ ger_gfonts/utility/.countries.py.swp | Bin 0 -> 12288 bytes ger_gfonts/utility/__init.py | 0 .../__pycache__/countries.cpython-37.pyc | Bin 0 -> 1673 bytes ger_gfonts/utility/countries.py | 47 ++++++++++ 12 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc create mode 100644 ger_gfonts/ger_gfonts/__pycache__/settings.cpython-37.pyc create mode 100644 ger_gfonts/ger_gfonts/spiders/__pycache__/__init__.cpython-37.pyc create mode 100644 ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc create mode 100644 ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc create mode 100644 ger_gfonts/ger_gfonts/spiders/start_urls_spider.py create mode 100644 ger_gfonts/utility/.countries.py.swp create mode 100644 ger_gfonts/utility/__init.py create mode 100644 ger_gfonts/utility/__pycache__/countries.cpython-37.pyc create mode 100644 ger_gfonts/utility/countries.py diff --git a/ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc b/ger_gfonts/ger_gfonts/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08d15f7cf06959ba97d7c3b0e192a6aca82418f0 GIT binary patch literal 157 zcmZ?b<>g`kg2e1!i6Hthh=2h`Aj1KOi&=m~3PUi1CZpdg`kg6->nB}(xyFgylvV1NBV{lmA5!-a`RJ4b5iXg`kg2e1!i3@@BV-N=h7=a82ATDMB5-AM944RC7D;bJF!U*D*pMFMuZmNDx zX1;zvQGQlxa!Ij%c7AR_eu;j1YLRYwT7Di-1jvlXWEK}>rlb}X>&M4u=4F<|$LkeT U-r}&y%}*)KNwovn^BIU40M45%DgXcg literal 0 HcmV?d00001 diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/gfonts_spider.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dc051dbfb40f7474fda1e4fe0f06feff400d6f1 GIT binary patch literal 4834 zcmb7IO>Y~=8Qz`!;F6*wTahg}NmC_uU9(7}exN{VIBsl-Qq(AvD#~%XXu+&FD{84F zm!26#GC`k|9*g$aLk`k0|D!*k=ehQjb8iI-_jzYYT9ktTr7-Wz&d2V&&phwP=<)P) z$-=MukC)N)>z4H|dKmp&G(JX=UqWz;+o=^&%~HF=Le_CYr{jjMosW5;XXf~!Z|Xu= zK<%VKrx+H^95*d>rot)P+OW9C{jV+V$L!1r%iOM8mBM%UidCyP+OPK#9t%CSDx#Bg z_~7myr*XyBg=3Lsy|{~JfMzceGH%k!(T}0=F^c>Qmd#HUb~c$rV5p5iln7Im3lKQ)IZ=jy#H~9kU zYkZO4LVcaT$=^af$KU2ZLj4ARhyNJ$JYVAPqQ1f3gSNMId1F26Dw(Sn`mFU#V71+IZ@}6<^ zKtZm9!%V7f)QNRD&04kiWX(*_UQZ-lr3;3g+ue#Irsx%ch-EM9%2@kmgqvf?#WYRu zO!pgTnsc?>vR%U8j6K6(K%s00ko~tOP`I*S%qYTz{FlLlYy>D6PF>hX4BE1qW^2`oE#AXeg}vJ9 zYga7P=l=tSM0V*ZHLlUunmal94U#`tL7!Gi*|4T)LX<}lemvs*Ztfy0^Enp6Ak zzb{|D>^sXm#?I(u#XLmCeYH!&6u1jVrLU1_Dv|kC_qNu4hINwOU;#W(XmJ9(OB2Vt zCE&)prIQ-LCg;&Il&S+v1K}RFcMns<9f%(SF&KSm1rvw+A!1+g3!_=%%}lU*Yq*&U z>jhKT2p|M#ZSZ>tAs9ViN4{7duz^dQ#)jcBa+VL+xqD$@^pk-H1&9MqP5cD!TnDF% z;8FE>FXg$h8@vrQwEtC<9>?-Bo5KQJ0f3j{E|v>ogm{a@B%8%7`hJBX=OE|+=m zmtDKRaOE&YK4`2yox_;Kgz@P+1N$84U)XEbpRU2Q8C;-E27Kh#t)~dYxgi%1<_(EZk`@3mRB^=rGuo)^Ibp0 zb|h`ivaelL<<+dQD&!rg7<=Ss;M5R@zhO|q#UHht)885OybJr>zObO;hc9rZ@cxYZ zx2y}ByGO2g%nL9T|A93e3C1IL%t-LSdUofjHE;k$vDRNaIXM|>|3TD^(&s8^8VoLv zsHg?gju7bx9pJ)R5sDHNYWro`6iH9rzpvs`C4Paf@2Fq=5(1!ofJ*xyP?VRqd@m+NxJC(@Fe>rc9cLHu`=B*mjcBIQ zTAhlo{XFU$6Gzv988e5kS*KSih!64I+)bNU=XYb3X3enlDmDp<$l@c2%cz7!Fl?A& zOkyLDid#D;ZEe;*tqWo*rS?r2(>}22 z&r-IwzP+`z{dBXofpl#;C*4g<1U;3|P8xS}4pO+#B{=K^$wh|5&76k|+NR=Tj2XAC z{evt`vlH2DRCkrj2tfEK?)Cx}V%>L`*3j{ZEfH*5~lsjL8t7iyUzqfX+ zFGdXQ7bQ|uxvWmr_>H5BL7m=6WuQRy}>BCGkO4H`!)9-JV>L?K93%Du=d|5 zR{!pV3Scy{GP(Q!{Ekxj5RH5RZf!2z%5+g6@&LjZxk)GL4e1`1%w`)0Ng9WqXGVm^6kX{ld=7%oNT zNDTaeq(|Jqc=k%8fa2t>2{>4r+TQ@WYbt(0!0 zbQ`6cDBVKo21>V2x_Q#A^P445;JsSN9<6EiXfI@st07zc1SJ%=@r97BeHyZARkLax z^^=gTZ=&pmYy;&JD(Kt9qD?H`+SF`|>L;i_*Q~aMvK_LWEfn$XN-pF+)p1x?HkAqvFK~k?)Frga!3?Bl)>NiVS`*7sZWLNgHyx(ewTTnN)-e_6!OkY%g<7Fiy48jqPrR@&G1ZCD2na#SCZ^E2U+ M954mTfR2a#55Xl_`~Uy| literal 0 HcmV?d00001 diff --git a/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc b/ger_gfonts/ger_gfonts/spiders/__pycache__/start_urls_spider.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76470eb6ca6a86826e2a80180cd910fbaa73a7a8 GIT binary patch literal 1023 zcmZuvOK;RL5VoDirezluM1{nu7b^C$IUsSUN_kdX& zpmXvVo!EHxY}dKUZp887;o)ei^;F4Gri<9h)MUG_4qoOZWb*X+n+pPbsWTw}wCeXH z-qnSSRj%VtM(@eY*?3B*WqaJV63ur5t>g^mMdvEYP8boS`31TrisE_B1dZEx1WaAZMF*wK?YJEL=YPT!NS z{S(izxwoK{96bC2LkGpD}FzWtkR|^JbazLW^3V-RJzEPSxd&kGoj`B(sX@ zy6c^V)iOxD2hv82sLvw5J%fP`ZDvyF=4!W(Fy%{<+SrzL(yLvra(CQyE$pVh1-L63 Q3`Upg7LIcllJ(*H3*c=2asU7T literal 0 HcmV?d00001 diff --git a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py index a30aa9c..14be6ba 100644 --- a/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py +++ b/ger_gfonts/ger_gfonts/spiders/gfonts_spider.py @@ -1,14 +1,17 @@ import scrapy +from scrapy.spiders import Spider,CrawlSpider, Rule import socket import re import json from urllib.request import urlopen from urllib.parse import urlparse +from utility.countries import * -class GFontsSpider(scrapy.Spider): +class GFontsSpider(Spider): name = "gfonts" #start_urls = self.getUrls() checked_domains = [] + eu_domains = [] custom_settings = {'CLOSESPIDER_PAGECOUNT': 2000} def __init__(self): @@ -18,12 +21,14 @@ class GFontsSpider(scrapy.Spider): # check current url - german or eu (todo) # check if api.gfonts.com is found # @todo: check if google analytics is found - parsed = urlparse(response.url) - self.checked_domains.append(parsed.hostname) - self.logNewDomain(response.url) - if self.isGerman(response.url): - self.findGFonts(response) - self.parseOn(response) + if self.isEuropean(response.url): + self.writeTrackers(response) + parsed = urlparse(response.url) + self.eu_domains.append(parsed.hostname) + self.logNewDomain(response.url) + else: + print("NOT EUROPEAN: " + response.url) + self.parseOn(response) def getCountryOfUrl(self, url): ip = socket.gethostbyname(url) @@ -44,24 +49,60 @@ class GFontsSpider(scrapy.Spider): if not self.isGermanTLD(url): return self.isCountryGerman(url) return True + def isEuropean(self, url): + eu_tlds = self.getEuTlds() + parts = urlparse(url) + tld = parts.hostname[-3:] + if tld in eu_tlds: + return eu_tlds[tld] + country = self.getCountryOfUrl(url) + if country in eu_tlds.values(): + return country + return False def findGFonts(self, response): for links in response.css('head link'): - if 'fonts.googleapis.com' in links.attrib['href']: - yield { - 'url': response.url, - 'gfonts': True, + return 'fonts.googleapis.com' in links.attrib['href'] + def findGTrackers(self, response): + trackers = { 'ga' : 'www.google-analytics.com', + 'gt' : 'www.googletagmanager.com'} + result = {'ga':0, 'gt':0} + for script in response.css('script::text').getall(): + if script.find(trackers['ga']) > 0: + result['ga'] = 1 + if script.find(trackers['gt']) > 0: + result['gt'] = 1 + return result + def findMetaPixel(self, response): + for img in response.css('img'): + if img.attrib['src'].find('www.facebook.com/tr?id='): + return TRUE + return FALSE + def writeTrackers(self,response): + gtrackers = self.findGTrackers(response) + yield { + 'domain': urlparse(response.url).netloc, + 'country': self.isEuropean(response.url), + 'gf': self.findGFonts(response), + 'ga': gtrackers['ga'], + 'gt': gtrackers['gm'], + 'mp': self.findMetaPixel(response) } + def parseOn(self, response): - for links in response.css('a'): - url = links.attrib['href'] + links = response.css('a'); + print('FOUND: ' + str(len(links)) + ' LINKS') + for link in links: + url = link.attrib['href'] # parse valid urls - if validators.url(url) and bool(urlparse(url).netloc): + found = urlparse(url) + if validators.url(url) and bool(found.netloc): current = urlparse(response.url) - found = urlparse(url) - if current.hostname != found.hostname and found.hostname not in self.checked_domains: + if current.hostname != found.hostname: yield response.follow(url, callback=self.parse) + else: + print("NOT FOLLOWING: " + url) def getUrls(self): with open('sites.txt') as sites_file: @@ -72,3 +113,11 @@ class GFontsSpider(scrapy.Spider): print('############################################') print('###### ' + url + ' #######') print('############################################') + + def getEuTlds(self): + return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ', + '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI', + '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT', + '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO', + '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK', + '.bg':'BG', '.cy':'CY', '.mt':'MT'} diff --git a/ger_gfonts/ger_gfonts/spiders/sites.txt b/ger_gfonts/ger_gfonts/spiders/sites.txt index e98e4f3..bcaa116 100644 --- a/ger_gfonts/ger_gfonts/spiders/sites.txt +++ b/ger_gfonts/ger_gfonts/spiders/sites.txt @@ -75,7 +75,7 @@ kik.de wetterzentrale.de service.bund.de katholisch.de -homboldt-foundation.de +humboldt-foundation.de deginvest.de comdirect.de standaard.be diff --git a/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py new file mode 100644 index 0000000..dc0028f --- /dev/null +++ b/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py @@ -0,0 +1,17 @@ +import scrapy +#import pandas +from scrapy.linkextractors import LinkExtractor +#from utility.countries import getEuTlds +from utility import countries + +class firstSpider(scrapy.Spider): + name = "start_urls" + + def __init__(self): + eu_tlds = countries.getEuTlds() + self.start_urls = map(lambda t: 'https://www.google.com/search?q=inurl%3A' + t, eu_tlds.keys()) + + def parse(self, response): + xlink = LinkExtractor() + for link in xlink.extract_links(response): + print(link) diff --git a/ger_gfonts/utility/.countries.py.swp b/ger_gfonts/utility/.countries.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..6aec163319535a0eb877bec5ac207a0e71dd9bf1 GIT binary patch literal 12288 zcmeHNzmFS57@b3+2#_Cw3egOl(E7x&&W{FJxJd4NKa@*yN_>&H6Ig3|)}GvYciEXe zVGz)QkWxbm8la+u5)ibA9#K&8A6!8N@0%H4971G?F6dk7-M;zeoA=GUnO(`c?VrEB zeNDa=Ul6#?332?(U&$wLom+b61tBtBP;}vC%4DD>$-peg`~ZT@-Kw~w2G%qWi%D77 z<|r+4v_`7x1zKC3X*+EWiaNKIRwgc|3zWH=9bg9*J8)WDx_IF{JHQUG z1MC1hzz(nj>;OCP7&?#)h4}H55T8AT$>ab3v)})}pB3U~;5*=>k z;wxYecoO*bv=AQy9|1XV6?h)_^BEz21AYL$2R;XmfEwrl9UubE0%w5Zr-k?x_y+hA z_yV{KSYRJ`9e5cy1^j_|{{(yu+yiJGp8|Zb1MC1hzz(nj>;OB!4m@rSNL-b&b(PDF zBr{4-fUXeqpNCEbg>))IQ|er*x@WVYiI0-3R;IDiayhevwj3$D=DsbR-bnYVtWj6l zic2A>pdK_1AoQl|LkaLXd9O;bwk(s%ns{6on@amM!pMIL8GGHg`Jltpfo-|tz=?+`rC8%e2qC+8tF&>fAfElsQ0)TPeTBF2Zzk$gq! z5`Us5>3fe5XiHNJ4wa3uv2-H(Q#uoV=V?9vcKU+z5?2)WM^P)fvKg&{52oO2Z#X`& z;Mcm*`_4X}TV{yu?zsMv`ggmIR|WXZ8;%YQFi{X4!tBfTB3 zm(|4g+_>zB_-)5?s(;uVen;isBs~SBCg4A1CdO*G_3|8(5pu3lG$L7@Q2FmW(d1yAcVh zqT!d_C)q1rb}$BC+j2>RcZu(VJFR6oNlN#6u!T&ra-8%ox@URr*~S*9RWYFlEz|wj zy$++@yBoHai=20?>xC2 z3dRG7`2j$JWF}gyRVKTtrR2~Tv;z2oEbLZV75GA`{3Kdcs!|QQMiGrcM>I|o&|@@7 zQ_$lyP1m3&NN$Vz%yVQA&AN28zRvO@>A{R(7O^%-n}e6*E{tV}Sp$$HMFFSebexLQ za0))2X6QQ2(o1xMUZz)Qj&9OCy-K&}HF}+H(;Zr%yT2?T{TofEzFvXsIJLCu)ba+T z<^@B>yQl=_#Ptu5PLTf+hh&j=66mtHzALl+JfGK|1s=s ztlh@CZJfKbjy&Hx_3ci5;W$`~iNON)5_Xfc=y~13C-`9ypNl zx|EN~cuD3h4rEUOy^J9T6l@h7)jT!4&F~Jx+4ees_5jeezY6^(7{7u)A2bf~Zqmro zys@S8!=!CZ<0$VIxovdz^B!iMMDJdheVb;f9W>f`zh`xtn8jk?#yW{z?eAn{BL1S2 z0HMa^5o=B3VoI_vEi4$d2sZQldV1u);3=j>= zkb-%E&m1<7XAi6`%$JQuTBN=Gd=a%DR~m;V@44Erv_ZY%swQtACDu)$zJuJ@y+JST zCYtYq3;p6S%VWaI06pqdTDV$Vq`N$cbYhCUXA&1uY~w4w^jp0B8jf7{91I@qKhfEB zIE}x8l3zRm>JQ*+b21*RN3YEck+hvq=3B>=DtNZItmgCdem1DM&s9)g*pa^B}rZqd#@lKm`}P3^6wVLn6$K zMQNw_kNAIyYVM#4-soQo_J?f$A0^Rc<^`-6rH!Qi0`n0cFEri;B37J=6N#kE3V1kF zBUXYkDCJRA-z#P5o6ZJ)6`HV=XW=t}*h(3qRQ`HFQ-6%}*6J1C>Yh6N2|T_9T=i+3 z^%H|A7w>9Ge;4!_{VA{bjNu)It83{yY&AKv<; zeizD3qj9rzZ&4>dBA@k*uHWOxEIrX#T VD{c*657z)u(O6WAf~XQz{sxT>e^3Ab literal 0 HcmV?d00001 diff --git a/ger_gfonts/utility/countries.py b/ger_gfonts/utility/countries.py new file mode 100644 index 0000000..7f64de3 --- /dev/null +++ b/ger_gfonts/utility/countries.py @@ -0,0 +1,47 @@ +# utility functions for countries +# import pycountry +import socket +from urllib.request import urlopen +import json +from urllib.parse import urlparse + +def getEuTlds(): + # map tld to alpha_2 + return { '.ad': 'AD', '.at': 'AT', '.be':'BE', '.ch':'CH', '.cz':'CZ', + '.de':'DE', '.dk':'DK', '.ee':'EE', '.es':'ES', '.eu':'EU', '.fi':'FI', + '.fr':'FR', '.gr':'GR', '.hr':'HR', '.hu':'HU', '.ie':'IE', '.it':'IT', + '.li':'LI', '.lt':'LT', '.lu':'LU', '.lv':'LV', '.nl':'NL', '.no':'NO', + '.pl':'PL', '.pt':'PT', '.ro':'RO', '.se':'SE', '.si':'SI', '.sk':'SK', + '.bg':'BG', '.cy':'CY', '.mt':'MT'} + +def getCountryOfUrl(url): + ip = socket.gethostbyname(url) + api_url = 'https://ipinfo.io/' + ip + '/json' + response = urlopen(api_url) + data = json.load(response) + return data['country'] + +def isCountryGerman(self, url): + return 'DE' == getCountryOfUrl(url) + +def isGermanTLD(url): + parts = urlparse(url) + tld = parts.hostname[-3:] + return tld == '.de' + +def isGerman(self, url): + if not isGermanTLD(url): + return isCountryGerman(url) + return True + +def isEuropean(url): + eu_tlds = getEuTlds() + parts = urlparse(url) + tld = parts.hostname[-3:] + if tld in eu_tlds: + return eu_tlds[tld] + country = getCountryOfUrl(url) + if country in eu_tlds.values(): + return country + return False +