gfonts/ger_gfonts/ger_gfonts/spiders/start_urls_spider.py

22 lines
630 B
Python

import scrapy
#import pandas
from scrapy.linkextractors import LinkExtractor
#from utility.countries import getEuTlds
from utility import countries
from urllib.parse import urlencode, urlparse, parse_qs
class startUrls(scrapy.Spider):
    """Spider that issues one Google search per entry returned by getEuTlds().

    For every key of the mapping returned by ``countries.getEuTlds()`` a
    start URL of the form ``https://www.google.com/search?q=inurl%3A<key>``
    is built (``%3A`` is the URL-encoded ``:`` of the ``inurl:`` operator).
    Each response is scanned for links, which are printed to stdout.
    """

    name = "start_urls"
    custom_settings = {
        # Scrapy setting: fetch the start URLs without consulting robots.txt.
        'ROBOTSTXT_OBEY': False
    }

    def __init__(self, *args, **kwargs):
        """Build the list of start URLs.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``
                (e.g. spider arguments passed on the command line).
        """
        # Scrapy instantiates spiders with whatever arguments the user
        # supplied; accept and forward them so base-class setup still runs.
        super().__init__(*args, **kwargs)
        # presumably a dict keyed by TLD string -- verify against
        # utility.countries; only the keys are used here.
        eu_tlds = countries.getEuTlds()
        # A real list (not a one-shot ``map`` iterator) so the URLs survive
        # being iterated or inspected more than once.
        self.start_urls = [
            'https://www.google.com/search?q=inurl%3A' + t
            for t in eu_tlds.keys()
        ]

    def parse(self, response):
        """Print every link the default LinkExtractor finds in *response*.

        Args:
            response: The downloaded page handed over by Scrapy.

        Returns:
            None. Links are only printed; no items or follow-up requests
            are produced.
        """
        xlink = LinkExtractor()
        for link in xlink.extract_links(response):
            print(link)