22 lines
630 B
Python
22 lines
630 B
Python
import scrapy
|
|
#import pandas
|
|
from scrapy.linkextractors import LinkExtractor
|
|
#from utility.countries import getEuTlds
|
|
from utility import countries
|
|
from urllib.parse import urlencode, urlparse, parse_qs
|
|
|
|
class startUrls(scrapy.Spider):
    """Spider that issues one Google ``inurl:`` search per EU TLD.

    The seed URLs are built at construction time from the keys of
    ``countries.getEuTlds()``; every link extracted from each response is
    printed to stdout.
    """

    name = "start_urls"
    custom_settings = {
        # Explicitly do not consult robots.txt for the crawled pages.
        'ROBOTSTXT_OBEY': False
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and build its list of seed URLs.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``
                (e.g. spider arguments supplied with ``-a key=value``).
        """
        # Run the base-class initialisation so the spider is set up the way
        # Scrapy expects and spider arguments are accepted.
        super().__init__(*args, **kwargs)

        # NOTE(review): presumably a mapping keyed by TLD strings (only
        # ``.keys()`` is used here) -- confirm against utility.countries.
        eu_tlds = countries.getEuTlds()

        # A concrete list rather than a lazy ``map`` object: a map iterator
        # is exhausted after a single pass, a list can be re-read.
        self.start_urls = [
            'https://www.google.com/search?q=inurl%3A' + tld
            for tld in eu_tlds.keys()
        ]

    def parse(self, response):
        """Print every link found in ``response``.

        Args:
            response: The downloaded page handed over by Scrapy.
        """
        xlink = LinkExtractor()
        for link in xlink.extract_links(response):
            print(link)
|