python - Dynamically add to allowed_domains in a Scrapy spider
I have a spider that starts with a small list of allowed_domains at the beginning of the crawl. I need to add more domains to this whitelist dynamically from within the parser as the spidering continues, but the following piece of code does not accomplish that, since subsequent requests are still being filtered. Is there a way of updating allowed_domains within the parser?
import urlparse

from BeautifulSoup import BeautifulSoup
from scrapy.spider import BaseSpider
from scrapy.http import Request


class APSpider(BaseSpider):
    name = "APSpider"

    allowed_domains = ["www.somedomain.com"]

    start_urls = [
        "http://www.somedomain.com/list-of-websites",
    ]

    ...

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        for link_tag in soup.findAll('td', {'class': 'half-width'}):
            _website = link_tag.find('a')['href']
            u = urlparse.urlparse(_website)
            # This append happens too late to affect the offsite filter:
            self.allowed_domains.append(u.netloc)

            yield Request(url=_website, callback=self.parse_secondary_site)

    ...
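For context on why the append has no effect: the filtering is done by Scrapy's OffsiteMiddleware, which compiles a host-matching regex from allowed_domains once, when the spider is opened, so changes to the list during the crawl are never seen. Below is a rough, simplified paraphrase of that check (the real class lives in scrapy.contrib.spidermiddleware.offsite in Scrapy of this era; treat the details here as approximate):

import re
import urlparse


class OffsiteMiddlewareSketch(object):
    """Simplified paraphrase of Scrapy's OffsiteMiddleware, for illustration."""

    def spider_opened(self, spider):
        # Compiled exactly once, from whatever allowed_domains contains
        # at open time; later appends to the list are invisible here.
        self.host_regex = self.get_host_regex(spider)

    def get_host_regex(self, spider):
        allowed_domains = getattr(spider, 'allowed_domains', None)
        if not allowed_domains:
            # No whitelist at all: the empty regex matches every host.
            return re.compile('')
        pattern = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains)
        return re.compile(pattern)

    def should_follow(self, request, spider):
        # Requests whose host fails this test are dropped as "offsite".
        host = urlparse.urlparse(request.url).hostname or ''
        return bool(self.host_regex.search(host))

The allow-everything behaviour when allowed_domains is missing or empty is what the answer below relies on.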
You could try something like the following:
class APSpider(BaseSpider):
    name = "APSpider"

    start_urls = [
        "http://www.somedomain.com/list-of-websites",
    ]

    def __init__(self, *args, **kwargs):
        super(APSpider, self).__init__(*args, **kwargs)
        # Keep our own whitelist instead of a class-level allowed_domains,
        # so the offsite middleware has nothing to filter on at open time.
        self.allowed_domains = []

    def parse(self, response):
        soup = BeautifulSoup(response.body)

        if not self.allowed_domains:
            for link_tag in soup.findAll('td', {'class': 'half-width'}):
                _website = link_tag.find('a')['href']
                u = urlparse.urlparse(_website)
                self.allowed_domains.append(u.netloc)

                yield Request(url=_website, callback=self.parse_secondary_site)

        # Whitelist check done by hand, comparing netloc against netloc:
        if urlparse.urlparse(response.url).netloc in self.allowed_domains:
            yield Request(...)

    ...
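Why this works follows from the sketch above: since no allowed_domains exists when the spider is opened, the middleware compiles an allow-everything regex, and the manual netloc check in parse takes over the whitelisting. If you would rather keep the class-level allowed_domains from the question, an alternative is to mark just the cross-domain requests so the middleware lets them through; dont_filter is a standard Request argument, though note it also exempts the request from the scheduler's duplicate filter. A minimal sketch of just the yield inside parse:

# Inside parse(), instead of appending to self.allowed_domains:
yield Request(url=_website,
              callback=self.parse_secondary_site,
              # dont_filter exempts this request from offsite filtering
              # (and from duplicate filtering, so use it deliberately).
              dont_filter=True)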