javascript - Python, scrapy: scrape links then iterate over those links to scrape further links -
i trying create scrapy spider focuses on site called weedmaps.com. weedmaps uses googlemaps api generate information on dispensaries based on state regional location information. want spider start top layer, diver states, scrape regional links within states, go regional links 1 @ time, scrape dispensary links, , go dispensary links 1 @ time , scrape specific information regarding individual dispensaries. given site dynamic, have been using selenium account javascript. site, have been able scrape regional links, , dispensary links, separately. when try combine them, spider collects first regional link, , goes directly regional link collect dispensary information , ends. somehow create list of regional links, , dispensary links populated spider runs.
any insights on how accomplish fantastic. below code have far, , can't seem figure thing out. thank in advance!
import scrapy import time selenium import webdriver selenium.webdriver.support.wait import webdriverwait selenium.webdriver.common.by import selenium.webdriver.support import expected_conditions ec scrapybot import __init__ class scrapybot_spider(scrapy.spider): name = "scrapybot_spider" allowed_domains = ['https://weedmaps.com'] regionlinks = [] dispensarylinks = [] start_urls = ["https://weedmaps.com/dispensaries/in/united-states/colorado"] #initialize selenium webdriver via firefox def __init__(self): self.browser = webdriver.firefox() #scraping regional links in states def parse(self, response): self.browser.get('https://weedmaps.com/dispensaries/in/united-states/colorado') wait = webdriverwait(self.browser, 10) wait.until(ec.visibility_of_element_located((by.css_selector, "div.subregion a.region-ajax-link"))) region in self.browser.find_elements_by_css_selector("div.subregion a.region-ajax-link"): time.sleep(5) region = region.get_attribute("href") self.regionlinks.append(region) print regionlinks #scraping dispensary links within regional links above def dispensaryparse(self, response): global region self.browser.get(region) wait = webdriverwait(self.browser, 10) wait.until(ec.visibility_of_element_located((by.css_selector, "div.dispensary div.name a"))) dispensary in self.browser.find_elements_by_css_selector("div.dispensary div.name a"): dispensary = dispensary.get_attribute("href") self.dispensarylinks.append(dispensary) print dispensary return dispensary
Comments
Post a Comment