import scrapy species = ['H', 'C', 'N', 'O'] formula = ''.join([x + '*' for x in species]) urltemplate = "https://webbook.nist.gov/cgi/cbook.cgi?Value={}-{}&VType=MW&Formula=" + formula + "&NoIon=on&MatchIso=on" # noqa: E501 min_weight = 0 max_weight = 9999999 class NISTSpider(scrapy.Spider): name = "NIST" def start_requests(self): start_url = urltemplate.format(min_weight, max_weight) yield scrapy.Request( url=start_url, callback=lambda x: self.parse_range(x, min_weight, max_weight)) def parse_range(self, response, from_, to): search_result_list = response.xpath('//*[@id="main"]/ol/li/a') # if there are 400 result, then the range is too large, # half the range and repeat the search if len(search_result_list) == 400 and to - from_ > 1: mid = (from_ + to) // 2 next_url1 = urltemplate.format(from_, mid) next_url2 = urltemplate.format(mid, to) yield scrapy.Request( url=next_url1, callback=lambda x: self.parse_range(x, from_, mid)) yield scrapy.Request( url=next_url2, callback=lambda x: self.parse_range(x, mid, to)) elif len(search_result_list) == 400 and to - from_ == 1: next_url1 = urltemplate.format(from_, from_) next_url2 = urltemplate.format(to, to) yield scrapy.Request( url=next_url1, callback=lambda x: self.parse_range(x, from_, from_)) yield scrapy.Request( url=next_url2, callback=lambda x: self.parse_range(x, to, to)) else: for i in search_result_list: href = i.css('a::attr("href")').extract_first() prefix = '/cgi/cbook.cgi?ID=' suffix = '&Units=SI' nist_id = href[len(prefix): len(href) - len(suffix)] url3d = 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=' + nist_id # noqa: E501 yield scrapy.Request( url=url3d, callback=lambda x, n=nist_id: self.parse_sdf(x, n)) def parse_sdf(self, response, nist_id): if response.text: text = response.text lines = [x.strip() for x in text.split("\r\n")] lines = lines[3:] count = int(lines[0].split()[0]) lines = lines[1: count + 1] lines = [x.split()[:4] for x in lines] # double check if it contain unexpected elements good = True atoms = [] for x, y, z, atype in lines: if atype not in species: good = False break atoms.append((atype, float(x), float(y), float(z))) if good: yield {'id': nist_id, 'size': count, 'atoms': atoms}