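# Before running: log in to the Alibaba international-site back end, open the ranking-query page (https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm), press F12, and copy your own cookies and CSRF token into the request headers and form data below.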
"""Bulk keyword-ranking lookup against the Alibaba rankSearch page.

Keywords are read from ``key.txt``, each one is POSTed to the rank-search
endpoint from a pool of worker processes, and one CSV row per keyword is
appended to the result file.
"""
import csv
import time
from multiprocessing import Pool

import requests
from lxml import etree


def get_words():
    """Load the keywords to query from ``key.txt``.

    The file is read as UTF-8 with one keyword per line. Surrounding
    whitespace is stripped and blank lines are skipped so that no empty
    query is sent.

    Returns:
        A ``(words, num)`` tuple: the list of keywords and its length.
    """
    with open('key.txt', 'r', encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    # To resume an interrupted run, slice the list here, e.g. words[13:].
    num = len(words)
    print('一共有%s个单词,需要查询' % num)
    return words, num


def get_result(word, url, timeout=30):
    """POST one keyword to the rank-search page and return the response text.

    Args:
        word: Keyword sent as the ``queryString`` form field.
        url: Endpoint to POST to.
        timeout: Seconds passed to ``requests.post`` so that a stalled
            connection cannot block a worker forever.

    Returns:
        The response body decoded by requests (``res.text``).

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # NOTE(review): 'br' is advertised here; confirm the runtime can
        # decode brotli responses, otherwise res.text may be unreadable.
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
        'origin': 'https://hz-productposting.alibaba.com',
        'referer': 'https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'content-type': 'application/x-www-form-urlencoded',
        'cache-control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        # Placeholder: replace with your own cookies; refresh them when the
        # script suddenly stops working.
        'cookie': '自己的cookies',
    }
    data = {
        # Placeholder: replace with your own token; refresh it when the
        # script suddenly stops working.
        '_csrf_token_': '自己的token',
        'queryString': word,
    }
    res = requests.post(url=url, headers=headers, data=data, timeout=timeout)
    return res.text


def parse_source(source, file_path, sk):
    """Extract the ranking row for one keyword and append it to the CSV.

    Args:
        source: HTML text of the rank-search response.
        file_path: Result file name without the ``.csv`` suffix.
        sk: The keyword that was searched; written as the first column.

    Returns:
        None. Exactly one kind of row is written per processed table row:
        a parse-error row, a "no matching product" row, or a result row of
        ``[sk, product, ranking, result]`` with the charge value appended
        when the page provides one.
    """
    try:
        html = etree.HTML(source)
        tbody = html.xpath("//tbody")[0]
    except Exception:
        # No result table (or unparsable HTML): record the failure for this
        # keyword instead of aborting the worker.
        line = [sk, '页面解析错误!检查cookies或稍后重试。']
        print(line)
        save_file(line, file_path)
        return None

    # The slice controls how many products are read per keyword.
    trs = tbody.xpath('./tr')[:1]

    # The second text node of the summary div is used, minus its first
    # character and last four characters.
    # NOTE(review): presumably this strips fixed wrapper text around a
    # count; confirm against a live page.
    summary = html.xpath("//div[@class='search-result']/text()")
    result = summary[1].strip()[1:-4] if len(summary) > 1 else ''

    for tr in trs:
        products = tr.xpath("td[@class='products']/a/text()")
        if not products:
            print(sk, "无匹配产品")
            save_file([sk, '无匹配产品'], file_path)
            continue

        ranking = tr.xpath("td[@class='ranking']/a/text()")
        charge = tr.xpath("td[@class='charge']/span/text()")
        # Fall back to an empty cell rather than crashing when the ranking
        # link is absent from the row.
        rank = ranking[0] if ranking else ''
        if charge:
            line = [sk, products[0], rank, result, charge[0]]
            print(sk, rank, charge[0])
        else:
            line = [sk, products[0], rank, result]
            print(sk, rank)
        save_file(line, file_path)
    return None


def save_file(line, file_path):
    """Append one row to ``<file_path>.csv`` using the Excel CSV dialect.

    Args:
        line: Sequence of cell values for the row.
        file_path: Result file name without the ``.csv`` suffix.
    """
    print(line)
    # The context manager flushes and closes the handle after every row.
    with open('%s.csv' % file_path, 'a+', newline="") as csvfile:
        csv.writer(csvfile, dialect='excel').writerow(line)


def run(word, num, a, total_start):
    """Query a single keyword, store its result, and print progress.

    Args:
        word: Keyword to look up.
        num: Total number of keywords, used for the progress display.
        a: 1-based position of this keyword in the list.
        total_start: ``time.time()`` value taken when the whole job began.
    """
    file_path = '结果保存的文件名'
    url = 'https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm'
    print('开始查询%s的排名!---------%s/%s' % (word, a, num))
    source = get_result(word, url)
    parse_source(source, file_path, word)
    print('保存成功!')
    time_now = time.time() - total_start
    print('总花费时间%.2f秒,平均花费%.2f秒' % (time_now, (time_now / a)))
    print('*' * 40)


def main():
    """Fan the keyword list out to a process pool and report total timing."""
    total_start = time.time()
    words, num = get_words()
    # 16 worker processes. Setting this too high triggers a CAPTCHA; if that
    # happens, wait for a while and retry.
    po = Pool(16)
    pending = [
        (word, po.apply_async(run, args=(word, num, a, total_start)))
        for a, word in enumerate(words, start=1)
    ]
    po.close()
    po.join()

    # apply_async never raises for errors inside the worker; they only
    # surface when the AsyncResult is fetched, so fetch each one here.
    for word, task in pending:
        try:
            task.get()
        except Exception as exc:
            print('进程报错!', word, repr(exc))

    total_total = time.time() - total_start
    average = total_total / num if num else 0.0
    print('总用时%.2f秒,平均用时%.2f秒' % (total_total, average))


if __name__ == '__main__':
    main()