(1) requests:数据爬取,import requests
(2) lxml中的xpath:数据解析,from lxml import etree
(3) json:数据存储,import json
下面直接放代码:
# Scrape text posts from Qiushibaike ("糗事百科") using requests + lxml/xpath + json.
import json

from lxml import etree
import requests


class QiuShiBK:
    """Crawl the paginated Qiushibaike text feed and save posts as JSON lines.

    Args:
        page_count: Number of listing pages to crawl, starting at page 1.
            Defaults to 13, the range the original script hard-coded.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, page_count=13, timeout=10):
        self.init_url = "https://www.qiushibaike.com/text/page/{}/"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.13 Safari/537.36'}
        self.page_count = page_count
        self.timeout = timeout

    def get_url_list(self):
        """Return the listing-page URLs for pages 1..page_count."""
        return [self.init_url.format(page) for page in range(1, self.page_count + 1)]

    def parse_url(self, url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server does not respond within the
                configured timeout.
        """
        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        return response.content.decode('UTF-8')

    @staticmethod
    def _first_text(node, path):
        """Return the first text match of *path* under *node*, stripped, or None."""
        matches = node.xpath(path)
        return matches[0].strip() if matches else None

    def get_content_list(self, html_str):
        """Extract one dict per post from a listing page's HTML.

        Each dict has the keys ``girl_name``, ``girl_content``, ``girl_laugh``
        and ``girl_comment``. Single-value fields are ``None`` when the
        corresponding node is missing, so one incomplete post cannot abort
        the whole crawl.
        """
        html = etree.HTML(html_str)
        if html is None:
            # The parser may hand back None for an empty document.
            return []

        content_list = []
        # Group by post: each direct child <div> of the main column is one post.
        for div in html.xpath("//div[@class='col1 old-style-col1']/div"):
            # The body may be split across several text nodes, so join them
            # all instead of taking only the first one.
            fragments = div.xpath(".//div[@class='content']/span/text()")
            content_list.append({
                "girl_name": self._first_text(div, ".//h2/text()"),
                "girl_content": "".join(part.strip() for part in fragments),
                "girl_laugh": self._first_text(div, ".//span[@class='stats-vote']/i/text()"),
                "girl_comment": self._first_text(div, ".//span[@class='stats-comments']//i/text()"),
            })
        return content_list

    def save_content_list(self, content_list):
        """Append every item of *content_list* to ``qiushi.txt`` as one JSON line."""
        with open('qiushi.txt', 'a', encoding='UTF-8') as f:
            for content in content_list:
                # ensure_ascii=False keeps the Chinese text readable in the file.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl every listing page: build URL, fetch, extract, save."""
        for url in self.get_url_list():
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiushi = QiuShiBK()
    qiushi.run()