07

    技术2022-07-11  89

    注意事项:

    # Notes:
    # - Pagination follows the "next page" link found on each page instead of
    #   computing page offsets by index.
    # - The output file is opened with encoding='utf-8': thread titles contain
    #   non-ASCII text, and a non-UTF-8 platform default encoding can raise
    #   UnicodeEncodeError on write.
    # - An f-string evaluates to an ordinary str, so it can be written to a
    #   file exactly like any other string (it is not limited to print()).
    from urllib.parse import urljoin

    import requests
    from lxml import etree


    class TieBa(object):
        """Collect thread titles and links from a Baidu Tieba /f/good listing.

        Attributes:
            url: URL of the page that the next ``get_data`` call will fetch.
            headers: HTTP headers sent with every request.
            timeout: Per-request timeout in seconds.
            data_list: Mapping of thread title -> absolute thread URL. A later
                thread with the same title overwrites the earlier entry.
        """

        BASE_URL = 'https://tieba.baidu.com'

        def __init__(self, name, timeout=10):
            """Prepare a crawler for the forum called *name*.

            Args:
                name: Forum name, appended to the ``kw`` query parameter.
                timeout: Seconds to wait for each HTTP request before giving up.
            """
            self.url = self.BASE_URL + '/f/good?kw=' + name
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
            }
            self.timeout = timeout
            self.data_list = {}

        def get_data(self):
            """Fetch ``self.url`` and return the raw response body as bytes.

            Raises:
                requests.RequestException: On connection failure, timeout, or a
                    4xx/5xx response status.
            """
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(
                self.url, headers=self.headers, timeout=self.timeout
            )
            # Fail loudly on an error page instead of parsing it as an empty list.
            response.raise_for_status()
            return response.content

        def parse_data(self, data):
            """Extract thread entries from one page and locate the next page.

            Every ``<a class="j_th_tit">`` element that has both a text node and
            an ``href`` is stored in ``self.data_list``.

            Args:
                data: Raw page bytes as returned by ``get_data``.

            Returns:
                The ``href`` of the first ``<a class="next">`` element, or
                ``None`` when the page has no such link.
            """
            # Undecodable bytes are replaced so one bad byte cannot abort the crawl.
            html = etree.HTML(data.decode('utf-8', errors='replace'))
            if html is None:
                # NOTE(review): guards against the parser yielding no tree for
                # an empty document - confirm against the lxml version in use.
                return None

            for anchor in html.xpath('//a[@class="j_th_tit"]'):
                titles = anchor.xpath('./text()')
                hrefs = anchor.xpath('./@href')
                if not titles or not hrefs:
                    # Skip anchors with no direct text or no link target.
                    continue
                self.data_list[titles[0]] = urljoin(self.BASE_URL, hrefs[0])

            next_links = html.xpath('//a[@class="next"]/@href')
            return next_links[0] if next_links else None

        def run(self, output_path='tieba.txt'):
            """Crawl every page of the listing, then save the collected entries.

            Args:
                output_path: File that receives one ``title,url`` line per thread.
            """
            while True:
                next_url = self.parse_data(self.get_data())
                if not next_url:
                    break
                # urljoin resolves root-relative, protocol-relative and absolute
                # hrefs against the page that was just fetched.
                self.url = urljoin(self.url, next_url)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.writelines(
                    f'{title},{link}\n' for title, link in self.data_list.items()
                )


    if __name__ == "__main__":
        tieba = TieBa('一击男')
        tieba.run()
    Processed: 0.012, SQL: 9