requests库+re正则表达式爬取并解析古诗文网

    技术2022-07-15  69

    # requests + re: `requests` downloads the pages, `re` parses the HTML.
import re

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10

URL_TEMPLATE = 'https://www.gushiwen.org/default_{}.aspx'
FIRST_PAGE = 1
LAST_PAGE = 50

# A `?` after a quantifier makes it non-greedy.
# re.DOTALL lets `.` match '\n' as well (by default it does not), which is
# needed because the matched HTML fragments span several lines.
TITLE_RE = re.compile(r'<div\sclass="cont">.*?<b>(.*?)</b>', re.DOTALL)
DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
CONTENT_RE = re.compile(r'<div class="contson" .*?>(.*?)</div>', re.DOTALL)
# Any remaining inline tag (e.g. <br />) inside a poem body.
TAG_RE = re.compile(r'<.*?>')


def extract_poems(text):
    """Extract poems from the HTML source of one listing page.

    Args:
        text: HTML of the page as a string.

    Returns:
        A list of dicts with the keys 'title', 'dynasties', 'authors' and
        'contents', one dict per poem, in page order.

    Note:
        The four fields are matched independently and paired with zip(),
        which stops at the shortest list. If a page yields a different
        number of matches per field, trailing entries are dropped and
        fields may be paired with the wrong poem.
    """
    titles = TITLE_RE.findall(text)
    dynasties = DYNASTY_RE.findall(text)
    authors = AUTHOR_RE.findall(text)
    # Strip the inline markup from each poem body, then trim whitespace.
    contents = [
        TAG_RE.sub('', fragment).strip()
        for fragment in CONTENT_RE.findall(text)
    ]

    return [
        {
            'title': title,
            'dynasties': dynasty,
            'authors': author,
            'contents': content,
        }
        for title, dynasty, author, content in zip(
            titles, dynasties, authors, contents
        )
    ]


def parse(url):
    """Download one listing page and print every poem found on it.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within REQUEST_TIMEOUT seconds.
    """
    # Imported here so extract_poems() stays usable without the
    # third-party dependency installed.
    import requests

    # `headers` must be passed by keyword: the second positional argument
    # of requests.get() is `params` (the query string), not the headers.
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    for poem in extract_poems(response.text):
        print(poem)
        print('~' * 100)


def main():
    """Crawl listing pages FIRST_PAGE..LAST_PAGE and print their poems."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        parse(URL_TEMPLATE.format(page))


if __name__ == '__main__':
    main()

     

    Processed: 0.010, SQL: 9