把爵迹弄下来看看

技术2023-11-12 64

import pdb
import requests
import schedule
import spacy
import time
from lxml import etree

# Index page of the novel; section links found on it are relative to this URL.
BASE_URL = "http://www.mhtwx.la/101/101877/"

# Seconds to wait for the server. Without a timeout, requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download *url* and return the page parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing XPath match.
    response.raise_for_status()
    # Pass raw bytes so lxml can detect the page encoding itself.
    return etree.HTML(response.content)


def text_info(url):
    """Return the concatenated text of one section page.

    Args:
        url: Absolute URL of the section page.

    Returns:
        All text nodes under the content container, joined into one string.

    Raises:
        IndexError: if the page has no element at the expected XPath.
        requests.RequestException: if the download fails.
    """
    html = _fetch_html(url)
    # Positional XPath tied to the current page layout of the site.
    text_node = html.xpath('//html/body/div[3]/div[6]')[0]
    return "".join(text_node.itertext())


def get_directory():
    """Fetch the index page and return the table of contents.

    The index is a ``<dl>`` in which each ``<dt>`` starts a chapter and the
    following non-``<dt>`` children hold one ``<a>`` link per section.

    Returns:
        A list shaped like::

            [
                {
                    "chapter": "",
                    "section": [
                        {"name": "", "url": ""},
                        ...
                    ]
                },
                ...
            ]

    Raises:
        IndexError: if the index page has no matching ``<dl>``.
        requests.RequestException: if the download fails.
    """
    html = _fetch_html(BASE_URL)
    all_directory = html.xpath('//*[@id="novel101877"]/dl')[0]

    ret = []
    section = []
    chapter_name = ""
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for child in all_directory:
        if child.tag == "dt":
            # A new chapter heading closes the previous chapter, but only if
            # that chapter had both a name and at least one section.
            if section and chapter_name:
                ret.append({"chapter": chapter_name, "section": section})
                section = []
            chapter_name = child.text
            continue

        links = child.xpath("a")
        if not links:
            # Entry without a link (e.g. a spacer or comment): nothing to fetch.
            continue
        link = links[0]
        section.append({
            "name": link.text,
            "url": BASE_URL + link.get("href"),
        })

    # Flush the last chapter; no further <dt> follows it.
    ret.append({"chapter": chapter_name, "section": section})
    return ret


def run():
    """Download every section's text, then open a debugger to inspect it.

    Prints each chapter name as it is processed and stores the downloaded
    text under the ``"text"`` key of each section dict, so every section ends
    up as ``{"name": "", "url": "", "text": ""}``.
    """
    directory = get_directory()
    for chapter in directory:
        print(chapter["chapter"])
        for section in chapter["section"]:
            section["text"] = text_info(section["url"])
    # Drop into the debugger so `directory` can be inspected interactively.
    # NOTE(review): the original source lost its indentation, so the intended
    # nesting of this breakpoint is unknown; confirm it belongs after the loop.
    pdb.set_trace()


if __name__ == "__main__":
    run()

Processed: 0.009, SQL: 9