import requests
from lxml import etree
import time


def page_url(num):
    """Follow the pager's 'Next' link to collect the base page plus `num` more listing-page URLs."""
    url_base = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    # Headers are defined once here; the original rebuilt them on every loop iteration.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/80.0.3987.149 Safari/537.36'}
    url_list = [url_base]
    for i in range(num):
        response = requests.get(url_base, headers=headers, timeout=5)
        response.encoding = 'utf-8'
        root = etree.HTML(response.text)
        # The 'Next' href is relative; resolve it against the section directory.
        next_url = ''.join(root.xpath("//a[@class='Next'][1]/@href"))
        if next_url.startswith('jyxw'):
            url_base = 'http://www.zjvtit.edu.cn/xwzx/' + next_url
        else:
            url_base = 'http://www.zjvtit.edu.cn/xwzx/jyxw/' + next_url
        url_list.append(url_base)
    return url_list
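# A note on the URL stitching above: the two-branch concatenation follows the
# original code. A sketch of an alternative (not in the original) using the
# standard library, which resolves any relative href against the page it was
# found on:
#
#   from urllib.parse import urljoin
#   url_base = urljoin(url_base, next_url)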
def page_new_url(url):
    """Collect the article links from one listing page."""
    url_base = 'http://www.zjvtit.edu.cn/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/80.0.3987.149 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=5)
    response.encoding = 'utf-8'
    root = etree.HTML(response.text)
    url_list = []
    for href in root.xpath("//div[@class='list-cont-ul fr']/ul/li/a/@href"):
        # The hrefs are relative ('../../info/...'); strip the leading dots and
        # slashes and graft the rest onto the site root. The original had an
        # if/else on startswith('../../') whose two branches were identical,
        # so it is collapsed here.
        url_list.append(url_base + href.lstrip('./'))
    # All pages except the first trim their last 19 links, presumably repeated
    # sidebar entries. The original compared against a URL from a different
    # site (www.wzvcst.cn), apparently left over from copied code; this
    # section's own first page is assumed here instead.
    if url != 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm':
        url_list = url_list[:-19]
    return url_list
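# The original source hints, in a commented-out line, at de-duplicating the
# collected URLs. A minimal sketch of that step (the helper name `dedupe` is
# my addition, not from the original); dict.fromkeys keeps insertion order on
# Python 3.7+ while dropping repeats:
def dedupe(urls):
    return list(dict.fromkeys(urls))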
def spider(url):
    """Fetch one article page and extract its title, publish date, and body text."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/80.0.3987.149 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=5)
    response.encoding = 'utf-8'
    root = etree.HTML(response.text)
    title = ''.join(root.xpath("//div[@class='zw-title']/text()"))
    # Renamed from `time` in the original, which shadowed the time module.
    pub_time = ''.join(root.xpath("//div[@class='zw-other']/text()"))
    # Remove the '发布时间:' ("publish time:") label and the trailing &nbsp;.
    # The original used lstrip('发布时间:'), which strips a character set, not
    # a prefix; replace() does what was intended.
    pub_time = pub_time.replace('发布时间:', '').rstrip('\xa0').strip()
    content = ''.join(root.xpath("//div[@id='vsb_content_2']//text()"))
    content = content.replace(',', '').replace('\r', ' ').replace('\n', ' ').lstrip(' ')
    item = [title, pub_time, content]
    return item


if __name__ == '__main__':
    starttime = time.time()
    items = []  # renamed from `list` in the original, which shadowed the built-in
    num = int(input('How many pages to crawl? '))  # prompt translated from Chinese
    # The original called page_url(num) a second time in the for-statement,
    # re-crawling every listing page; iterate the stored result instead.
    url_list = page_url(num)
    for page in url_list:
        for news_url in page_new_url(page):
            print(news_url)
            items.append(spider(news_url))
    # url_list = dedupe(url_list)  # optional, from a commented-out line in the original: drop duplicate URLs
    # '交院' in the filename is the college's short name.
    with open('交院.txt', 'w+', encoding='utf-8') as file:
        for item in items:
            file.write(str(item) + '\n')
    # No explicit close() needed: the with-block closes the file.
    endtime = time.time()
    print(endtime - starttime)
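# One possible refactor (an assumption, not in the original): share a single
# requests.Session so the User-Agent is set once and the underlying TCP
# connection is reused across the many requests this script makes:
#
#   session = requests.Session()
#   session.headers.update({'User-Agent': 'Mozilla/5.0 ...'})
#   response = session.get(url, timeout=5)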