import requests
from lxml import etree
import xlwt


def page_url(num):  # collect the URL of each listing page and return them
    url_list = []
    url_base = 'http://www.wzvcst.cn/xwzx/xyyw.htm'
    url_list.append(url_base)
    for i in range(num - 1):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
        response = requests.get(url_base, headers=headers, timeout=5)
        response.encoding = 'utf-8'
        html = response.text
        root = etree.HTML(html)
        next_url = root.xpath("//a[@class='Next'][1]/@href")
        next_url = ''.join(next_url)
        # the "Next" href is relative; resolve it against the right directory
        if next_url.startswith('xyyw'):
            url_base = 'http://www.wzvcst.cn/xwzx/' + next_url
        else:
            url_base = 'http://www.wzvcst.cn/xwzx/xyyw/' + next_url
        url_list.append(url_base)
    return url_list


def page_news_url(url):  # collect the URL of every news item on one listing page
    url_list = []
    url_base = 'http://www.wzvcst.cn/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=5)
    response.encoding = 'utf-8'
    html = response.text
    root = etree.HTML(html)
    url_l = root.xpath("//ul[@class='newslist']/li/a/@href")
    for i in url_l:
        url_a = url_base + i.lstrip('./')  # drop the leading './' or '../'
        url_list.append(url_a)
    if url != 'http://www.wzvcst.cn/xwzx/xyyw.htm':
        url_list = url_list[:-3]  # pages after the first repeat three extra links at the end
    return url_list
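
# Sketch (not part of the original script): the if/else above resolves the
# relative "Next" href by hand; urllib.parse.urljoin from the standard
# library does the same resolution. The href values below are illustrative
# only, not taken from the live site.
from urllib.parse import urljoin


def demo_urljoin():
    page = 'http://www.wzvcst.cn/xwzx/xyyw.htm'
    print(urljoin(page, 'xyyw/2.htm'))    # http://www.wzvcst.cn/xwzx/xyyw/2.htm
    print(urljoin(page, './info/1.htm'))  # http://www.wzvcst.cn/xwzx/info/1.htm
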
def spider(url):  # fetch one news page, extract its fields, and return them
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=5)
    response.encoding = 'utf-8'
    html = response.text
    root = etree.HTML(html)
    title = root.xpath("//h1/text()")
    title = ''.join(title)
    zz = root.xpath("//div[@class='sm']/text()")  # date / source line
    # collapse non-breaking spaces, then strip the trailing '浏览次数:' label
    zz = ''.join(zz).replace('\n', '').replace('\r', '').replace('\xa0', ' ').rstrip('浏览次数:')
    content = root.xpath("//div[@class='v_news_content']//text()")
    content = ''.join(content).replace('\n', '').replace('\r', '').lstrip()
    item = [title, zz, content]
    return item
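
# Sketch (not part of the original script): a self-contained check of the
# XPath patterns used in spider(); the HTML snippet is made up, not a real
# page from the site.
def demo_xpath():
    sample = (
        '<html><body>'
        '<h1>Example title</h1>'
        '<div class="sm">2020-04-01 浏览次数:</div>'
        '<div class="v_news_content"><p>First paragraph.</p><p>Second.</p></div>'
        '</body></html>'
    )
    demo_root = etree.HTML(sample)
    print(demo_root.xpath("//h1/text()"))                # ['Example title']
    print(demo_root.xpath("//div[@class='sm']/text()"))  # ['2020-04-01 浏览次数:']
    print(demo_root.xpath("//div[@class='v_news_content']//text()"))  # ['First paragraph.', 'Second.']
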
def save_txt(data):  # write every scraped item to a text file, one per line
    with open('温科院.txt', 'w+', encoding='utf-8') as file:
        for i in data:
            file.write(''.join(i) + '\n')


def save_excel(data):  # write the scraped items to a legacy .xls workbook
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('温科院')
    col = ('标题', '信息', '内容')  # headers: title / info / content
    for i in range(3):  # write all three column headers
        sheet.write(0, i, col[i])
    for i in range(len(data)):
        row = data[i]
        sheet.write(i + 1, 0, row[0])
        sheet.write(i + 1, 1, row[1])
        sheet.write(i + 1, 2, row[2])
    book.save('温科院.xls')
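
# Sketch (not part of the original script): the same rows written with the
# standard-library csv module instead of xlwt; 'utf-8-sig' adds a BOM so the
# Chinese headers open correctly in Excel. The file name mirrors save_excel.
import csv


def save_csv(data):
    with open('温科院.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '信息', '内容'])  # title / info / content
        writer.writerows(data)
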
if __name__ == '__main__':
    num = int(input('请输入要爬取的页数:'))  # prompt: number of pages to crawl
    url_list = []  # URL of every news item
    data = []      # scraped rows
    for i in page_url(num):
        for j in page_news_url(i):
            url_list.append(j)
    url_list = list(dict.fromkeys(url_list))  # drop duplicate URLs, keep order
    for i in url_list:  # fetch each news page and collect its fields
        data.append(spider(i))
    save_txt(data)
    # save_excel(data)
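
# Sketch (not part of the original script): the same crawl loop, but pausing
# between requests and skipping pages that fail to download; the 1-second
# delay is an arbitrary choice, not something the site requires.
def crawl_politely(url_list, data):
    import time
    for i in url_list:
        try:
            data.append(spider(i))
        except requests.RequestException as e:
            print('skipped:', i, e)
        time.sleep(1)  # be gentle with the server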