from urllib.request import urlopen
import requests
import urllib.parse
from lxml import etree
from bs4 import BeautifulSoup
def huoquyeshu():
    """Read the total page count from the pager on the news index page."""
    response = urlopen('http://www.zjitc.net/xwzx/xyxw.htm')
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye114233"]/text()')
    # The last text node is the pager text; the three characters before the
    # final one hold the total page count (a fixed slice tied to this site).
    yeshu = result[-1][-4:-1]
    return yeshu
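# A sketch of a less position-sensitive variant, assuming the pager text ends
# in an 'X/Y' counter so the total is the digits after the '/'; that format is
# an assumption about this site, and huoquyeshu_regex is not in the original.
import re

def huoquyeshu_regex(pager_text):
    match = re.search(r'/(\d+)', pager_text)
    return match.group(1) if match else None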
def tran_url(urls):
    """Turn the listing's relative hrefs into absolute URLs."""
    xiuzheng_url = []
    for xiuzheng in urls:
        # str.lstrip() removes a *set of characters*, not a prefix, so the
        # original lstrip('../..') only worked by accident; slice instead.
        if xiuzheng.startswith('../../'):
            xiuzheng_url.append('http://www.zjitc.net/' + xiuzheng[6:])
        elif xiuzheng.startswith('../'):
            xiuzheng_url.append('http://www.zjitc.net/' + xiuzheng[3:])
    print(xiuzheng_url)
    return xiuzheng_url
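# A sketch of an alternative: urllib.parse.urljoin resolves '../' segments
# against the page a link was found on, which makes the hand-written prefix
# table above unnecessary. The base_url default and the function name are
# assumptions for illustration, not part of the original script.
def tran_url_joined(hrefs, base_url='http://www.zjitc.net/xwzx/xyxw.htm'):
    return [urllib.parse.urljoin(base_url, href) for href in hrefs]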
def neirong(url):
    """Fetch one article page and print its title, author, source, date and body."""
    if url == '':
        return
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    biaoti = soup.select('h2')[0].text
    # The .zz line reads like '作者:... 来源:... 发布时间:... 浏览...' on this site.
    zuozhe = soup.select('.zz')[0].text
    author = zuozhe.split()[0].replace('作者:', '', 1)
    resource = zuozhe.split()[1].replace('来源:', '', 1)
    # The original lstrip('发布时间:') stripped a character set, not the label;
    # take the last field before '浏' and drop the label explicitly.
    timesource = zuozhe.split('浏')[0].split()[-1].replace('发布时间:', '', 1)
    result = {}
    result['title'] = biaoti
    result['author'] = author
    result['resource'] = resource
    result['timesource'] = timesource
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#vsb_content p')[:-1]])
    print(result)
def shouye_spider():
    """Scrape the eight articles listed on the front page of the news index."""
    url = 'http://www.zjitc.net/xwzx/xyxw.htm'
    print('The front page is ' + url)
    # Fetch and parse the listing once; the original re-downloaded it on
    # every loop iteration and also parsed the URL string itself as HTML.
    data = requests.get(url)
    data.encoding = 'utf-8'
    s = etree.HTML(data.text)
    for d in range(0, 8):
        result_xpath_biaoti = '//*[@id="line_u29_' + str(d) + '"]/a/div[2]/div/h3/text()'
        result_xpath_url = '//*[@id="line_u29_' + str(d) + '"]/a/@href'
        print(s.xpath(result_xpath_biaoti))
        result_pachong = tran_url(s.xpath(result_xpath_url))
        neirong(''.join(result_pachong))
def fanye_spider(yeshu, paquyeshu):
    """Scrape paquyeshu pages of the archive, whose numbers count down from yeshu - 1."""
    yeshu = int(yeshu)
    for b in range(yeshu - paquyeshu + 1, yeshu):
        url = 'http://www.zjitc.net/xwzx/xyxw/' + str(b) + '.htm'
        print('Page ' + str(yeshu - b) + ' is ' + url)
        # Fetch and parse each listing page once, not once per article.
        data = requests.get(url)
        data.encoding = 'utf-8'
        s = etree.HTML(data.text)
        for d in range(0, 8):
            result_xpath_url = '//*[@id="line_u29_' + str(d) + '"]/a/@href'
            # Reuse tran_url() instead of repeating its prefix logic inline.
            xiuzheng_url = tran_url(s.xpath(result_xpath_url))
            neirong(''.join(xiuzheng_url))
if __name__ == '__main__':
    # int() instead of eval(): never eval raw user input.
    paquyeshu = int(input('How many pages to scrape: '))
    fanye_spider(huoquyeshu(), paquyeshu)
    shouye_spider()