Scraping the Zhejiang Institute of Communications news and writing it to a file

Tech 2022-07-10
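The script below crawls the news listing (xwzx/jyxw) on www.zjvtit.edu.cn. It first reads the pager to learn how many listing pages exist, then walks the front page plus however many archive pages you ask for, fixes up the relative article links, fetches each article, pulls out the title, publish time and body text with BeautifulSoup, and finally writes one JSON object per article to zjjt.txt.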

    # Crawler for the news section of www.zjvtit.edu.cn.
    from bs4 import BeautifulSoup
    import urllib.request
    import requests
    from lxml import etree
    import json

    # Browser User-Agent shared by every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}


    def headers_page(url):
        # Open a URL with the browser User-Agent; returns the request/response pair.
        request = urllib.request.Request(url, headers=HEADERS)
        response = urllib.request.urlopen(request)
        return request, response


    def tran_url(hrefs):
        # Turn the relative hrefs found on a listing page into absolute URLs.
        # Note: str.lstrip() removes a *set* of characters ('.' and '/'), not a
        # prefix; it works here only because the real path starts with a letter.
        xiuzheng_url = []
        for xiuzheng in hrefs:
            if xiuzheng.startswith('../..'):
                xiuzheng_url.append('http://www.zjvtit.edu.cn//' + xiuzheng.lstrip('../..'))
            elif xiuzheng.startswith('../'):
                xiuzheng_url.append('http://www.zjvtit.edu.cn//' + xiuzheng.lstrip('../'))
            else:
                # Already absolute (or otherwise fine): keep it unchanged.
                xiuzheng_url.append(xiuzheng)
        print(xiuzheng_url)
        return xiuzheng_url


    def huoquyeshu_page(url):
        # Read the pager element on the listing page and return the total number
        # of pages as a string of digits.
        request = urllib.request.Request(url, headers=HEADERS)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        result = s.xpath('//*[@id="fanye126908"]/text()')
        yeshu = ''
        for c in result:
            # The page count sits in the last few characters of the pager text.
            yeshu = c[-4:-1]
        return yeshu


    def neirong(url):
        # Fetch one article page and return its title, publish time and body.
        if url == '':
            return None
        res = requests.get(url, headers=HEADERS)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        biaoti = soup.select('.zw-title')[0].text
        time_text = soup.select('.zw-other')[0].text
        # Strip the '发布时间:' label and drop the view counter that follows '浏'.
        timesource = time_text.split('浏')[0].strip().lstrip('发布时间:')
        result = {}
        result['title'] = biaoti
        result['timesource'] = timesource
        # Join every paragraph of the body except the trailing one.
        result['article'] = ' '.join([p.text.strip() for p in soup.select('.zw p')[:-1]])
        return result


    def shouye_spider():
        # Crawl the first (newest) listing page and collect every article on it.
        list_content_1 = []
        url = 'http://www.zjvtit.edu.cn//xwzx/jyxw.htm'
        print('First page: ' + url)
        request = urllib.request.Request(url, headers=HEADERS)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        for d in range(0, 20):
            # Each news item lives in an element with id 'line_u9_<d>'.
            result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
            result_pachong = s.xpath(result_xpath_url)
            print(result_pachong)
            result_pachong = tran_url(result_pachong)
            content = neirong("".join(result_pachong))
            if content:
                list_content_1.append(content)
        return list_content_1


    def fanye_page(yeshu, paquyeshu):
        # Crawl the older (archive) listing pages.  Archive page b is served as
        # b.htm; the print below labels it with its position (yeshu - b) counting
        # back from the newest page.
        list_content_2 = []
        yeshu = int(yeshu)
        for b in range(yeshu - paquyeshu + 1, yeshu):
            url = 'http://www.zjvtit.edu.cn//xwzx/jyxw/' + str(b) + '.htm'
            print('Page ' + str(yeshu - b) + ': ' + url)
            request = urllib.request.Request(url, headers=HEADERS)
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
            s = etree.HTML(html)
            for d in range(0, 20):
                result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
                result_pachong = s.xpath(result_xpath_url)
                xiuzheng_url = tran_url(result_pachong)
                content = neirong("".join(xiuzheng_url))
                if content:
                    list_content_2.append(content)
        return list_content_2


    if __name__ == '__main__':
        url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
        print('Crawling news from Zhejiang Institute of Communications (浙江交通职业技术学院)')
        paquyeshu = int(input('How many pages to crawl: '))
        list_content_fanye = fanye_page(huoquyeshu_page(url), paquyeshu)
        print(list_content_fanye)
        list_content_shouye = shouye_spider()
        print(list_content_shouye)
        # Write one JSON object per article, one per line, to zjjt.txt.
        with open('zjjt.txt', 'w', encoding='utf-8') as file:
            for i in list_content_shouye:
                file.write(json.dumps(i, ensure_ascii=False) + '\n')
            for j in list_content_fanye:
                file.write(json.dumps(j, ensure_ascii=False) + '\n')
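One fragile spot worth calling out: str.lstrip in tran_url strips a character set ('.' and '/'), not a literal prefix, so it only behaves as intended because the article paths happen to start with a letter. A sturdier alternative is the standard library's urllib.parse.urljoin, which resolves a relative href against the page it was found on. A minimal sketch, assuming the hrefs come from the front listing page; the function name, base_url default, and example path below are mine, not part of the original script:

    from urllib.parse import urljoin

    def tran_url_joined(hrefs, base_url='http://www.zjvtit.edu.cn/xwzx/jyxw.htm'):
        # Resolve each href (relative or already absolute) against the listing page URL.
        return [urljoin(base_url, href) for href in hrefs]

    # Hypothetical example:
    # tran_url_joined(['../../info/1017/12345.htm'])
    # -> ['http://www.zjvtit.edu.cn/info/1017/12345.htm']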
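Because each line of zjjt.txt holds one standalone JSON object, the results are easy to load back for later processing. A minimal read-back sketch, assuming the file was produced by the script above:

    import json

    with open('zjjt.txt', encoding='utf-8') as f:
        articles = [json.loads(line) for line in f if line.strip()]

    for article in articles:
        # 'title' and 'timesource' are the keys the crawler writes.
        print(article['timesource'], article['title'])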