from bs4 import BeautifulSoup
import urllib.request
import requests
from lxml import etree
import json
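# Scraper for the news list at www.zjvtit.edu.cn/xwzx/jyxw.htm: it collects the
# article links on the front list page plus a user-chosen number of archive
# pages, downloads each article, and writes one JSON record per article to
# zjjt.txt.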
# One browser-like User-Agent shared by every request, so the site does not
# reject the scraper; defined once instead of repeated in each function.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

def headers_page(url):
    """Open `url` with the shared headers and return the request/response pair."""
    request = urllib.request.Request(url, headers=HEADERS)
    response = urllib.request.urlopen(request)
    return request, response
def tran_url(url):
    """Turn the relative hrefs scraped from a list page into absolute URLs."""
    xiuzheng_url = []
    for xiuzheng in url:
        # str.lstrip() strips a *set of characters*, not a prefix, so slice
        # the relative prefix off instead.
        if xiuzheng.startswith('../../'):
            xiuzheng_url.append('http://www.zjvtit.edu.cn/' + xiuzheng[len('../../'):])
        elif xiuzheng.startswith('../'):
            xiuzheng_url.append('http://www.zjvtit.edu.cn/' + xiuzheng[len('../'):])
        else:
            print('Nothing to fix')
    print(xiuzheng_url)
    return xiuzheng_url
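# Example (the href below is illustrative, not taken from the live site):
#   tran_url(['../../info/1015/26908.htm'])
#   -> ['http://www.zjvtit.edu.cn/info/1015/26908.htm']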
def huoquyeshu_page(url):
    """Read the pagination widget on the list page and return the total page count."""
    request = urllib.request.Request(url, headers=HEADERS)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye126908"]/text()')
    # The page count sits in the three characters before the final one of the
    # last text node (e.g. '1/123页' -> '123').
    yeshu = result[-1][-4:-1]
    return yeshu
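# Assuming the pagination text ends in something like '1/123页', the call
#   huoquyeshu_page('http://www.zjvtit.edu.cn/xwzx/jyxw.htm')
# returns the string '123'; fanye_page() converts it to an int.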
def neirong(url):
    """Download one article page; return its title, publish time and body text."""
    if url == '':
        # tran_url() can yield nothing for a given list item; signal that
        # explicitly instead of falling through to an undefined `result`.
        return None
    res = requests.get(url, headers=HEADERS)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    biaoti = soup.select('.zw-title')[0].text
    time = soup.select('.zw-other')[0].text
    result = {}
    # Keep only the publish date: drop everything from '浏' (浏览次数, the view
    # counter) onward, then strip the leading '发布时间:' label characters.
    timesource = time.split('浏')[0].strip().lstrip('发布时间:')
    result['title'] = biaoti
    result['timesource'] = timesource
    # Join the paragraph texts of the article body, skipping the last <p>.
    result['article'] = ' '.join([p.text.strip() for p in soup.select('.zw p')[:-1]])
    return result
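# A returned record looks like this (field values are illustrative):
#   {'title': '...', 'timesource': '2020-06-28', 'article': '...'}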
def shouye_spider():
    """Scrape every article linked from the front (newest) list page."""
    list_content_1 = []
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('The front page is ' + url)
    request = urllib.request.Request(url, headers=HEADERS)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    # The list page carries 20 items in elements with ids 'line_u9_0'..'line_u9_19';
    # fetch the page once and query it per item instead of re-downloading it.
    for d in range(0, 20):
        result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
        result_pachong = s.xpath(result_xpath_url)
        print(result_pachong)
        result_pachong = tran_url(result_pachong)
        list_content_1.append(neirong("".join(result_pachong)))
    return list_content_1
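# If an id 'line_u9_<d>' is missing from the page, the XPath query yields an
# empty list, neirong() receives an empty URL and returns None; those None
# records are filtered out when the results are written in __main__.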
def fanye_page(yeshu, paquyeshu):
    """Scrape the numbered archive list pages and return their article records."""
    list_content_2 = []
    for b in range(int(yeshu) - paquyeshu + 1, int(yeshu)):
        # Archive pages live at .../xwzx/jyxw/<number>.htm
        url = 'http://www.zjvtit.edu.cn/xwzx/jyxw/' + str(b) + '.htm'
        print('Page ' + str(int(yeshu) - b) + ' is ' + url)
        request = urllib.request.Request(url, headers=HEADERS)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        # As on the front page, each archive page lists 20 items.
        for d in range(0, 20):
            result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
            result_pachong = s.xpath(result_xpath_url)
            xiuzheng_url = tran_url(result_pachong)
            list_content_2.append(neirong("".join(xiuzheng_url)))
    return list_content_2
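# Typical call chain, mirroring __main__ below:
#   yeshu = huoquyeshu_page(url)        # total page count, e.g. '123'
#   fanye_page(yeshu, paquyeshu)        # the numbered archive pages
#   shouye_spider()                     # the front page, which has no number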
if __name__ == '__main__':
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('Scraping news from Zhejiang Institute of Communications')
    # int() instead of eval(): eval on raw user input is unsafe.
    paquyeshu = int(input('How many pages to scrape: '))
    list_content_fanye = []
    list_content_shouye = []
    list_content_fanye.append(fanye_page(huoquyeshu_page(url), paquyeshu))
    print(list_content_fanye)
    list_content_shouye.append(shouye_spider())
    print(list_content_shouye)
    # One JSON object per line; skip None records from items without a link.
    # The `with` block closes the file, so no explicit file.close() is needed.
    with open('zjjt.txt', 'w', encoding='utf-8') as file:
        for i in list_content_shouye[0]:
            if i:
                file.write(json.dumps(i, ensure_ascii=False) + '\n')
        for j in list_content_fanye[0]:
            if j:
                file.write(json.dumps(j, ensure_ascii=False) + '\n')