from urllib.request import urlopen
import requests
import urllib.parse
from lxml import etree
from bs4 import BeautifulSoup
def huoquyeshu():
    """Read the total page count from the pager on the news index page."""
    response = urlopen('http://www.zjitc.net/xwzx/xyxw.htm')
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye114233"]/text()')
    # The last text node is the pager text; the three characters before the
    # final one hold the total page count (a fixed slice tied to this site).
    yeshu = result[-1][-4:-1]
    return yeshu
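# A sketch of a less position-sensitive variant, assuming the pager text ends
# in an 'X/Y' counter so the total is the digits after the '/'; that format is
# an assumption about this site, and huoquyeshu_regex is not in the original.
import re

def huoquyeshu_regex(pager_text):
    match = re.search(r'/(\d+)', pager_text)
    return match.group(1) if match else None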
def tran_url(urls):
    """Turn the listing's relative hrefs into absolute URLs."""
    xiuzheng_url = []
    for xiuzheng in urls:
        # str.lstrip() removes a *set of characters*, not a prefix, so the
        # original lstrip('../..') only worked by accident; slice instead.
        if xiuzheng.startswith('../../'):
            xiuzheng_url.append('http://www.zjitc.net/' + xiuzheng[6:])
        elif xiuzheng.startswith('../'):
            xiuzheng_url.append('http://www.zjitc.net/' + xiuzheng[3:])
    print(xiuzheng_url)
    return xiuzheng_url
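# A sketch of an alternative: urllib.parse.urljoin resolves '../' segments
# against the page a link was found on, which makes the hand-written prefix
# table above unnecessary. The base_url default and the function name are
# assumptions for illustration, not part of the original script.
def tran_url_joined(hrefs, base_url='http://www.zjitc.net/xwzx/xyxw.htm'):
    return [urllib.parse.urljoin(base_url, href) for href in hrefs]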
def neirong(url):
    """Fetch one article page and print its title, author, source, date and body."""
    if url == '':
        return
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    biaoti = soup.select('h2')[0].text
    # The .zz line reads like '作者:... 来源:... 发布时间:... 浏览...' on this site.
    zuozhe = soup.select('.zz')[0].text
    author = zuozhe.split()[0].replace('作者:', '', 1)
    resource = zuozhe.split()[1].replace('来源:', '', 1)
    # The original lstrip('发布时间:') stripped a character set, not the label;
    # take the last field before '浏' and drop the label explicitly.
    timesource = zuozhe.split('浏')[0].split()[-1].replace('发布时间:', '', 1)
    result = {}
    result['title'] = biaoti
    result['author'] = author
    result['resource'] = resource
    result['timesource'] = timesource
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#vsb_content p')[:-1]])
    print(result)
def shouye_spider():
    """Scrape the eight articles listed on the front page of the news index."""
    url = 'http://www.zjitc.net/xwzx/xyxw.htm'
    print('The front page is ' + url)
    # Fetch and parse the listing once; the original re-downloaded it on
    # every loop iteration and also parsed the URL string itself as HTML.
    data = requests.get(url)
    data.encoding = 'utf-8'
    s = etree.HTML(data.text)
    for d in range(0, 8):
        result_xpath_biaoti = '//*[@id="line_u29_' + str(d) + '"]/a/div[2]/div/h3/text()'
        result_xpath_url = '//*[@id="line_u29_' + str(d) + '"]/a/@href'
        print(s.xpath(result_xpath_biaoti))
        result_pachong = tran_url(s.xpath(result_xpath_url))
        neirong(''.join(result_pachong))
def fanye_spider(yeshu, paquyeshu):
    """Scrape paquyeshu pages of the archive, whose numbers count down from yeshu - 1."""
    yeshu = int(yeshu)
    for b in range(yeshu - paquyeshu + 1, yeshu):
        url = 'http://www.zjitc.net/xwzx/xyxw/' + str(b) + '.htm'
        print('Page ' + str(yeshu - b) + ' is ' + url)
        # Fetch and parse each listing page once, not once per article.
        data = requests.get(url)
        data.encoding = 'utf-8'
        s = etree.HTML(data.text)
        for d in range(0, 8):
            result_xpath_url = '//*[@id="line_u29_' + str(d) + '"]/a/@href'
            # Reuse tran_url() instead of repeating its prefix logic inline.
            xiuzheng_url = tran_url(s.xpath(result_xpath_url))
            neirong(''.join(xiuzheng_url))
if __name__ == '__main__':
    # int() instead of eval(): never eval raw user input.
    paquyeshu = int(input('How many pages to scrape: '))
    fanye_spider(huoquyeshu(), paquyeshu)
    shouye_spider()