'''
python获取51外包网所有投资合作商的公开信息
作者:Mac
时间:2020.7.1
----------------------------------------
这种爬虫属于:
根据一个网页的url去访问详情页的url,
对详情页的源码进行解析并获取想要的数据。
对于这种爬虫有一个简单的思路(三步):
第一步:先获取一页中(如图1),所有目标链接的url,
然后对这些url进行拼接
第二步:尝试解析详情页源代码(如图2),获取详情页中想要的数据
第三步:在一个逻辑函数当中循环执行第一、二步
'''
import json
import time
from urllib.parse import urljoin

import requests
from lxml import etree
# Site root; hrefs scraped from the listing page are appended to it.
BASE_URL = "http://www.51waibao.net/"

# Listing page that is requested to discover the detail-page links.
TARGET = BASE_URL + "Cooperation.aspx"

# Browser-like User-Agent header sent along with the scraper's requests.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    ),
}
def get_coos_index_url(page):
    """Return the detail-page URLs found on one page of the listing.

    The page number is posted to ``TARGET`` and the response of that same
    request is parsed, so ``page`` actually selects which listing page is
    scraped.

    Args:
        page: Value sent as the ``page`` form field.
            NOTE(review): whether the site counts pages from 0 or from 1 is
            not visible in this file -- confirm against the site.

    Returns:
        A list of absolute URLs, one for each ``xiangmu_item`` block that
        contains a link, in document order.
    """
    form = {"page": page}
    # The context manager closes the session (and its pooled connections)
    # once the listing HTML has been downloaded.
    with requests.Session() as session:
        response = session.post(url=TARGET, data=form, headers=HEADERS)
    listing = etree.HTML(response.content.decode())

    hrefs = []
    for item in listing.xpath('//div[@class="xiangmu_item"]'):
        links = item.xpath('.//a/@href')
        if not links:
            # An item without an anchor has no detail page to visit.
            continue
        # urljoin copes with relative, root-relative and absolute hrefs.
        hrefs.append(urljoin(BASE_URL, links[0]))
    return hrefs
def analy_coos_index_url(url):
    """Download one detail page and extract its public information.

    Args:
        url: URL of the detail page to fetch.

    Returns:
        A dict with three keys: '标题' holds the first ``<h1>`` text (or ''
        when there is no heading), '联系信息' holds every text node under the
        ``project_info_box`` list items, and '简介' holds the stripped,
        non-blank paragraph texts. The dict is empty when the page has no
        ``content_left`` container; if there are several containers, the
        values from the last one are kept.
    """
    response = requests.get(url=url, headers=HEADERS)
    page = etree.HTML(response.content.decode())

    coos_info = {}
    for section in page.xpath("//div[@class='content_left']"):
        headings = section.xpath(".//h1/text()")
        contact_info = section.xpath(
            ".//div[@class='project_info_box']/ul/li//text()"
        )
        # Strip each paragraph once, then drop the ones that were blank.
        paragraphs = (text.strip() for text in section.xpath(".//p/text()"))

        # Fall back to '' so a page without an <h1> does not raise IndexError.
        coos_info['标题'] = headings[0] if headings else ''
        coos_info['联系信息'] = contact_info
        coos_info['简介'] = [text for text in paragraphs if text]
    return coos_info
def save_as_json(data):
    """Append ``data`` to ``51coos_info.json`` as one JSON document per line.

    Args:
        data: Any object ``json.dumps`` can serialise.

    The text is written as UTF-8 regardless of the platform default, because
    ``ensure_ascii=False`` emits non-ASCII characters verbatim. Every record
    ends with a newline so consecutive records stay separable when the file
    is read back.
    """
    json_str = json.dumps(data, ensure_ascii=False)
    with open('51coos_info.json', 'a', encoding='utf-8') as fp:
        fp.write(json_str + '\n')
def main(total_pages=114):
    """Walk the listing pages and append every detail record to disk.

    Args:
        total_pages: Number of listing pages to request. Defaults to 114,
            the count this script was originally written with.

    Prints a progress line per listing page and the total running time once
    all pages have been processed.
    """
    start = time.time()
    for i in range(total_pages):
        print("第%d页" % (i + 1))
        # NOTE(review): the progress message is 1-based while the page value
        # passed on is 0-based -- confirm which numbering the site expects.
        for url in get_coos_index_url(page=i):
            save_as_json(analy_coos_index_url(url))
    end = time.time()
    print("完成!!运行时间:%.2f秒" % (end - start))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
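# Figure 1, Figure 2 and the result screenshot followed here in the original article (images not included).
# When reposting, please credit the original source: https://ipadbbs.8miu.com/read-18333.html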