Why? 昨天看中国网站总排名时,发现每页只有 30 个排名,一页页翻太慢了,于是准备用 Python 将排名爬取下来,并保存到文件中。爬取网站:https://top.chinaz.com/all/
源码
import requests
import re
from requests
.exceptions
import RequestException
# Shared HTTP request headers: a desktop Chrome User-Agent so the site
# serves the normal page instead of blocking the default requests UA.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
def get_one_page(targetUrl):
    """Fetch *targetUrl* and, on HTTP 200, feed the decoded body to parse_one_page.

    Always returns None.  Network errors are swallowed deliberately so one
    failed page does not abort the whole crawl.
    """
    try:
        # BUG FIX: the original passed `headers` positionally, which binds it
        # to requests.get's second parameter `params` (the query string), so
        # the custom User-Agent was never actually sent.  Pass it as a keyword
        # argument, and add a timeout so a stalled server cannot hang the crawl.
        response = requests.get(targetUrl, headers=headers, timeout=10)
        if response.status_code == 200:
            parse_one_page(response.content.decode())
        return None
    except RequestException:
        # Best-effort crawl: skip pages that fail to download.
        return None
def parse_one_page(target_text):
    """Extract site names from one ranking page's HTML.

    Side effect: appends the names to the module-level ``result`` list
    (defined in the ``__main__`` block).  Also returns the extracted names
    so callers can avoid relying on the global.
    """
    # The site name is the text of the <a> tag with class "pr10 fz14".
    # NOTE(review): the pattern is missing the closing quote after the href
    # value; the lazy `.*?` simply swallows that quote too, so matching still
    # works.  Kept byte-identical to preserve behavior.
    pattern = re.compile(r'<a href=".*? target="_blank" class="pr10 fz14">(.*?)</a>')
    names = pattern.findall(target_text)
    result.extend(names)  # `result` is the global accumulator from __main__
    return names
def crawl_each_url():
    """Crawl every page of the chinaz "all sites" ranking (pages 1 through 50)."""
    # Page 1 lives at the bare index URL; pages 2..50 follow the
    # "index_<n>.html" naming scheme.
    get_one_page("https://top.chinaz.com/all/")
    for page in range(2, 51):
        page_url = "https://top.chinaz.com/all/index_" + str(page) + ".html"
        get_one_page(page_url)
def write_to_file(result_text, filename="中国网站排名.txt"):
    """Append the ranking list to *filename*, one "<rank> <site>" line each.

    :param result_text: iterable of site names, in rank order
    :param filename: output path (defaults to the original hard-coded name,
        so existing callers are unaffected)
    """
    # "a" mode matches the original behavior: repeated runs append rather
    # than overwrite.
    with open(filename, "a", encoding='utf-8') as f:
        f.write("排名 网站 \n")
        # enumerate replaces the original hand-rolled `num` counter.
        for num, site in enumerate(result_text, start=1):
            f.write(str(num) + " " + str(site) + "\n")
# Script entry point: crawl all 50 ranking pages, accumulating site names
# into the module-level `result` list (mutated as a global by
# parse_one_page), then dump the numbered list to a text file.
# NOTE: `result` must stay a module-level name — parse_one_page reads it
# as a global, so wrapping this in a main() function would break it.
if __name__ == '__main__':
    result = []
    crawl_each_url()
    write_to_file(result)
爬取内容如下 :
转载请注明原文地址:https://ipadbbs.8miu.com/read-54391.html