# 微博关键词搜索并爬取前40页内容与图片 (Search Weibo by keyword and scrape the first 40 result pages of posts and images)
"""
@author: tanderick
"""
import requests
import re
import os
import urllib.parse
import time
# --- Configuration -----------------------------------------------------------
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0)'}
# Search term; also used as the output directory name and the dump file name.
keyword = '简历'
# Root output directory for this keyword.
filepath = str(r'C:/weibo/' + keyword)
# URL-encoded keyword and the search URL every page request is built from.
kw = urllib.parse.quote(keyword)
s_url = 'https://s.weibo.com/weibo?q=' + kw + '&wvr=6&b=1&Refer=SWeibo_box'

PAGE_COUNT = 40        # number of result pages to fetch
PAGE_DELAY = 0.5       # seconds to pause between page requests
REQUEST_TIMEOUT = 10   # seconds before an HTTP request is abandoned

# --- Patterns ----------------------------------------------------------------
# One search-result card; everything about a post is looked up inside its card.
CARD_RE = re.compile(r'<!--card-wrap-->(.*?)<!--/card-wrap-->', re.S)
# Author link: group 1 is the profile path (uid), group 2 the nickname.
# The '?' before refer_flag is escaped so it no longer ends up inside group 1.
AUTHOR_RE = re.compile(
    r'<a href="//weibo\.com/(.*?)\?refer_flag=1001030103_" class=".*?" '
    r'target=".*?" nick-name="(.*?)" suda-data=".*?">.*?</a>'
)
# Post body paragraph (may span several lines, hence re.S).
CONTENT_RE = re.compile(
    r' <p class="txt" node-type="feed_list_content" nick-name=".*?">(.*?)</p>',
    re.S,
)
# Any HTML tag, including tags broken across lines.
TAG_RE = re.compile(r'<.*?>', re.S)
# JPEG references; both capture the URL up to (not including) the "jpg" suffix.
IMG_SRC_RE = re.compile(r'img src="(.*?)jpg"')
COVER_IMG_RE = re.compile(r'cover_img=(.*?)jpg')


def strip_tags(fragment):
    """Return *fragment* with every HTML tag removed.

    The original passed ``re.S`` as the positional ``count`` argument of
    ``re.sub``, so the flag was never applied; here it is part of the
    compiled pattern and tags spanning a newline are removed too.
    """
    return TAG_RE.sub('', fragment)


def picture_downloads(card_html):
    """Return ``(url, filename)`` pairs for every JPEG referenced in a card.

    ``<img src=...>`` pictures come first, then ``cover_img=`` pictures.
    Each URL is rebuilt as ``http:`` + scheme-less address + ``jpg``; the
    file name is the last path segment of that same URL (the original took
    the cover-image file name from the wrong variable).
    """
    downloads = []
    for stem in IMG_SRC_RE.findall(card_html) + COVER_IMG_RE.findall(card_html):
        # Drop an explicit scheme so "//host/..", "http://.." and
        # "https://.." all normalize to the same form.
        schemeless = re.sub(r'^https?:', '', stem)
        downloads.append(
            ('http:' + schemeless + 'jpg', schemeless.split('/')[-1] + 'jpg')
        )
    return downloads


def parse_posts(html):
    """Extract posts from the concatenated search-result HTML.

    Returns a list of ``(uid, nickname, text, downloads)`` tuples, where
    *text* is the post body with tags stripped and *downloads* is the result
    of :func:`picture_downloads` for that post's card.

    Author, body and pictures are all taken from the same card. The original
    matched each of them over the whole document and joined the three lists
    by index, which misaligned (or raised IndexError) as soon as one card
    lacked an author link or a body paragraph. Such cards are skipped here.
    """
    posts = []
    for card in CARD_RE.findall(html):
        author = AUTHOR_RE.search(card)
        body = CONTENT_RE.search(card)
        if author is None or body is None:
            continue
        uid, nickname = author.groups()
        posts.append(
            (uid, nickname, strip_tags(body.group(1)), picture_downloads(card))
        )
    return posts


def fetch_search_pages(dump_path, pages=PAGE_COUNT):
    """Download *pages* result pages and append their HTML to *dump_path*.

    The file is opened in append mode, as before, so HTML from earlier runs
    stays in the dump. A page that cannot be fetched is reported and skipped.
    """
    # NOTE(review): result pages are assumed to be numbered from 1; the
    # original requested page=0..39 -- confirm against the site.
    for page in range(1, pages + 1):
        try:
            response = requests.get(
                s_url + '&page=' + str(page),
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as err:
            # '下载失败' = "download failed", matching the script's messages.
            print(str(page) + '下载失败', err)
            continue
        html = urllib.parse.unquote(response.text)
        print(page)
        with open(dump_path, 'a', encoding="utf-8") as dump:
            dump.write(html)
        time.sleep(PAGE_DELAY)  # pause between consecutive page requests


def save_post(uid, nickname, text, downloads):
    """Write one post's text and pictures into ``<filepath>/<nickname>/``.

    The text is appended to ``微博内容.txt`` so several posts by the same
    author accumulate in one file. Pictures that fail to download (network
    error or non-2xx status) are reported and skipped instead of being
    written to disk as broken files.
    """
    out_filepath = filepath + '/' + nickname
    os.makedirs(out_filepath, exist_ok=True)
    with open(out_filepath + '/微博内容.txt', 'a', encoding="utf-8") as out:
        out.write(str((uid, nickname)) + '\r\n' + text)
    for url, filename in downloads:
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print(url + '下载失败', err)
            continue
        with open(out_filepath + '/' + filename, 'wb') as out:
            out.write(response.content)
        print(url + '下载完成')


def main():
    """Fetch the search pages, then save every post's text and pictures."""
    # makedirs also creates the parent "C:/weibo" directory when missing.
    os.makedirs(filepath, exist_ok=True)
    dump_path = filepath + '/' + keyword + '.txt'
    fetch_search_pages(dump_path)
    with open(dump_path, 'r', encoding="utf-8") as dump:
        html = dump.read()
    for uid, nickname, text, downloads in parse_posts(html):
        save_post(uid, nickname, text, downloads)


if __name__ == '__main__':
    main()
# 转载请注明原文地址:https://ipadbbs.8miu.com/read-53369.html