Scraping the first 40 pages of posts and images from a Weibo keyword search



The script below submits a keyword search on s.weibo.com, appends the first 40 pages of result HTML to a text file, then uses regular expressions to pull out each poster's UID and nickname, the post text, and image URLs, downloading the images into per-user folders:

    # -*- coding: utf-8 -*-
    """
    @author: tanderick
    """
    import requests
    import re
    import os
    import urllib.parse
    import time

    # Request headers (a stray ')' at the end of the User-Agent string is removed here)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'}

    # Search keyword ('简历' means "resume")
    keyword = '简历'

    # Create a folder named after the keyword (makedirs also creates C:/weibo if missing)
    filepath = r'C:/weibo/' + keyword
    os.makedirs(filepath, exist_ok=True)

    # Fetch 40 pages of search results and append the raw HTML to one txt file
    kw = urllib.parse.quote(keyword)
    s_url = 'https://s.weibo.com/weibo?q=' + kw + '&wvr=6&b=1&Refer=SWeibo_box'
    for i in range(1, 41):  # result pages are numbered from 1
        html = requests.get(s_url + '&page=' + str(i), headers=headers).text
        html = urllib.parse.unquote(html)
        print(i)
        with open(filepath + '/' + keyword + '.txt', 'a', encoding='utf-8') as f:
            f.write(html)
        time.sleep(0.5)  # throttle requests

    # Re-read the saved pages
    with open(filepath + '/' + keyword + '.txt', 'r', encoding='utf-8') as h:
        html = h.read()

    # Extract (uid, nickname) pairs, post bodies, and the HTML of each result card
    # (the '?' before refer_flag is escaped; left unescaped it acts as a quantifier)
    uids = re.findall(r'<a href="//weibo.com/(.*?)\?refer_flag=1001030103_" class=".*?" target=".*?" nick-name="(.*?)" suda-data=".*?">.*?</a>', html)
    contents = re.findall(r'<p class="txt" node-type="feed_list_content" nick-name=".*?">(.*?)</p>', html, re.S)
    pic_id = re.findall(r'<!--card-wrap-->(.*?)<!--/card-wrap-->', html, re.S)

    for i in range(len(uids)):
        uid, nickname = uids[i]
        out_filepath = filepath + '/' + nickname
        os.makedirs(out_filepath, exist_ok=True)
        # Save UID, nickname, and the post text with HTML tags stripped
        # (flags=re.S: the original passed re.S positionally, where it is read as `count`)
        with open(out_filepath + '/微博内容.txt', 'a', encoding='utf-8') as f:
            f.write(str(uids[i]) + '\r\n' + re.sub('<.*?>', '', contents[i], flags=re.S))
        # Image URLs: inline <img> tags plus video cover images
        pic_urls1 = re.findall(r'img src="(.*?)jpg"', pic_id[i])
        pic_urls2 = re.findall(r'cover_img=(.*?)jpg', pic_id[i])
        for url1 in pic_urls1:
            url1 = re.sub(r'https:', '', str(url1))
            filename = url1.split('/')[-1]
            response = requests.get('http:' + url1 + 'jpg', headers=headers)
            with open(out_filepath + '/' + filename + 'jpg', 'wb') as f:
                f.write(response.content)
            print('http:' + url1 + 'jpg downloaded')
        for url2 in pic_urls2:
            url2 = re.sub(r'https:', '', str(url2))
            filename = url2.split('/')[-1]  # fixed: the original reused url1 here
            response = requests.get('http:' + url2 + 'jpg', headers=headers)
            with open(out_filepath + '/' + filename + 'jpg', 'wb') as f:
                f.write(response.content)
            print('http:' + url2 + 'jpg downloaded')
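Two caveats worth noting. First, nicknames become folder names, and Windows rejects names containing `\ / : * ? " < > |`, so `os.makedirs` can fail on some users. A minimal sanitizing sketch (the `safe_name` helper is hypothetical, not part of the original script) that could wrap `nickname` before the folder is created:

    import re

    def safe_name(name):
        """Replace characters Windows forbids in file and folder names."""
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    # e.g. out_filepath = filepath + '/' + safe_name(nickname)

Second, regex parsing of HTML is brittle. A sketch of the same post-text extraction with BeautifulSoup (an added dependency, `pip install beautifulsoup4`, not used above), assuming the saved pages carry the same markup the regexes target:

    from bs4 import BeautifulSoup

    with open(filepath + '/' + keyword + '.txt', 'r', encoding='utf-8') as h:
        soup = BeautifulSoup(h.read(), 'html.parser')

    # Same element the `contents` regex matches: <p class="txt" node-type="feed_list_content">
    for p in soup.find_all('p', attrs={'node-type': 'feed_list_content'}):
        print(p.get_text(strip=True))

Either change leaves the rest of the pipeline intact; the download loops only need a valid folder name and the extracted URLs.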