批量爬取B站收藏夹封面 Python 爬虫多线程

技术2022-07-16 81

批量爬取B站收藏夹封面

一个视频要想获得较高的点击量，精美的封面和吸引人的标题是必不可少的因素。有的封面让人很有收藏的欲望，因此我开始用百度搜索如何获取B站视频的封面，发现一般都是根据BV号或者AV号来获取视频封面。这样做有一个缺点，就是效率太低：每个视频都要单独复制BV号进去，才能获取到封面。

Python根据AV号获取封面源码：源码来自： https://zhuanlan.zhihu.com/p/63782740

import re

import requests

# Locates the cover URL inside the video page: the value of the `content`
# attribute that directly follows `itemprop="image"` (the same marker the
# original string-slicing code searched for).
_COVER_PATTERN = re.compile(r'itemprop="image"\s+content="([^"]*)"')


def getcover():
    """Prompt for an AV number, fetch the video page and print its cover URL.

    Returns:
        The cover URL as a string, or None when the input is not a number,
        the request fails, or the page contains no cover tag. In each of
        those cases a short explanation is printed instead of a URL.
    """
    avnum = input("please input the av number of your video:").strip()
    # Accept both "170001" and "av170001".
    if avnum.lower().startswith("av"):
        avnum = avnum[2:]
    if not avnum.isdigit():
        print("invalid av number: digits expected")
        return None

    url = "https://www.bilibili.com/video/av" + avnum
    # Custom headers work around the site's anti-crawler check.
    headers = {
        'Host': 'www.bilibili.com',
        'User-Agent': 'Chrome/73.0.3683.103'
    }
    try:
        # A timeout keeps the script from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as err:
        print("failed to fetch {}: {}".format(url, err))
        return None

    match = _COVER_PATTERN.search(response.text)
    if match is None:
        # Without this check a missing tag would print an arbitrary slice
        # of the page instead of a URL.
        print("no cover found in the page for av" + avnum)
        return None

    cover = match.group(1)
    print(cover)
    return cover


if __name__ == "__main__":
    getcover()

正所谓懒是第一生产力，因此我用Python写了一个批量爬取B站收藏夹封面的工具。工具的使用方式很简单：双击打开软件后，输入收藏夹的ID和要获取的页数，就可以开始爬取了。

在使用此工具前，需要先安装you-get库。安装方式很简单：在配置好Python环境变量的情况下，按Win+R打开“运行”，输入cmd打开命令提示符，然后输入下面的命令即可安装you-get库：

pip3 install you-get

打开cmd输入you-get -h并按回车，如果没有报错，则表示安装成功。另外，如果直接运行下面的工具源码，还需要requests库，未安装时可执行pip3 install requests。

展示：

工具源码：代码部分功能参考了此文章：https://zhuanlan.zhihu.com/p/84559073

from concurrent.futures import ThreadPoolExecutor
import threading
import requests
import json
import os
import re
import subprocess

# Favorites-list API endpoint. {media_id} is the favorites ID and {pn} the
# 1-based page number; the remaining query parameters are copied verbatim
# from the original request URL.
_PAGE_URL = ("https://api.bilibili.com/medialist/gateway/base/spaceDetail"
             "?media_id={media_id}&pn={pn}&ps=20&keyword=&order=mtime"
             "&type=0&tid=0&jsonp=jsonp")

# Seconds to wait for the API before giving up on a page.
_REQUEST_TIMEOUT = 10

# Directory every cover is downloaded into.
_OUTPUT_DIR = "./output/"

# Matches every character that is NOT a CJK ideograph, an ASCII letter or a
# digit. The original class contained extra '^' characters, which made a
# literal caret an allowed character; they are removed here.
_NAME_FILTER = re.compile("[^\u4e00-\u9fa5a-zA-Z0-9]")

# Title the favorites API reports for entries that are no longer available.
_INVALID_TITLE = '已失效视频'


def parseUrl(id, page):
    """Build the API URLs for the first `page` pages of a favorites list.

    Args:
        id: Favorites (media) ID. The name shadows the builtin but is kept
            because it is part of the function's interface.
        page: Number of pages to fetch.

    Returns:
        A list of URL strings for pages 1..page. Page 1 is always included,
        even when `page` is zero or negative.
    """
    last_page = max(page, 1)
    return [_PAGE_URL.format(media_id=id, pn=pn)
            for pn in range(1, last_page + 1)]


class getFavorites:
    """Fetch the titles and cover URLs listed on a set of favorites pages.

    After construction, `title[i]` and `cover[i]` describe the same video.
    Entries follow the order of the given URLs, not the order in which the
    worker threads happen to finish.
    """

    # Guards appends made by run(); shared, since run() may be handed lists
    # that other threads are writing to.
    lock = threading.Lock()

    def __init__(self, url):
        """Download every page in `url` (an iterable of API URLs)."""
        self.url = url
        # Per-instance lists: as class attributes they were shared by every
        # instance, so a second getFavorites() kept the first one's entries.
        self.title = []
        self.cover = []

        page_urls = list(url)
        # One private pair of lists per page, merged in page order below.
        pages = [([], []) for _ in page_urls]
        # Up to 10 pages are fetched concurrently; leaving the `with` block
        # waits for all of them.
        with ThreadPoolExecutor(max_workers=10) as threadPool:
            futures = [
                threadPool.submit(self.run, page_url, titles, covers)
                for page_url, (titles, covers) in zip(page_urls, pages)
            ]

        for page_url, future, (titles, covers) in zip(page_urls, futures, pages):
            error = future.exception()
            if error is not None:
                # Report the failure instead of losing it inside the future.
                print("获取收藏夹信息失败：" + page_url + " (" + str(error) + ")")
                continue
            self.title.extend(titles)
            self.cover.extend(covers)

    def run(self, url, title, cover):
        """Fetch one API page and append its titles and covers to the lists.

        Args:
            url: API URL of a single favorites page.
            title: List that receives the video titles.
            cover: List that receives the matching cover URLs.

        Raises:
            requests.RequestException: The request failed or returned an
                HTTP error status.
            ValueError: The response body is not valid JSON.
        """
        html = requests.get(url, timeout=_REQUEST_TIMEOUT)
        html.raise_for_status()
        # Force UTF-8 so titles are not decoded with a guessed charset.
        html.encoding = 'UTF-8'
        res = json.loads(html.text)

        # 'data' or 'medias' may be missing or null; treat that as an empty
        # page (presumably an invalid ID or a page past the end).
        medias = (res.get('data') or {}).get('medias') or []
        found = [(media['title'], media['cover']) for media in medias]

        # Append the whole page under one lock so title and cover stay
        # aligned even if the lists are shared between threads.
        with self.lock:
            for video_title, video_cover in found:
                title.append(video_title)
                cover.append(video_cover)


class downloadCover:
    """Download each cover with you-get and rename it after its video title."""

    def __init__(self, title, cover):
        """Download every cover that is not already present in ./output.

        Args:
            title: Video titles.
            cover: Cover URLs; `cover[i]` belongs to `title[i]`.
        """
        os.makedirs(_OUTPUT_DIR, exist_ok=True)

        claimed_targets = set()   # final file names handed out in this run
        claimed_sources = set()   # download names already scheduled
        futures = []
        with ThreadPoolExecutor(max_workers=10) as threadPool:
            # zip() pairs by position; the original title.index() lookup
            # returned the first match and so reused one cover for every
            # video sharing a title.
            for video_title, link in zip(title, cover):
                if not link:
                    continue
                # Name of the downloaded file: the last path segment of the
                # URL. Equal to the original 'archive/' slicing for the
                # usual .../archive/<file> URLs. Assumes, like the original,
                # that you-get saves the file under this name.
                d_name = link.rsplit('/', 1)[-1]
                # splitext keeps extensions of any length (.jpeg, .webp).
                stem, tail = os.path.splitext(d_name)

                r_name = _NAME_FILTER.sub('', video_title)
                if r_name == _INVALID_TITLE:
                    continue
                if not r_name:
                    # Nothing left after filtering: keep the download name.
                    r_name = stem

                if d_name in claimed_sources:
                    continue
                if (os.path.exists(_OUTPUT_DIR + d_name)
                        or os.path.exists(_OUTPUT_DIR + r_name + tail)):
                    continue

                # Give videos with the same title distinct file names so
                # one cover does not overwrite (or fail to replace) another.
                candidate, suffix = r_name, 2
                while candidate + tail in claimed_targets:
                    candidate = "{}_{}".format(r_name, suffix)
                    suffix += 1
                claimed_targets.add(candidate + tail)
                claimed_sources.add(d_name)

                futures.append(
                    threadPool.submit(self.run, link, d_name, candidate, tail))

        for future in futures:
            error = future.exception()
            if error is not None:
                print("下载封面时出错：" + str(error))

    def run(self, cover, d_name, r_name, tail):
        """Download one cover and rename it to `r_name + tail`.

        Args:
            cover: Cover URL.
            d_name: File name the download is expected to have.
            r_name: Sanitised title used as the final base name.
            tail: File extension including the leading dot.
        """
        try:
            # An argument list (no shell) so characters in the remote URL
            # can never be interpreted as shell syntax.
            result = subprocess.run(['you-get', '-o', _OUTPUT_DIR, cover])
        except FileNotFoundError:
            print("未找到 you-get，请先执行 pip3 install you-get")
            return
        if result.returncode != 0:
            print("下载失败：" + cover)
            return

        source = _OUTPUT_DIR + d_name
        target = _OUTPUT_DIR + r_name + tail
        if source == target:
            return
        try:
            os.rename(source, target)
        except OSError as error:
            print("重命名失败：" + d_name + " (" + str(error) + ")")


def _ask_int(prompt, minimum):
    """Prompt until the user enters an integer that is at least `minimum`."""
    while True:
        try:
            value = int(input(prompt).strip())
        except ValueError:
            print("请输入数字")
            continue
        if value >= minimum:
            return value
        print("请输入不小于 " + str(minimum) + " 的数字")


def main():
    """Interactive entry point: ask for a favorites ID, list it, download it."""
    print("欢迎使用\"批量爬取B站收藏夹封面\"工具")
    print("收藏夹地址示例：https://space.bilibili.com/3334737/favlist?fid=291299537")
    print("收藏夹ID示例：291299537")
    fav_id = _ask_int("请输入收藏夹ID:", 0)
    page = _ask_int("请输入要获取的页数 [1-n页]:", 1)

    print("正在获取收藏夹信息. . .")
    api = parseUrl(fav_id, page)
    col = getFavorites(api)

    # enumerate() numbers entries by position; list.index() gave the same
    # number to every video sharing a title.
    for number, video_title in enumerate(col.title, start=1):
        print(str(number) + '&' + video_title)
    if not col.title:
        print("未获取到任何视频信息")

    input("→请按回车键开始下载")
    print('正在获取下载地址. . .')
    downloadCover(col.title, col.cover)
    print("下载完成！")
    input("请按回车键继续. . .")


if __name__ == "__main__":
    main()

已编译好的可执行文件(EXE)：链接: https://pan.baidu.com/s/1qxeuZKjwtdrbpCc2X5VTww 提取码: gygq 本程序已更新至2.0版本，点此传送

Processed: 0.012, SQL: 9

批量爬取B站收藏夹封面 Python 爬虫 多线程

批量爬取B站收藏夹封面

批量爬取B站收藏夹封面 Python 爬虫多线程