Python爬虫爬取ok资源网电影播放地址

技术2022-07-11 115

# Crawl movie playback addresses from the OK resource site (www.okzy.co).
#
# Known entry points on the site:
#   search:   http://okzy.co/index.php?m=vod-search&wd={keyword}&submit=search
#   category: http://www.okzy.co/?m=vod-type-id-{1-34}.html
#   listing:  http://www.okzy.co/?m=vod-index-pg-{1-1110}.html
#
# Flow: request -> response -> parse -> output.
# Listing page -> each movie's detail page -> (source name, playback URL)
# pairs -> one printed table per movie title.
import requests
from lxml import etree
# Table rendering; install with: pip install prettytable
from prettytable import PrettyTable

host = "http://www.okzy.co"
rooturl = "/?m=vod-index-pg-{}.html".format(1)

# Seconds to wait on a single HTTP request. Without a timeout, requests
# blocks indefinitely when the server stops responding.
REQUEST_TIMEOUT = 10


def _parse_play_entries(raw_entries):
    """Turn raw "name$url" strings into a list of (name, url) tuples.

    Entries without a "$" separator (for example whitespace-only text
    nodes between tags) are skipped instead of raising IndexError.
    """
    pairs = []
    for raw in raw_entries:
        parts = raw.split("$")
        if len(parts) < 2:
            continue
        pairs.append((parts[0], parts[1]))
    return pairs


def _fetch_play_entries(detail_url):
    """Fetch one movie detail page and return its (name, url) play entries.

    Returns an empty list when the page cannot be fetched or does not
    answer with HTTP 200, so a single bad page does not abort the crawl.
    """
    try:
        detail = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("详情页请求失败：{}（{}）".format(detail_url, exc))
        return []
    if detail.status_code != 200:
        return []
    # Decode the same way as the listing page; otherwise requests may fall
    # back to ISO-8859-1 and garble the Chinese source names.
    # NOTE(review): assumes detail pages are UTF-8 like the listing page.
    detail.encoding = 'utf-8'
    detail_tree = etree.HTML(detail.text)
    raw_entries = detail_tree.xpath("//div[@class='vodplayinfo']//ul/li/text()")
    return _parse_play_entries(raw_entries)


# Request the listing (entry) page.
response = requests.get(host + rooturl, timeout=REQUEST_TIMEOUT)
response.encoding = 'utf-8'

if response.status_code == 200:
    # Listing page fetched successfully; start crawling.
    print("==========爬虫工作开始==========")
    page_index_xp = etree.HTML(response.text)

    # Movie titles and the matching detail-page links, in document order.
    page_index_xp_title = page_index_xp.xpath(
        "//div[@class='xing_vb']/ul/li/span[@class='xing_vb4']/a/text()")
    page_index_xp_titleurl = page_index_xp.xpath(
        "//div[@class='xing_vb']/ul/li/span[@class='xing_vb4']/a/@href")

    for p_i_title, p_i_href in zip(page_index_xp_title, page_index_xp_titleurl):
        # Outer table: a single column headed by the movie title.
        table_index = PrettyTable(['《{}》'.format(p_i_title.strip())])
        # Inner table: one row per playback source.
        table_info = PrettyTable(['资源名称', '播放地址'])

        # Second-level request: the detail page holds the playback addresses.
        for piname, pilink in _fetch_play_entries(host + p_i_href):
            table_info.add_row([piname, pilink])
            # TODO: persist results (text file / sqlite) or generate an HTML
            # page embedding each link in a <video> element.

        table_index.add_row([table_info])
        print(table_index)
else:
    print("==========爬虫不能正确工作，原因：{}==========".format(response.status_code))

Processed: 0.011, SQL: 9