python利用selenium爬取网易云音乐

    技术2022-07-11  103

    这里爬取的是网易云音乐的歌单,大约5w个。每个歌单可以继续爬取对应的歌曲(暂未实现)

    from selenium import webdriver import time class WY: def __init__(self): self.url = "https://music.163.com/#/discover/playlist/" self.driver = webdriver.Chrome() def all_url_1(self): ''' 定义各个大分类的名称和url ''' self.driver.get(self.url) self.driver.switch_to_frame("contentFrame") # self.driver.find_element_by_xpath("./html/body/div[@class='g-bd']/div[@class='g-wrap p-pl f-pr']/div/h3/a").click() # time.sleep(1) reqs = self.driver.find_elements_by_xpath("./html/body/div[@class='g-bd']/div[@class='g-wrap p-pl f-pr']/div[@class='n-sltlyr d-flag ']/div[@class='bd']/dl/dd/a") list_1 = [] for req in reqs: dict_a = {} dict_a["name"] = req.get_attribute("data-cat") dict_a['url'] = req.get_attribute("href") list_1.append(dict_a) # print(dict_a) return list_1 #------------------------------------------------------------------------------------- def page_all(self,url): self.driver.get(url) self.driver.switch_to_frame('contentFrame') res_1 = self.driver.find_elements_by_xpath("./html/body/div/div/ul/li/p/a") list_5 = [] for cd in res_1: dict_2 = {} # print(cd.text) dict_2['name'] = cd.text dict_2['url'] = cd.get_attribute('href') # list_5.append(dict_2) list_5.append(dict_2) print("lis_5的列表中有{}个大范围".format(len(list_5))) # time.sleep(50) for ss in list_5: self.save_1(ss) time.sleep(1) next_url = self.driver.find_element_by_xpath("./html/body/div[@class='g-bd']/div[@class='g-wrap p-pl f-pr']/div/div/a[@class='zbtn znxt']").get_attribute("href") a = 2 # 下一页 while True: try: self.driver.get(next_url) self.driver.switch_to_frame('contentFrame') res_1 = self.driver.find_elements_by_xpath("./html/body/div/div/ul/li/p/a") list_6 = [] for cd in res_1: dict_2 = {} # print(cd.text) dict_2['name'] = cd.get_attribute("title") dict_2['url'] = cd.get_attribute('href') dict_2["anchor"] = cd.text # print(dict_2) list_6.append(dict_2) # time.sleep(2) for ss in list_6: self.save_1(ss) next_session = self.driver.find_element_by_xpath("./html/body/div[@class='g-bd']/div[@class='g-wrap p-pl 
f-pr']/div/div/a[@class='zbtn znxt']") next_url = next_session.get_attribute("href") print("下载第{}页".format(a)) a+=1 except: break return list_5 #--------------------------------------------- def save_1(self,con): file1 = open("E://a.csv", 'a+') try: file1.write(con['name']+","*5+str(con['url'])+","*6+con['anchor']+'\n') file1.close() except: pass #------------------------------------------------------ def run(self): list_1 = self.all_url_1() # with open('E://ss.txt', 'a+') as f: for li in list_1: list_5 = self.page_all(li["url"]) # print(list_5) time.sleep(0.5) # print(list_1) if __name__ == '__main__': a = WY() a.run()

    Processed: 0.016, SQL: 9