
from selenium import webdriver
from selenium.webdriver.common.by import By
import time


class Douyu(object):
    def __init__(self):
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        # self.data_list = []

    def parse_data(self):
        # Wait until the page has mostly loaded before locating elements,
        # otherwise the lookups can fail because the data is not rendered yet.
        time.sleep(5)
        # Collect every room card on the current page into a list.
        page = self.driver.find_elements(
            By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(page))
        data_list = []
        time.sleep(5)
        for tmp in page:
            temp = {}
            # Using the located card as the root, pick out the fields we need:
            # category, streamer, room title and viewer count.
            temp['tp'] = tmp.find_element(By.XPATH, './a[1]/div[2]/div[1]/span').text
            temp['owner'] = tmp.find_element(By.XPATH, './a[1]/div[2]/div[1]/h3').text
            temp['room'] = tmp.find_element(By.XPATH, './a[1]/div[2]/div[2]/h2').text
            temp['num'] = tmp.find_element(By.XPATH, './a[1]/div[2]/div[2]/span').text
            # Append the dict for this room to the result list.
            data_list.append(temp)
        return data_list

    # Save the data.
    def save_data(self, data_list):
        # For now, just iterate over the list and print each record.
        for data in data_list:
            print(data)

    def run(self):
        # 1. url  2. driver  3. get
        self.driver.get(self.url)
        while True:
            # 4. parse_data
            # Scroll down so the whole current page is loaded; this avoids
            # failures when locating later elements and also prepares the
            # "next page" button for the pagination step.
            js = 'scrollTo(0,100000)'
            self.driver.execute_script(js)
            data_list = self.parse_data()
            # 5. save_data
            self.save_data(data_list)
            try:
                # Click "next page" (下一页) while it is available; otherwise stop the loop.
                self.driver.find_element(
                    By.XPATH, '//*[contains(text(),"下一页")]').click()
            except Exception:
                break


if __name__ == "__main__":
    douyu = Douyu()
    douyu.run()
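The fixed time.sleep(5) calls are a blunt instrument: they waste time on a fast connection and may still be too short on a slow one. Below is a minimal sketch of swapping them for Selenium's explicit waits; only the room-card XPath is taken from the scraper above, while the helper name and the 10-second timeout are assumptions of mine, not part of the original post.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Hypothetical helper, reusing the room-card XPath from the scraper above.
ROOM_CARD_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li/div'

def wait_for_room_cards(driver, timeout=10):
    # Block until at least one room card is present in the DOM
    # (or raise TimeoutException), then return all matching elements.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, ROOM_CARD_XPATH))
    )

With this helper, parse_data could start with page = wait_for_room_cards(self.driver) instead of sleeping and then calling find_elements.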
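save_data currently only prints each record. As one possible extension (not in the original code), the collected list could be written to a JSON file instead; the function name and the output file name below are hypothetical.

import json

def save_data_to_json(data_list, path='douyu_rooms.json'):
    # Write the collected room dicts to a UTF-8 JSON file;
    # ensure_ascii=False keeps the Chinese room titles readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=2)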