FB贴文采集

    技术2022-07-17  69

    本文使用 Selenium 驱动 Chrome 浏览器爬取 Facebook 贴文:先通过 Cookie(c_user、xs)登录,再滚动页面加载更多贴文并展开被折叠的评论,最后将页面源码保存到 posts.html。

    # -*- coding: utf-8 -*-
    """Collect Facebook posts by driving Chrome through Selenium.

    The spider logs in with session cookies, opens a target page, scrolls it to
    load more content, expands collapsed comments and saves the resulting page
    source to ``posts.html``.
    """
    import logging
    import time

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from webdriver_manager.chrome import ChromeDriverManager

    logger = logging.getLogger(__name__)


    class Selectors(object):
        """JavaScript snippets and XPath expressions used by the spider."""

        # NOTE(review): the XPath class names look tied to one specific Facebook
        # page layout -- confirm they still match the markup being scraped.
        selectors = {
            "height_script": "return document.body.scrollHeight",
            "scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
            "comment_section": ".//*[@class='commentable_item']",
            "more_comment_replies": ".//a[contains(@class,'_4sxc _42ft')]",
            "comment_see_more_link": ".//a[contains(@class,'_5v47 fss')]",
        }


    class FBSpider(object):
        """Selenium-based spider that saves a Facebook page with comments expanded."""

        def __init__(self, c_user, xs, total_scrolls, scroll_time, spider_url):
            """Store the crawl settings and build the login cookies.

            :param c_user: value for the ``c_user`` cookie.
            :param xs: value for the ``xs`` cookie.
            :param total_scrolls: number of scrolls after which scrolling stops.
                The limit is compared with ``==``, so a value that is never
                reached (e.g. a negative one) scrolls until the page height
                stops changing.
            :param scroll_time: seconds to wait for the page height to change
                after each scroll.
            :param spider_url: URL of the page to collect.
            """
            self.c_user = c_user
            self.xs = xs
            self.total_scrolls = total_scrolls
            self.scroll_time = scroll_time
            self.spider_url = spider_url
            self.selectors = Selectors().selectors
            self.driver = None  # created by cookie_login()
            self.cookies_list = [
                {"value": f"{self.c_user}", "name": "c_user", "domain": "facebook.com", "path": "/"},
                {"value": f"{self.xs}", "name": "xs", "domain": "facebook.com", "path": "/"},
            ]

        def cookie_login(self):
            """Start Chrome, open Facebook and install the login cookies.

            :return: None
            """
            options = Options()
            options.add_argument("--disable-notifications")
            options.add_argument("--disable-infobars")
            options.add_argument("--mute-audio")

            # NOTE(review): ``executable_path`` is deprecated in Selenium 4 in
            # favour of ``service=Service(...)``; it is kept so the script keeps
            # working on the Selenium versions it was written for.
            self.driver = webdriver.Chrome(
                executable_path=ChromeDriverManager().install(), options=options
            )

            login_url = "https://www.facebook.com"
            self.driver.get(login_url)
            time.sleep(5)
            # Cookies are added only after the first page load on the site.
            for cookie in self.cookies_list:
                self.driver.add_cookie(cookie)

        def run(self):
            """Run the crawl, write the page source to ``posts.html`` and quit Chrome.

            The browser is always shut down, even when the crawl fails, so no
            Chrome/chromedriver process is left behind.
            """
            try:
                self.work()
                with open('posts.html', 'wb') as f:
                    f.write(self.driver.page_source.encode('utf8'))
            finally:
                if self.driver is not None:
                    self.driver.quit()
                    self.driver = None

        def work(self):
            """Log in, open the target URL, scroll the page and expand comments."""
            print('开始登录Facebook')
            self.cookie_login()
            print('登录Facebook成功')
            self.driver.get(self.spider_url)
            time.sleep(5)
            print('开始滑动滚动条')
            self.sliding_scroll_bar(self.driver)
            print('开始将隐藏的评论逐一展开')
            self.expand_comments(self.driver)

        def sliding_scroll_bar(self, driver):
            """Scroll to the bottom of the page repeatedly.

            Stops once ``total_scrolls`` scrolls have been performed, or when the
            page height does not change within ``scroll_time`` seconds.

            :param driver: WebDriver instance to scroll.
            :return: None
            """
            height_script = self.selectors.get('height_script')
            scroll_script = self.selectors.get('scroll_script')

            current_scrolls = 0
            while True:
                if current_scrolls == self.total_scrolls:
                    return
                try:
                    old_height = driver.execute_script(height_script)
                    driver.execute_script(scroll_script)
                    # Poll every 0.05s until the document height differs from
                    # the height measured before scrolling.
                    WebDriverWait(driver, self.scroll_time, 0.05).until(
                        lambda d, previous=old_height: self.check_height(d, previous)
                    )
                except TimeoutException:
                    # Height never changed (or the script timed out): stop.
                    break
                current_scrolls += 1
                time.sleep(1)

        def check_height(self, driver, old_height):
            """Return True when the current document height differs from ``old_height``.

            :param driver: WebDriver instance to query.
            :param old_height: height measured before the last scroll.
            """
            new_height = driver.execute_script(self.selectors.get('height_script'))
            return new_height != old_height

        def _click_all(self, driver, selector_key):
            """Best-effort JavaScript click on every element matching a selector."""
            try:
                links = driver.find_elements(By.XPATH, self.selectors.get(selector_key))
            except WebDriverException as exc:
                logger.warning("Could not look up '%s' links: %s", selector_key, exc)
                return

            for link in links:
                try:
                    driver.execute_script("arguments[0].click();", link)
                except WebDriverException as exc:
                    # A single stale or unclickable link must not stop the rest.
                    logger.debug("Could not click a '%s' link: %s", selector_key, exc)

        def expand_comments(self, driver):
            """Expand collapsed comment replies and truncated comment bodies.

            Failures are logged and skipped; this step is best-effort.

            :param driver: WebDriver instance showing the page.
            :return: None
            """
            # Expand the "more replies" links of every post.
            self._click_all(driver, 'more_comment_replies')
            # Long comments are truncated behind a "see more" link; expand those.
            self._click_all(driver, 'comment_see_more_link')
    Processed: 0.011, SQL: 9