A Web Scraper for Hangzhou Polytechnic (杭州科技职业技术学院)

    Tech · 2022-07-13

    import requests
    from selenium import webdriver
    import time
    from lxml import etree
    from bs4 import BeautifulSoup
    from selenium.webdriver import ActionChains


    def response_spider(url, headers):
        # Fetch the page and return the decoded HTML.
        response = requests.get(url, headers=headers)  # headers must be passed by keyword, not positionally
        response.encoding = 'utf-8'
        print(response.text)
        return response.text


    def page_new_url(url, headers):
        # Unfinished in the original: the XPath extraction is still commented out.
        # html = etree.HTML(response_spider(url, headers))
        response_spider(url, headers)
        # url = html.xpath("")
        # return url
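    The page_new_url stub above was never finished. Below is a minimal sketch of one possible completion, reusing response_spider and the lxml import from the script; the XPath expression and the link-joining logic are my assumptions (the original left html.xpath("") blank), not something verified against the page:

        # Sketch only: a possible completion of page_new_url.
        def page_new_url_sketch(url, headers):
            html = etree.HTML(response_spider(url, headers))
            # Hypothetical selector: every link inside the news table (class "xx").
            hrefs = html.xpath('//table[@class="xx"]//a/@href')
            return ['http://www.hzpt.edu.cn/' + h.lstrip('./') for h in hrefs]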

    if __name__ == '__main__':
        url = 'http://www.hzpt.edu.cn/Newslist.php?pernums=0&cid=313&page=1'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
        # url_list = page_new_url(url, headers)
        # for i in url_list:
        #     print(i)

        # Drive a real browser so the rendered page can be parsed.
        driver = webdriver.Chrome(executable_path=r"C:\Users\Diana\Downloads\chromedriver\chromedriver.exe")
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Select the <tbody> of the news list table.
        # url_list_count = []
        url_list = soup.select('body > div > div.mainbody > table > tbody > tr:nth-child(2) > td:nth-child(3) > table > tbody > tr:nth-child(3) > td > table.xx > tbody')
        # Link-collection loop, left commented out in the original
        # (a urljoin-based variant is sketched after the script):
        # for i in url_list:
        #     links = i.find_all('a')
        #     for j in links:
        #         a = j['href']
        #         a = 'http://www.hzpt.edu.cn/' + ''.join(a).strip('./')
        #         url_list_count.append(a)
        #         print(a)  # e.g. http://www.hzpt.edu.cn/Newsdetail.php?id=37660

        # print(url_list_count)

        # Locate the first news link live in the browser (the BeautifulSoup
        # version is kept for reference):
        # ac = soup.select('body > div > div.mainbody > table > tbody > tr:nth-child(2) > td:nth-child(3) > table > tbody > tr:nth-child(3) > td > table.xx > tbody > tr:nth-child(1) > td:nth-child(2) > a')
        ac = driver.find_element_by_css_selector('body > div > div.mainbody > table > tbody > tr:nth-child(2) > td:nth-child(3) > table > tbody > tr:nth-child(3) > td > table.xx > tbody > tr:nth-child(1) > td:nth-child(2) > a')
        # Hover over the link and click it to open the article detail page.
        ActionChains(driver).move_to_element(ac).click(ac).perform()
        title = driver.find_element_by_tag_name('b')
        # time = driver.find_element_by_xpath("/html/body/div[2]/div[5]/table/tbody/tr[2]/td[3]/table/tbody/tr[2]/td/table/tbody/tr[3]/td/span/text()[1]")
        # tr[2]/td[2] is XPath syntax, not CSS, and 'tabel' was a typo; use
        # :nth-of-type() instead, and rename the variable so it no longer
        # shadows the imported time module.
        pub_time = driver.find_element_by_css_selector('body > div > div > table > tbody > tr:nth-of-type(2) > td:nth-of-type(2) > table > tbody > tr:nth-of-type(2) > td > table > tbody > tr:nth-of-type(3) > td > span')
        print(title.text)
        print(pub_time.text)
        driver.quit()
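    The commented-out loop in the main block builds absolute links by concatenating the site root with ''.join(a).strip('./'); note that strip('./') removes '.' and '/' characters from both ends of the string, not just the leading './'. A sketch of the same step using urllib.parse.urljoin instead; the 'table.xx a[href]' selector is my assumption, based on the table.xx class used in the script:

        # Sketch only: resolve relative hrefs such as './Newsdetail.php?id=37660'
        # against the site root with urljoin instead of manual stripping.
        from urllib.parse import urljoin

        def extract_links(page_source):
            soup = BeautifulSoup(page_source, 'lxml')
            return [urljoin('http://www.hzpt.edu.cn/', a['href'])
                    for a in soup.select('table.xx a[href]')]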
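    One fragile spot: after ActionChains clicks the first link, the script reads the detail page immediately and assumes the article opens in the same window. A sketch of a more defensive read using Selenium's explicit waits (WebDriverWait and expected_conditions are standard Selenium; the <b> headline tag comes from the script above):

        # Sketch only: wait for the headline <b> element to be present after the
        # click rather than reading it straight away.
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC

        def read_title(driver, timeout=10):
            wait = WebDriverWait(driver, timeout)
            return wait.until(EC.presence_of_element_located((By.TAG_NAME, 'b'))).text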