python crawler - using proxies to boost blog article view counts


    import random
    import logging
    import requests
    from lxml import etree
    from multiprocessing.dummy import Pool as ThreadPool

    logging.basicConfig(level=logging.DEBUG)

    TIME_OUT = 15
    proxies = []
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,pl;q=0.7,en;q=0.6',
        'Connection': 'keep-alive'
    }
    PROXY_URL = 'http://www.66ip.cn/areaindex_1/1.html'  # free proxy listing site


    # Scrape usable IPs and ports from the proxy listing page
    def GetProxies():
        global proxies
        try:
            text = requests.get(PROXY_URL, headers=header).text
        except requests.RequestException:
            logging.error('proxy failed!')
            return
        html = etree.HTML(text)
        ips = html.xpath("//div[@id='footer']/div/table/tr/td[1]/text()")
        ports = html.xpath("//div[@id='footer']/div/table/tr/td[2]/text()")
        ips = ips[1:-1]      # drop the header and footer rows of the table
        ports = ports[1:-1]
        for i in range(len(ips)):
            # requests expects a scheme-keyed dict such as {'http': 'http://ip:port'}
            proxies.append({'http': 'http://{}:{}'.format(ips[i], ports[i])})


    # Collect article links from the blog index page (XPath is specific to the target blog layout)
    def GetArticles(url):
        res = GetRequest(url, prox=None)
        html = etree.HTML(res.text)
        urls_list = html.xpath("//*[@id='mainBox']/main/div/div/h4/a/@href")
        return urls_list


    def GetRequest(url, prox):
        return requests.get(url, headers=header, proxies=prox, timeout=TIME_OUT)


    def VisitWithProxy(url):
        # Free proxies die often, so a failed request is logged and skipped instead of crashing the worker
        proxy = random.choice(proxies)
        try:
            GetRequest(url, proxy)
        except requests.RequestException:
            logging.warning('Visit via proxy {} failed'.format(proxy))


    def VisitLoop(url):
        for i in range(count):
            logging.debug('Visiting:\t{}\tfor {} times'.format(url, i + 1))
            VisitWithProxy(url)


    if __name__ == '__main__':
        GetProxies()
        logging.debug('We got {} proxies'.format(len(proxies)))
        if not proxies:
            logging.error('No usable proxies, aborting')
            quit()
        BlogUrl = input('Blog Address: ').strip()
        logging.debug('Gonna visit {}'.format(BlogUrl))
        try:
            count = int(input('Visit Count: '))
        except ValueError:
            logging.error('Arg error')
            quit()
        if count <= 0 or count > 200:
            logging.error('Count illegal')
            quit()
        article_list = GetArticles(BlogUrl)
        if len(article_list) == 0:
            logging.error('No articles found, error!')
            quit()
        # Visit the articles concurrently with a thread pool (at least one worker)
        pool = ThreadPool(max(1, len(article_list) // 4))
        pool.map(VisitLoop, article_list)
        pool.close()
        pool.join()
        logging.debug('Done!')
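
Free proxy lists like the one scraped above are frequently stale, so many of the proxied visits may time out or fail outright. Below is a minimal sketch of a liveness filter that could be called right after GetProxies(); it reuses the script's globals (proxies, header, TIME_OUT), and the helper name FilterProxies and the test URL are assumptions for illustration, not part of the original script.

    # Hypothetical helper (not in the original script): keep only proxies that can
    # fetch a known-good test URL within TIME_OUT seconds.
    def FilterProxies(test_url='http://www.baidu.com'):  # test_url is an assumed choice
        global proxies
        alive = []
        for prox in proxies:
            try:
                requests.get(test_url, headers=header, proxies=prox, timeout=TIME_OUT)
                alive.append(prox)
            except requests.RequestException:
                logging.debug('Dropping dead proxy: {}'.format(prox))
        proxies = alive

Calling FilterProxies() before the thread pool starts trades a slower startup for fewer wasted visits, since every surviving proxy has already answered at least one request.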