Target URL to scrape: http://www.zdqx.com/qingchun/index.html
Without further ado, here's the code:
import requests
import re
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'referer': 'http://www.zdqx.com/pcbz/70270.html',
    'cookie': 'Hm_lvt_303a32038183efa6d8efec90c0031b87=1581472898; Hm_lpvt_303a32038183efa6d8efec90c0031b87=1581472912'
}

def get_urls(url, num_page):
    response = requests.get(url=url, headers=headers)
    response.encoding = response.apparent_encoding
    # grab the listing block, then pull each album's link, title, and picture count
    result = re.findall('<div class="listbox">(.*?)</div>', response.text, re.S)
    urlsTitleList = re.findall('<a href="(.*?)" target="_blank" title="(.*?)">', str(result), re.S)
    # the Chinese in this pattern must match the site's HTML, so it is kept as-is
    pageNumList = re.findall('<em class="page_num">共(.*?)张</em>', str(result), re.S)
    for urlAndTitle, num in zip(urlsTitleList, pageNumList):
        url = 'http:' + urlAndTitle[0]
        title = urlAndTitle[1]
        num = int(num)
        for page in range(1, num + 1):
            # the first picture uses the album URL itself; later ones append _2, _3, ...
            if page == 1:
                newUrl = url
            else:
                newUrl = url.replace('.html', '_%s.html' % page)
            resp = requests.get(newUrl, headers=headers)
            resp.encoding = resp.apparent_encoding
            div_main = re.findall('<div class="main_center_img" style=" min-height: 745px;">(.*?)</div>', resp.text, re.S)
            url_alt = re.findall('<img src="(.*?)"', str(div_main), re.S)  # list of length 3; index 1 is the wallpaper
            _url = 'http:' + url_alt[1]
            imgTitle = title + str(page)
            savedata(_url, imgTitle, num_page)

def savedata(url, title, num_page):
    path = 'wallpapers/page_{}'.format(num_page)
    if not os.path.exists(path):
        os.makedirs(path)  # creates nested directories
    response = requests.get(url, headers=headers)
    with open(path + '/' + title + '.jpg', mode="wb") as f:
        f.write(response.content)
        print(title + ' saved!')

def download_page(num_page):
    if num_page == 1:
        url = 'http://www.zdqx.com/qingchun/index.html'
    else:
        url = 'http://www.zdqx.com/qingchun/index_' + str(num_page) + '.html'
    get_urls(url, num_page)
    print('Page ' + str(num_page) + ' finished!')

def main_run():
    want_page = input("Which page do you want to scrape? (1-42). Enter 'all' to scrape all ~10,000 HD wallpapers:\n")
    if want_page == 'all':
        for page in range(1, 43):  # pages 1 through 42
            download_page(page)
        return  # don't fall through and try int('all')
    try:
        num_page = int(want_page)
    except ValueError:
        print("Invalid input: enter a plain number or the string 'all', please try again")
        main_run()
        return  # num_page is undefined on this path
    if num_page < 1 or num_page > 42:
        print("Invalid input: the page number must be between 1 and 42, please try again")
        main_run()
    else:
        download_page(num_page)

if __name__ == '__main__':
    main_run()

I'm not very familiar with multithreading yet, so I didn't use it here.
Scraping all ~10,000 HD images is quite slow, but the time for a single page is still acceptable.
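If the "all" run feels too slow, one way to speed it up (not part of the script above) is to crawl several listing pages concurrently with a thread pool. Below is a minimal sketch using concurrent.futures; it assumes the download_page() function defined above, and max_workers=8 is an arbitrary choice, so treat it as a starting point rather than a tuned solution.

# Hypothetical helper: run download_page() for several pages in parallel.
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all_pages(first=1, last=42, max_workers=8):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # submit one task per listing page
        futures = {pool.submit(download_page, p): p for p in range(first, last + 1)}
        for future in as_completed(futures):
            page = futures[future]
            try:
                future.result()  # re-raise any exception from the worker
            except Exception as exc:
                print('Page %d failed: %s' % (page, exc))

# download_all_pages()  # would crawl pages 1-42 concurrently

Keep in mind that hammering the site with many parallel requests may get you rate-limited or blocked, so a small worker count (or an added delay) is the safer choice.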