Scrapy is a fast, high-level screen-scraping and web-crawling framework for Python, used to crawl websites and extract structured data from their pages. It has a wide range of uses, including data mining, monitoring, and automated testing. That is the power of a framework: to build a crawler that scrapes page content and images, you only need to customize and develop a few modules, which is very convenient.
Big-data visualization requires analyzing data of different kinds across different industries, so Python crawlers and big data go hand in hand. This post crawls and analyzes JD.com book prices over the span of a week.
1. The first step in crawling any site is to analyze its pages. Start with the book search pages:
https://search.jd.com/Search?keyword=%E5%9B%BE%E4%B9%A6&wq=%E5%9B%BE%E4%B9%A6&page=1
https://search.jd.com/Search?keyword=%E5%9B%BE%E4%B9%A6&wq=%E5%9B%BE%E4%B9%A6&page=2
From these URLs we can see that the book list shows 30 entries at a time, and that pagination is controlled by the page parameter.
2. Inspect the book list:
1) XPath of the list: //*[@id="J_goodsList"]/ul
2) Get the link to each book's details (every detail-page link is the JD base URL plus the product code).
3) To collect the full details you must enter the product detail page itself; the link's XPath is //*[@id="J_goodsList"]/ul/li[1]/div/div[3]/a
4) On the detail page, locate the desired product fields and extract them.
5) At this point you will find that the price cannot be extracted: the price and product name are loaded dynamically, and an ordinary crawler cannot see dynamically loaded content. So we need a workaround. While inspecting elements I noticed something interesting: "https://p.3.cn/prices/mgets?skuIds=J_" + product ID returns the product's price information (see the quick check below).
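Before wiring this endpoint into the spiders, it is worth confirming it by hand. Below is a minimal check with requests; the SKU ID is a placeholder, and the response shape in the comment is only an example (the "p" field is the one the spider later reads):

# Quick manual check of the JD price endpoint (run outside Scrapy).
# The SKU ID below is a placeholder; substitute a real product code.
import requests

sku_id = "12345678"  # hypothetical product ID
url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(resp.json())  # e.g. [{"id": "J_12345678", "p": "59.00", ...}]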
3. The overall plan: since we want the same product's price across different time periods, this is more than a one-shot crawl. First, crawl each product's details (everything except price) into a JSON file; then, for every product ID in that JSON, fetch the price via "https://p.3.cn/prices/mgets?skuIds=J_" + product ID and stamp it with the date of that crawl.
1. Create good.py to fetch the product details (everything except price):
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import quote

from goods.items import GoodsItem


class GoodSpider(scrapy.Spider):
    name = 'good'
    # Detail pages live on item.jd.com, so allow the whole jd.com domain.
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=图书&wq=图书&page=1']

    def parse(self, response):
        print("Spider started")
        # Ask how many search-result pages to crawl.
        pages = int(input("Enter the number of pages to crawl: "))
        # A Chinese keyword has to be percent-encoded before it goes into a URL.
        keyword = quote("图书", encoding="utf-8")
        print(keyword)
        # Build the URL of every search-result page to visit.
        a_href = []
        for page in range(1, pages + 1):
            a_href.append("https://search.jd.com/Search?keyword=" + keyword
                          + "&wq=" + keyword + "&page=" + str(page))
        print(a_href)
        for href in a_href:
            print("==" * 40)
            print(href)
            # The dupe filter would drop these requests, so dont_filter=True is required.
            yield scrapy.Request(url=href, callback=self.parse_book_list, dont_filter=True)

    def parse_book_list(self, response):
        # On a search-result page, collect the link to each book's detail page.
        print("Entering the JD book list")
        li_list = response.xpath("//*[@id='J_goodsList']/ul")
        print(li_list)
        for li in li_list:
            a_href = li.xpath(".//li/div/div[3]/a/@href").extract()
            print(a_href)
            if a_href:
                # The hrefs are protocol-relative (//item.jd.com/...), so prepend the scheme.
                a_href = ['https:' + x for x in a_href]
                print(len(a_href))
                for c_href in a_href:
                    print("==" * 40)
                    print(c_href)
                    yield scrapy.Request(
                        url=c_href,
                        callback=self.parse_book_li,
                        dont_filter=True
                    )

    def parse_book_li(self, response):
        item = {}
        print("Entering the detail page")
        # Book title
        item["Title"] = response.xpath("//*[@id='name']/div[1]/text()").extract_first().strip()
        print("Title===:", item["Title"])
        # Author
        item["Author"] = response.xpath("//*[@id='p-author']/a[1]/@data-name").extract_first()
        print("Author===:", item["Author"])
        li = response.xpath("//*[@id='parameter2']")
        b = '：'  # full-width colon separating label and value, e.g. "出版社：xxx"
        for li_book in li:
            books = li_book.xpath(".//li/text()").extract()
            # Each entry looks like "label：value"; take the text before the colon
            # as the label and everything after it as the value.
            for a in books:
                c = a[:a.find(b)]
                if c == '出版社':        # publishing house (taken from the link's title)
                    item["PublishingHouse"] = li.xpath(".//li/a[1]/@title").extract_first()
                elif c == 'ISBN':
                    item["ISBN"] = a[a.find(b):].replace(b, "")
                elif c == '版次':        # edition
                    item["Edition"] = a[a.find(b):].replace(b, "")
                elif c == '商品编码':    # product ID
                    item["ProductID"] = a[a.find(b):].replace(b, "")
                elif c == '包装':        # packaging
                    item["Packaging"] = a[a.find(b):].replace(b, "")
                elif c == '开本':        # format
                    item["Format"] = a[a.find(b):].replace(b, "")
                elif c == '出版时间':    # publication time
                    item["PublicationTime"] = a[a.find(b):].replace(b, "")
                elif c == '用纸类型':    # paper type
                    item["PaperType"] = a[a.find(b):].replace(b, "")
                elif c == '页数':        # page count
                    item["BookNum"] = a[a.find(b):].replace(b, "")
        yield item
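The label/value split above hinges on the full-width colon in entries such as "出版社：人民邮电出版社". Here is a standalone sketch of that parsing logic; the sample strings are made up for illustration:

# Minimal sketch of the parameter2 label/value split used in parse_book_li.
samples = ["ISBN：9787115428028", "版次：1", "页数：694"]  # made-up entries
SEP = "："  # full-width colon, as used on JD detail pages

fields = {}
for entry in samples:
    label, _, value = entry.partition(SEP)
    fields[label] = value
print(fields)  # {'ISBN': '9787115428028', '版次': '1', '页数': '694'}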
li.xpath(".//li[8]/@title").extract_first() # print("出版时间:",publication_time) # # 用纸类型 # paper_type = li.xpath(".//li[9]/@title").extract_first() # print("用纸类型:",paper_type) # # 页数 # book_num = li.xpath(".//li[10]/@title").extract_first() # print("页数:",book_num)2、创建book_message.py根据上面获取的good.py商品ID获取该商品的当天价格:
# -*- coding: utf-8 -*-
import scrapy
import json
import datetime
import copy
from urllib.parse import quote

from goods.items import GoodsItem


class GoodSpider(scrapy.Spider):
    name = 'book_message'
    # The price API lives on p.3.cn, so whitelist it alongside jd.com.
    allowed_domains = ['jd.com', 'p.3.cn']
    # The start URL is only a trigger; parse() works from the local books.json.
    start_urls = ['https://search.jd.com/Search?keyword=图书&wq=图书&page=1']

    def parse(self, response):
        with open('.//books.json', 'r', encoding='utf-8') as load_f:
            strF = load_f.read()
        if len(strF) > 0:
            datas = json.loads(strF)
            a = datas["books"]
            print("JSON data read from the file:")
            print(a)
            item = {}
            for b in a:
                # Copy over whichever fields were crawled for this book.
                item["Title"] = b["Title"] if 'Title' in b else None
                if 'Author' in b:
                    item["Author"] = b["Author"]
                if 'PublishingHouse' in b:
                    item["PublishingHouse"] = b["PublishingHouse"]
                if 'ISBN' in b:
                    item["ISBN"] = b["ISBN"]
                if 'Edition' in b:
                    item["Edition"] = b["Edition"]
                if 'ProductID' in b:
                    item["ProductID"] = b["ProductID"]
                if 'Packaging' in b:
                    item["Packaging"] = b["Packaging"]
                if 'Format' in b:
                    item["Format"] = b["Format"]
                if 'PublicationTime' in b:
                    item["PublicationTime"] = b["PublicationTime"]
                if 'BookNum' in b:
                    item["BookNum"] = b["BookNum"]
                # Date of this price sample
                item["crawlTime"] = datetime.datetime.now().strftime('%Y-%m-%d')
                print("time:", item["crawlTime"])
                # Build the price-API URL from the product ID.
                c = b["ProductID"]
                print(c)
                url = "https://p.3.cn/prices/mgets?skuIds=J_" + c
                yield scrapy.Request(
                    url=url,
                    callback=self.parse_books,
                    # deepcopy so concurrent requests do not share one item dict
                    meta={"item": copy.deepcopy(item)},
                    dont_filter=True
                )
        else:
            print("The JSON file is empty")

    def parse_books(self, response):
        item = response.meta["item"]
        # The endpoint returns a JSON array; response.text decodes the body
        # (the original body_as_unicode() is the deprecated spelling of this).
        price_list = json.loads(response.text)
        print("price_list:", price_list)
        for li in price_list:
            item["ProductPrice"] = li["p"]  # current price
            print("ProductPrice", item["ProductPrice"])
            yield item
3. The items module defines the fields that receive the item dict:

import scrapy


class GoodsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Title = scrapy.Field()             # book title
    Author = scrapy.Field()            # author
    PublishingHouse = scrapy.Field()   # publishing house
    ISBN = scrapy.Field()              # ISBN
    Edition = scrapy.Field()           # edition
    ProductID = scrapy.Field()         # product ID
    Packaging = scrapy.Field()         # packaging
    Format = scrapy.Field()            # format
    PublicationTime = scrapy.Field()   # publication time
    PaperType = scrapy.Field()         # paper type
    BookNum = scrapy.Field()           # page count
    crawlTime = scrapy.Field()         # crawl date
    ProductPrice = scrapy.Field()      # price on the crawl date

4. The settings module:
# -*- coding: utf-8 -*-

# Scrapy settings for goods project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'goods'

SPIDER_MODULES = ['goods.spiders']
NEWSPIDER_MODULE = 'goods.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'goods (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'goods.middlewares.GoodsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'goods.middlewares.GoodsDownloaderMiddleware': 543,
    'goods.middlewares.UserAgentDownloadMiddleware': 10,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'goods.pipelines.GoodsPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. middlewares.py:
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random


class UserAgentDownloadMiddleware(object):
    # Pool of real browser user agents; one is picked at random per request.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
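To sanity-check the rotation without starting a crawl, you can call process_request on a throwaway Request. A sketch, assuming it is run from the project root so the goods package is importable:

# Standalone check of the user-agent middleware.
from scrapy.http import Request
from goods.middlewares import UserAgentDownloadMiddleware

mw = UserAgentDownloadMiddleware()
for _ in range(3):
    req = Request("https://example.com")
    mw.process_request(req, spider=None)
    print(req.headers["User-Agent"])  # usually a different agent each time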
6. pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
from openpyxl import Workbook


class GoodsPipeline(object):
    def __init__(self):
        # JSON output: one JSON object per line
        self.book_fp = open('book.json', 'wb')
        self.book_exporter = JsonLinesItemExporter(self.book_fp, ensure_ascii=False)
        # Excel output
        self.wb = Workbook()
        self.ws = self.wb.active  # activate the worksheet
        # Header row
        self.ws.append(['Title', 'Author', 'PublishingHouse', 'ISBN', 'Edition',
                        'ProductID', 'Packaging', 'Format', 'PublicationTime',
                        'BookNum', 'crawlTime', 'ProductPrice'])

    def process_item(self, item, spider):
        # Write the item as a JSON line.
        self.book_exporter.export_item(item)
        # Append an Excel row; .get() tolerates fields a given spider did not fill in.
        data = [item.get("Title"), item.get("Author"), item.get("PublishingHouse"),
                item.get("ISBN"), item.get("Edition"), item.get("ProductID"),
                item.get("Packaging"), item.get("Format"), item.get("PublicationTime"),
                item.get("BookNum"), item.get("crawlTime"), item.get("ProductPrice")]
        self.ws.append(data)
        return item

    def close_spider(self, spider):
        # Save the workbook once at the end instead of after every item.
        self.wb.save('books_1.xlsx')
        self.book_fp.close()

7. First run the main launcher to collect the product IDs and details; then run main1 to fetch the price of the same products for the current crawl period.
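The post refers to main and main1 but does not show them; here is a minimal sketch of such a launcher, assuming it sits at the project root next to scrapy.cfg:

# main.py -- launcher for the first pass (details, price excluded).
from scrapy import cmdline

cmdline.execute("scrapy crawl good".split())

main1 would be identical except that it runs "scrapy crawl book_message".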
1. Run main; the results are as follows. Note: you must wrap the exported JSON by adding {"books":[ at the front and ]} at the end, then copy the whole thing into your own books.json (a helper sketch for this step follows below).
2. Then run main1 to get the price information for the current crawl. Open the result:
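Hand-editing the exported JSON is error-prone; here is a small helper sketch that performs the same wrapping, assuming book.json is the JsonLinesItemExporter output and books.json is the file book_message.py reads:

# wrap_books.py -- turn the one-object-per-line book.json into the
# {"books": [...]} structure that book_message.py expects.
import json

with open("book.json", "r", encoding="utf-8") as f:
    books = [json.loads(line) for line in f if line.strip()]

with open("books.json", "w", encoding="utf-8") as f:
    json.dump({"books": books}, f, ensure_ascii=False, indent=2)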
When you are at your most tired, family is your best refuge!