Scraping the Guazi Used-Car Site
Runtime environment: Python 3.7
Target URL: https://www.guazi.com/sh/buy/o{num}/ where num is the page number, 0 through 40.
import requests
from lxml import etree
import re
import time
import pandas as pd
# Request headers; the Cookie was copied from a browser session
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.guazi.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Cookie': 'uuid=9e4c7460-99c0-4c76-b903-010314124b30; ganji_uuid=8317462581327123021320; lg=1; user_city_id=17; antipas=5q287q21313746Hlg4013K60G3Y8; clueSourceCode=*#00; sessionid=518e7f53-de88-465b-a4ad-f818c8760709; cainfo={"ca_a":"-","ca_b":"-","ca_s":"self","ca_n":"self","ca_medium":"-","ca_term":"-","ca_content":"-","ca_campaign":"-","ca_kw":"-","ca_i":"-","scode":"-","keyword":"-","ca_keywordid":"-","display_finance_flag":"-","platform":"1","version":1,"client_ab":"-","guid":"9e4c7460-99c0-4c76-b903-010314124b30","ca_city":"sh","sessionid":"518e7f53-de88-465b-a4ad-f818c8760709"}; close_finance_popup=2020-06-13; lng_lat=121.521431_30.96329; gps_type=1; preTime={"last":1592015555,"this":1591972062,"pre":1591972062}; cityDomain=su; _gl_tracker={"ca_source":"-","ca_name":"-","ca_kw":"-","ca_id":"-","ca_s":"self","ca_n":"-","ca_i":"-","sid":37974657707}'
}
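# The Cookie above is session-bound and expires; when it goes stale, Guazi
# serves a verification page instead of the listing. Below is a quick sanity
# check to run before a long crawl. This is a hypothetical helper, not part
# of the original script, and the 'carlist' marker is an assumption based on
# the listing markup used in the XPath further down:
def check_access(url):
    resp = requests.get(url, headers=headers)
    # A normal listing page contains the carlist <ul>; the anti-crawler
    # page does not
    return resp.status_code == 200 and 'carlist' in resp.text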
# Collect the detail-page URLs from one listing page
def get_detail_urls(url):
    try:
        # Fetch the listing page
        response = requests.get(url, headers=headers)
        text = response.content.decode('utf-8')
    except Exception as e:
        # Bail out early so we never parse an undefined response
        print('failed to fetch listing page:', e)
        return []
    # Parse the HTML
    html = etree.HTML(text)
    ul = html.xpath('//ul[@class="carlist clearfix js-top"]')[0]
    lis = ul.xpath('./li')
    detail_urls = []
    for li in lis:
        detail_url = li.xpath('./a/@href')
        detail_url = 'https://www.guazi.com' + detail_url[0]
        detail_urls.append(detail_url)
    return detail_urls
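# Example usage (a sketch; run it standalone to verify the XPath still
# matches, since Guazi changes its markup from time to time):
#   urls = get_detail_urls('https://www.guazi.com/sh/buy/o1/')
#   print(len(urls), urls[:2])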
# Parse one detail page into a dict of fields
def parse_detail_url(url):
    try:
        time.sleep(5)  # throttle requests to avoid getting blocked
        resp = requests.get(url, headers=headers)
        text = resp.content.decode('utf-8')
    except Exception as e:
        print('failed to fetch detail page:', e)
        return None
    html = etree.HTML(text)
    # The double slash matches the div anywhere in the document;
    # class="product-textbox" is unique on the page, and text() must be
    # written with parentheses because it is a function
    title = html.xpath('//div[@class="product-textbox"]/h2/text()')[0]
    # Remove the embedded line break and strip surrounding whitespace
    title_2 = title.replace('\r\n', '').strip()
    infos = {}
    info = html.xpath('//div[@class="product-textbox"]/ul/li/span/text()')
    price_info = html.xpath('//div[@class="product-textbox"]/div/div/span[@class="price-num"]/text()')[0]
    price2 = re.search(r'\d+\.?\d+', price_info).group(0)
    infos['title'] = title_2
    infos['cardtime'] = info[0]
    infos['km'] = info[2]
    infos['displacement'] = info[3]
    infos['speedbox'] = info[4]
    infos['price'] = price2
    return infos
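# Note: the dict above relies on the spec list keeping a fixed order
# (info[0] = cardtime, info[2] = km, info[3] = displacement,
# info[4] = speedbox); if the page layout shifts, these indices need
# re-checking. The returned shape looks like this (placeholders, not
# real data):
#   {'title': '...', 'cardtime': '...', 'km': '...',
#    'displacement': '...', 'speedbox': '...', 'price': '...'}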
def main():
    # 1. Target URL template; o{} is the page number
    base_url = 'https://www.guazi.com/sh/buy/o{}/'
    for x in range(0, 41):
        rows = []  # 'list' shadowed the built-in, so use a different name
        print('crawling page {}'.format(x))
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            infos = parse_detail_url(detail_url)
            if infos:
                rows.append(infos)
        # Append this page's rows; mode='a' means re-running the script keeps
        # adding to guazi.csv, so delete the file before a fresh run
        df = pd.DataFrame(rows)
        df.to_csv('guazi.csv', mode='a', header=False, index=False, encoding='utf-8-sig')
    print('all done')

if __name__ == '__main__':
    main()
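Because to_csv is called with header=False on every append, guazi.csv ends up with no header row, so supply the column names yourself when loading it back. A minimal sketch:

import pandas as pd

columns = ['title', 'cardtime', 'km', 'displacement', 'speedbox', 'price']
df = pd.read_csv('guazi.csv', names=columns, encoding='utf-8-sig')
print(df.head())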
That's it. Corrections and suggestions are welcome!