import json

import jsonpath
import requests
import xlwt
def saveData(data_list):
    """Write scraped job postings to 'tengxun2.xls'.

    data_list: list of lists of dicts — one inner list per post id, as
    returned by spider_page(); each dict holds one posting's fields
    keyed by 'id', '职位', '工作地点', '工作职责', '工作要求', '发布时间', 'url'.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('tengxun2')
    col = ("postid", "职位名", "工作地点", "工作职责", "工作要求", "发布时间", "url")
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    # Use an explicit running row counter: the original wrote every dict of
    # an inner list to the same row (i + 1), silently overwriting earlier
    # entries whenever an inner list held more than one posting.
    row = 1
    for data in data_list:
        for j in data:
            sheet.write(row, 0, j['id'])
            sheet.write(row, 1, j['职位'])
            sheet.write(row, 2, j['工作地点'])
            sheet.write(row, 3, j['工作职责'])
            sheet.write(row, 4, j['工作要求'])
            sheet.write(row, 5, j['发布时间'])
            sheet.write(row, 6, j['url'])
            row += 1
    book.save('tengxun2.xls')


def spider_page(j):
    """Fetch the detail page for post id `j`.

    Returns a one-element list containing a dict of the posting's fields.
    Raises if the HTTP request fails or the response is not the expected
    JSON shape (items[0] indexing assumes '$.Data' matched).
    """
    url = ('https://careers.tencent.com/tencentcareer/api/post/ByPostId'
           '?timestamp=&postId=' + str(j) + '&language=zh-cn')
    response = requests.get(url)
    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data')
    result = {
        'id': items[0]['PostId'],
        '职位': items[0]['RecruitPostName'],
        '工作地点': items[0]['LocationName'],
        '工作职责': items[0]['Responsibility'],
        '工作要求': items[0]['Requirement'],
        '发布时间': items[0]['LastUpdateTime'],
        'url': items[0]['PostURL'],
    }
    return [result]


def spider_url(post, num):
    """Return the list of PostIds on page `num` of the search for keyword `post`."""
    url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
           '?timestamp=1591689554351&countryId=&cityId=&bgIds=&productId='
           '&categoryId=&parentCategoryId=&attrId=&keyword=' + post +
           '&pageIndex=' + str(num) + '&pageSize=10&language=zh-cn&area=cn')
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url, headers=headers)
    jsonobj = json.loads(response.text)
    items = jsonpath.jsonpath(jsonobj, '$.Data.Posts[*]')
    # Comprehension instead of append loop; also avoids shadowing builtin `list`.
    return [i['PostId'] for i in items]


if __name__ == '__main__':
    data_list = []
    post = input('输入要查找的职位:')  # input() already returns str
    num = int(input('输入要爬取的页数:'))
    for i in range(1, num + 1):
        for j in spider_url(post, i):
            data_list.append(spider_page(j))
    saveData(data_list)
    print('爬取完毕')