获取迁木网QS世界大学排名信息

    技术2024-01-22  105

    处理网址:http://www.qianmu.org/ranking/1528.htm

    # Scrape the QS World University Rankings pages published on qianmu.org.
    import re

    import requests
    from lxml import etree

    # Seconds to wait for a response before giving up, so that one stalled
    # connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Footnote markers such as the "*(3)" in '9,771*(3)'.
    FOOTNOTE_PATTERN = re.compile(r'\*\(\d+\)')

    # Deletes tabs and line breaks in a single pass over the page source.
    STRIP_CONTROL_CHARS = str.maketrans('', '', '\t\n\r')


    def fetch(start_url):
        """Download a page and return its body as text.

        Args:
            start_url: URL of the page to request.

        Returns:
            The response body decoded by ``requests``.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if no response arrives within REQUEST_TIMEOUT.
        """
        r = requests.get(start_url, timeout=REQUEST_TIMEOUT)
        # raise_for_status() is already a no-op for non-error statuses, so no
        # explicit status_code comparison is needed.
        r.raise_for_status()
        return r.text


    def process_detail(link, length, num):
        """Fetch one university detail page and extract its infobox.

        Args:
            link: URL of the detail page.
            length: total number of detail pages, used for the progress line.
            num: 1-based position of this page, used for the progress line.

        Returns:
            A dict holding ``'name'`` plus one entry per infobox row (first
            column as key, second column as value), or ``None`` when the page
            has no title, no infobox, or an infobox whose two columns differ
            in length.
        """
        select = etree.HTML(fetch(link).translate(STRIP_CONTROL_CHARS))

        names = select.xpath('//*[@id="wikiContent"]/h1/text()')
        if not names:
            # A page without a title is not a detail page; skip it instead of
            # aborting the crawl with an IndexError.
            return None
        data = {'name': names[0].strip()}
        print("处理进度:[%s]-%d/%d" % (data['name'], num, length))

        tables = select.xpath('//div[@class="infobox"]')
        if not tables:
            return None
        table = tables[0]

        keys_list = [
            ''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[1]')
        ]
        values = [
            ''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[2]')
        ]
        # Rows cannot be paired reliably when the columns are uneven.
        if len(keys_list) != len(values):
            return None

        data.update(zip(keys_list, values))
        return data


    def process_data(data):
        """Strip footnote markers from every value, print and return the result.

        Args:
            data: mapping of field name to raw text, or ``None``.

        Returns:
            A new dict with every ``*(N)`` marker removed from the values, or
            ``None`` when ``data`` is empty or ``None`` (nothing is printed).
        """
        if not data:
            return None
        # sub() removes every marker; replacing only the first match's text
        # would leave differing markers (e.g. "*(3)" and "*(4)") behind.
        new_data = {k: FOOTNOTE_PATTERN.sub('', v) for k, v in data.items()}
        print(new_data)
        return new_data


    def main():
        """Crawl the ranking index page and process every linked detail page."""
        # Entry page
        start_url = 'http://www.qianmu.org/ranking/1528.htm'
        html = etree.HTML(fetch(start_url))
        # NOTE(review): the [contains(@a,"")] predicate looks like a no-op
        # (an empty needle always matches); kept as-is to avoid changing which
        # links are selected -- confirm and simplify.
        links = html.xpath('//div[@class="rankItem"]/table/tbody/tr[position()>1]/td[2]/a[contains(@a,"")]/@href')
        length = len(links)
        for num, link in enumerate(links, 1):
            process_data(process_detail(link, length, num))


    if __name__ == "__main__":
        main()

    效果图: 

    Processed: 0.016, SQL: 9