Lecture-02-Search-Policy-and-Simple-Machine-Learning


Problem Description

Hi everyone. When we use networkx to draw graphs with Chinese labels, we find that the Chinese characters cannot be displayed.

Solution
Download the font SimHei.ttf from the GitHub repository, then run the following in a Jupyter notebook to locate matplotlib:

```python
import matplotlib
print(matplotlib.__path__)
```

Find matplotlib's path and cd into it, then keep going down into mpl-data/fonts/ttf. There, replace DejaVuSans.ttf with the file we just downloaded:

    $ mv SimHei.ttf DejaVuSans.ttf

The TTF font used here is attached to this post.
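Overwriting DejaVuSans.ttf works, but it is fragile: a matplotlib upgrade restores the original file. A gentler alternative, assuming matplotlib >= 3.2, is to register the downloaded font and make it the default sans-serif family:

```python
import matplotlib.pyplot as plt
from matplotlib import font_manager

# Register the downloaded SimHei.ttf (the path is wherever you saved it),
# then make it the default sans-serif font so Chinese labels render.
font_manager.fontManager.addfont('SimHei.ttf')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly
```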

    Previous Course

What's a language model? The OOV problem? Syntax trees. lambda.

$pr(w)$

Out of Vocabulary (OOV)

$Pr(w_o) = \text{constant} < Pr(w_1)$

Language Model: Smoothing
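As a quick refresher, add-one (Laplace) smoothing is one way to give OOV words the small constant probability mentioned above. A minimal sketch, not the previous lecture's exact code:

```python
from collections import Counter

corpus = '蓝色的 好看的 小小的 蓝色的'.split()
counts = Counter(corpus)
V = len(counts)           # vocabulary size
N = sum(counts.values())  # total tokens

def pr(word):
    # Add-one smoothing: an unseen (OOV) word gets the constant 1 / (N + V + 1)
    return (counts[word] + 1) / (N + V + 1)

pr('蓝色的')   # seen word: higher probability
pr('未见过的')  # OOV word: the constant probability
```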

```python
import random

# Grammar: adj* => adj adj* | null
def adj():
    return random.choice('蓝色的 | 好看的 | 小小的'.split('|')).split()[0]

def adj_star_2():
    # wrapping each branch in a lambda delays evaluation, so the
    # recursive branch only runs when it is actually chosen
    return random.choice([lambda: '', lambda: adj() + adj_star_2()])()

def adj_star():
    # broken version: both list elements are evaluated eagerly,
    # so this recurses unconditionally and never terminates
    return random.choice(['', adj() + adj_star()])

lambda: adj() + adj_star_2()

adj_star_2()
```

A short detour on lambda:

```python
numbers = [1, -2, -4, 1, 5, 6, 9, -11]

def absolute(number):
    if number < 0:
        return -1 * number
    else:
        return number

absolute(-11)

def mod_5(number):
    return number % 5

sorted(numbers, key=lambda x: x % 5)

mod_5_lambda = lambda x: x % 5
mod_5_lambda(19)
mod_5(19)
```

Search Policy

```python
coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""

city_location = {
    '香港': (114.17, 22.28)
}
```

Input: string -> dict

```python
test_string = "{name:'兰州', geoCoord:[103.73, 36.03]},"

import re
# escape the dots and allow integer coordinates such as [117, 36.65]
pattern = re.compile(r"name:'(\w+)',\s+geoCoord:\[(\d+(?:\.\d+)?),\s(\d+(?:\.\d+)?)\]")

for line in coordination_source.split('\n'):
    city_info = pattern.findall(line)
    if not city_info:
        continue
    # following: we found the city info
    city, long, lat = city_info[0]
    long, lat = float(long), float(lat)
    city_location[city] = (long, lat)

city_location
```

```python
import math

def geo_distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)       # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(geo_distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

def get_geo_distance(city1, city2):
    # city_location stores (long, lat) while geo_distance expects (lat, long),
    # so swap the order before calling
    long1, lat1 = city_location[city1]
    long2, lat2 = city_location[city2]
    return geo_distance((lat1, long1), (lat2, long2))

get_geo_distance('Shanghai', 'Hangzhou')
```

```python
import networkx as nx

city_graph = nx.Graph()
city_graph.add_nodes_from(list(city_location.keys()))

%matplotlib inline
import matplotlib.pyplot as plt

# city_location doubles as the position dict for drawing
nx.draw(city_graph, city_location, with_labels=True, node_size=30)
```

```python
threshold = 300  # km: connect every pair of cities closer than this

from collections import defaultdict

city_connection = defaultdict(list)
for c1 in city_location:
    for c2 in city_location:
        if c1 == c2:
            continue
        distance = get_geo_distance(c1, c2)
        if distance < threshold:
            city_connection[c1].append(c2)
            city_connection[c2].append(c1)

city_connection

city_with_road = nx.Graph(city_connection)
nx.draw(city_with_road, city_location, with_labels=True, node_size=30)
```

```python
from collections import defaultdict

simple_connection_info_src = {
    '北京': ['太原', '沈阳'],
    '太原': ['北京', '西安', '郑州'],
    '兰州': ['西安'],
    '郑州': ['太原'],
    '西安': ['兰州', '长沙'],
    '长沙': ['福州', '南宁'],
    '沈阳': ['北京'],
}

simple_connection_info = defaultdict(list)
simple_connection_info.update(simple_connection_info_src)

def bfs(graph, start):
    """breadth-first search"""
    visited = [start]
    seen = set()
    while visited:
        frontier = visited.pop()
        # if frontier in seen: continue
        for successor in graph[frontier]:
            if successor in seen:
                continue
            print(successor)
            # visited = visited + [successor]
            #   always expand the newest discovered node -> depth first
            visited = [successor] + visited
            #   always expand the oldest discovered node first -> breadth first
            # so this expansion order is exactly what decides
            # depth-first versus breadth-first
        seen.add(frontier)
    return seen

number_graph = defaultdict(list)
number_graph.update({
    1: [2, 3],
    2: [1, 4],
    3: [1, 5],
    4: [2, 6],
    5: [3, 7],
    7: [5, 8],
})

bfs(number_graph, 1)

simple_connection_info['西安']

# note: drawing assumes the position dict has entries for these city names
nx.draw(nx.Graph(simple_connection_info), city_location, with_labels=True, node_size=10)
```

```python
def search(start, destination, connection_graph, sort_candidate=lambda paths: paths):
    paths = [[start]]
    visited = set()
    while paths:  # while there are candidate paths to extend
        path = paths.pop(0)
        frontier = path[-1]
        if frontier in visited:
            continue
        successors = connection_graph[frontier]
        for city in successors:
            if city in path:
                continue  # eliminate loops
            new_path = path + [city]
            paths.append(new_path)
            if city == destination:
                return new_path
        visited.add(frontier)
        # a sort function over the candidate paths controls the search strategy
        paths = sort_candidate(paths)

def transfer_stations_first(paths):
    return sorted(paths, key=len)

def transfer_as_much_possible(paths):
    return sorted(paths, key=len, reverse=True)

def shortest_path_first(paths):
    if len(paths) <= 1:
        return paths

    def get_path_distance(path):
        # total length of the path: sum over consecutive stations
        distance = 0
        for i in range(len(path) - 1):
            distance += get_geo_distance(path[i], path[i + 1])
        return distance

    return sorted(paths, key=get_path_distance)

# note: shortest_path_first needs every station to be present in city_location
search('兰州', '福州', simple_connection_info, sort_candidate=shortest_path_first)

def pretty_print(cities):
    print('🚗->'.join(cities))

pretty_print(search('北京', '福州', simple_connection_info))
```

Breadth-first search

Depth-first search
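To make the contrast concrete, here is a small sketch (not from the original notebook) that runs both orders on the number_graph defined above; the only difference is which end of the frontier receives newly discovered nodes:

```python
def traverse(graph, start, breadth_first=True):
    frontier = [start]
    seen = set()
    order = []
    while frontier:
        node = frontier.pop()  # always pop from the end
        if node in seen:
            continue
        seen.add(node)
        order.append(node)
        successors = [s for s in graph[node] if s not in seen]
        if breadth_first:
            frontier = successors + frontier  # old nodes stay near the end -> FIFO
        else:
            frontier = frontier + successors  # new nodes land at the end -> LIFO
    return order

print(traverse(number_graph, 1, breadth_first=True))   # visits level by level
print(traverse(number_graph, 1, breadth_first=False))  # dives down one branch first
```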

```python
pretty_print(search('北京', '南京', city_connection))
pretty_print(search('北京', '长沙', city_connection))
pretty_print(search('北京', '广州', city_connection, sort_candidate=transfer_stations_first))
pretty_print(search('北京', '广州', city_connection, sort_candidate=transfer_as_much_possible))
```

Web Crawlers and BFS

```python
import requests
from lxml import etree
from collections import Counter

# project-local helpers from the course repository
from config import config
from utils.common import get_header
from utils.db_utils import insert


class LaGou(object):
    def __init__(self, keyword, city, type):
        self.keyword = keyword
        self.city = city
        self.type = type
        self.baseurl = 'https://www.lagou.com/jobs/positionAjax.json'
        self.header = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www.lagou.com/jobs/list_运维?city=成都&cl=false&fromSearch=true&labelWords=&suginput=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }

    def spider(self):
        expanded_skills = []
        max_page = 10
        for i in range(1, max_page):
            # visit the list page first to obtain fresh cookies
            s = requests.Session()
            s.get(
                url='https://www.lagou.com/jobs/list_运维?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                headers=get_header(), timeout=3)
            cookie = s.cookies
            res = requests.post(self.baseurl, headers=self.header,
                                data={'first': True, 'pn': i, 'kd': self.keyword},
                                params={'px': 'default', 'city': self.city, 'needAddtionalResult': 'false'},
                                cookies=cookie, timeout=3)
            text = res.json()
            all_data = text['content']['positionResult']['result']
            for data in all_data:
                s = requests.Session()
                s.get(
                    url='https://www.lagou.com/jobs/list_运维?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                    headers=get_header(), timeout=3)
                cookie1 = s.cookies
                url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
                req1 = requests.get(url, headers=self.header, cookies=cookie1)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
                if not detail:
                    # fall back if the nested xpath matched nothing
                    detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
                related_skills = data.get('skillLables')
                data_dict = {
                    "firstType": str(data.get('firstType')),
                    "secondType": str(data.get('secondType')),
                    "thirdType": str(data.get('thirdType')),
                    "city": str(data.get("city")),
                    "positionName": str(data.get('positionName')),
                    "district": str(data.get('district')),
                    "stationname": str(data.get('stationname')),
                    "jobNature": str(data.get('jobNature')),
                    "companyLabelList": str(data.get('companyLabelList')),
                    "industryField": str(data.get('industryField')),
                    "salary": str(data.get('salary')),
                    "companySize": str(data.get('companySize')),
                    "skillLables": str(related_skills),
                    "createTime": str(data.get('createTime')),
                    "companyFullName": str(data.get('companyFullName')),
                    "workYear": str(data.get('workYear')),
                    "education": str(data.get('education')),
                    "positionAdvantage": str(data.get('positionAdvantage')),
                    "url": str(url),
                    "detail": str(detail),
                    "type": str(self.type),
                    "latitude": str(data.get("latitude")),
                    "longitude": str(data.get("longitude")),
                    "keyword": str(self.keyword),
                }
                # time.sleep(random.randint(1, 5))
                expanded_skills += related_skills
                if not insert('jobs', **data_dict):
                    continue
        return [s.lower() for s in expanded_skills]


def lagou_worker(city):
    _, position, init_job = config()
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job in visited_jobs:
            continue
        type = ''
        for k, v in position.items():
            if search_job in v:
                type = k
        new_expanded = LaGou(keyword=search_job, city=city, type=type).spider()
        expanded_counter = Counter(new_expanded).most_common(n=5)
        new_jobs = [j for j, n in expanded_counter]
        init_job += new_jobs
        visited_jobs.add(search_job)


if __name__ == '__main__':
    init_job = ['人工智能', '测试', '运维', '交互设计', '数据产品经理', '原画师',
                '动画师', '区块链', '产品经理', '用户运营', '数据运营']
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job in visited_jobs:
            continue
        new_expanded = LaGou(keyword=search_job, city='全国', type='产品线').spider()
        expanded_counter = Counter(new_expanded).most_common(n=5)
        new_jobs = [j for j, n in expanded_counter]
        init_job += new_jobs
        visited_jobs.add(search_job)
        print(search_job)
```
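Stripped of the LaGou specifics, the worker above is exactly the BFS from earlier: a queue of keywords, a visited set, and expansion by the most common related skills. A minimal sketch of that skeleton, where fetch_related is a hypothetical stand-in for LaGou(...).spider():

```python
from collections import deque

def bfs_crawl(seeds, fetch_related, max_visits=100):
    queue = deque(seeds)  # frontier: the oldest keyword is expanded first
    visited = set()
    while queue and len(visited) < max_visits:
        keyword = queue.popleft()
        if keyword in visited:
            continue
        visited.add(keyword)
        queue.extend(fetch_related(keyword))  # enqueue newly discovered keywords
    return visited
```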

    Machine Learning

```python
from sklearn.datasets import load_boston

# note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this lecture used an older version (see the alternative loader below)
data = load_boston()
X, y = data['data'], data['target']

X[1]
y[1]
len(X[:, 0])
len(y)

%matplotlib inline
import matplotlib.pyplot as plt

def draw_rm_and_price():
    # RM (column 5) is the average number of rooms per dwelling
    plt.scatter(X[:, 5], y)

draw_rm_and_price()

import random

def price(rm, k, b):
    """f(x) = k * x + b"""
    return k * rm + b

# draw a randomly chosen line over the data
# (re-running this cell gives a different random line)
X_rm = X[:, 5]
k = random.randint(-100, 100)
b = random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]

draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
```
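On a current scikit-learn install, the same X and y can be rebuilt from the original CMU source; this mirrors the loader suggested in scikit-learn's deprecation notice:

```python
import pandas as pd
import numpy as np

# Boston housing data from the original source: X has 13 features,
# y is the median house price in $1000s
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]
```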

The model's prediction is written $\hat{y}$.

```python
list(y)                   # the true prices
price_by_random_k_and_b   # the predictions, i.e. y-hat

# toy vectors used to motivate the loss function below
[1, 1, 1]
[2, 2, 2]
```

    loss

$loss = \frac{1}{n} \sum{(y_i - \hat{y_i})}^2$

$loss = \frac{1}{n} \sum{(y_i - (kx_i + b))}^2$

Substituting $\hat{y_i} = kx_i + b$ and differentiating with respect to $k$:

$\frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum(y_i - (kx_i + b))x_i$

$\frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum(y_i - \hat{y_i})x_i$

$\frac{\partial{loss}}{\partial{b}} = -\frac{2}{n}\sum(y_i - \hat{y_i})$
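Before trusting these formulas in code, a finite-difference check is cheap. A minimal sketch; the function names and toy data here are illustrative, not from the lecture:

```python
def numeric_partial_k(k, b, x, y, eps=1e-6):
    # central-difference approximation of d(loss)/dk with b held fixed
    def loss_at(k_):
        return sum((y_i - (k_ * x_i + b)) ** 2 for x_i, y_i in zip(x, y)) / len(y)
    return (loss_at(k + eps) - loss_at(k - eps)) / (2 * eps)

def analytic_partial_k(k, b, x, y):
    # the closed-form gradient derived above
    n = len(y)
    return -2 / n * sum((y_i - (k * x_i + b)) * x_i for x_i, y_i in zip(x, y))

x, y = [1.0, 2.0, 3.0], [2.0, 4.0, 6.0]
print(numeric_partial_k(1.5, 0.0, x, y))   # should closely match...
print(analytic_partial_k(1.5, 0.0, x, y))  # ...the analytic value
```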

```python
def loss(y, y_hat):
    # mean squared error, to evaluate the fit
    return sum((y_i - y_hat_i) ** 2 for y_i, y_hat_i in zip(list(y), list(y_hat))) / len(list(y))
```

Method 1: Random Generation (find the best k and b)

```python
trying_times = 2000
min_loss = float('inf')
best_k, best_b = None, None

for i in range(trying_times):
    k = random.random() * 200 - 100
    b = random.random() * 200 - 100
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    current_loss = loss(y, price_by_random_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        best_k, best_b = k, b
        print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))

10 ** 0.5

# plot one good hand-picked line against the data
X_rm = X[:, 5]
k = 15
b = -68
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
```

Method 2: Direction Adjusting

```python
trying_times = 2000
min_loss = float('inf')
best_k = random.random() * 200 - 100
best_b = random.random() * 200 - 100

directions = [
    (+1, -1),  # first element: k's change direction; second element: b's
    (+1, +1),
    (-1, -1),
    (-1, +1),
]
next_direction = random.choice(directions)
scalar = 0.1
update_time = 0

for i in range(trying_times):
    k_direction, b_direction = next_direction
    current_k, current_b = best_k + k_direction * scalar, best_b + b_direction * scalar
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)

    if current_loss < min_loss:  # performance became better
        min_loss = current_loss
        best_k, best_b = current_k, current_b
        next_direction = next_direction  # keep moving in the same direction
        update_time += 1
        if update_time % 10 == 0:
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
    else:
        next_direction = random.choice(directions)  # otherwise try a new random direction
```

If we want faster updates, that is, better results in less time, we need one thing:

Find the right direction of change.

How do we find the right direction of change?

Method 3: supervise how the parameters change -> supervised learning

Derivatives

```python
def partial_k(x, y, y_hat):
    # gradient of the loss with respect to k: -2/n * sum((y - y_hat) * x)
    n = len(y)
    gradient = 0
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2 / n * gradient

def partial_b(x, y, y_hat):
    # gradient of the loss with respect to b: -2/n * sum(y - y_hat)
    n = len(y)
    gradient = 0
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    return -2 / n * gradient

from icecream import ic  # debugging helper imported in the original notebook (unused below)

trying_times = 2000
X, y = data['data'], data['target']
min_loss = float('inf')
current_k = random.random() * 200 - 100
current_b = random.random() * 200 - 100
learning_rate = 1e-04
update_time = 0

for i in range(trying_times):
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        if i % 50 == 0:
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, current_k, current_b, min_loss))
    # step against the gradient
    k_gradient = partial_k(X_rm, y, price_by_k_and_b)
    b_gradient = partial_b(X_rm, y, price_by_k_and_b)
    current_k = current_k + (-1 * k_gradient) * learning_rate
    current_b = current_b + (-1 * b_gradient) * learning_rate

# plot the line found by gradient descent
X_rm = X[:, 5]
k = 11.431551629413757
b = -49.52403584539048
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)

loss([1, 1, 1], [2, 2, 3])
data['feature_names']
```
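For reference, the same loop vectorizes neatly with NumPy. A sketch under the same model $y = kx + b$; fit_linear is a hypothetical helper, not part of the lecture code:

```python
import numpy as np

def fit_linear(x, y, lr=1e-4, steps=2000):
    # gradient descent on loss = mean((y - (k*x + b))**2)
    k, b = np.random.uniform(-100, 100, size=2)
    n = len(y)
    for _ in range(steps):
        y_hat = k * x + b
        k -= lr * (-2 / n) * np.sum((y - y_hat) * x)  # d(loss)/dk
        b -= lr * (-2 / n) * np.sum(y - y_hat)        # d(loss)/db
    return k, b

# k_fit, b_fit = fit_linear(X_rm, y)
```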