小刘总--机器学习(深度优先算法)

    技术2022-07-10  133

    Lesson-02

    深度优先与广度优先爬虫的原理机器学习的原理

    OOV ?

    Out of Vocabulary

    Lambda

    import random """adj* => adj* adj | adj null""" def adj(): return random.choice('蓝色的 | 好看的 | 小小的'.split('|')).split()[0] give_adjs = lambda : adj() + adj_star_2() give_null = lambda : '' def adj_star_2(): return random.choice([give_null, give_adjs])() #return random.choice([lambda : '', lambda : adj() + adj_star_2()])() def adj_star(): # infite loop return random.choice(['', adj() + adj_star()]) adj_star_2() def simple_func(x, y): return x + y results = [] for e in [1, 3, 5, 7, 9, 10]: results.append(operate(e)) results #def operate(e): return e * 2 + 2 - 9 for i in map(lambda x: x * 2 + 2 - 9, [1, 3, 5, 7, 9, 10]): print(i) operator_2 = lambda x: x * 2 + 2 - 9 #def operate(e): return e * 2 + 2 - 9 def absolutly(x): if x < 0: return -1 * x else: return x def mod_3(x): return x % 3 mod_3(13) sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=mod_3) sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=lambda x: x % 3)

    Search Policy

    coordination_source = """ {name:'Lanzhou', geoCoord:[103.73, 36.03]}, {name:'Jiayuguan', geoCoord:[98.17, 39.47]}, {name:'Xining', geoCoord:[101.74, 36.56]}, {name:'Chengdu', geoCoord:[104.06, 30.67]}, {name:'Shijiazhuang', geoCoord:[114.48, 38.03]}, {name:'Lasa', geoCoord:[102.73, 25.04]}, {name:'Guiyang', geoCoord:[106.71, 26.57]}, {name:'Wuhan', geoCoord:[114.31, 30.52]}, {name:'Zhengzhou', geoCoord:[113.65, 34.76]}, {name:'Jinan', geoCoord:[117, 36.65]}, {name:'Nanjing', geoCoord:[118.78, 32.04]}, {name:'Hefei', geoCoord:[117.27, 31.86]}, {name:'Hangzhou', geoCoord:[120.19, 30.26]}, {name:'Nanchang', geoCoord:[115.89, 28.68]}, {name:'Fuzhou', geoCoord:[119.3, 26.08]}, {name:'Guangzhou', geoCoord:[113.23, 23.16]}, {name:'Changsha', geoCoord:[113, 28.21]}, //{name:'海口', geoCoord:[110.35, 20.02]}, {name:'Shengyang', geoCoord:[123.38, 41.8]}, {name:'Changchun', geoCoord:[125.35, 43.88]}, {name:'Haerbing', geoCoord:[126.63, 45.75]}, {name:'Taiyuan', geoCoord:[112.53, 37.87]}, {name:'Xian', geoCoord:[108.95, 34.27]}, //{name:'Taiwan', geoCoord:[121.30, 25.03]}, {name:'Beijing', geoCoord:[116.46, 39.92]}, {name:'Shanghai', geoCoord:[121.48, 31.22]}, {name:'Chongqing', geoCoord:[106.54, 29.59]}, {name:'Tianjing', geoCoord:[117.2, 39.13]}, {name:'Huhehaote', geoCoord:[111.65, 40.82]}, {name:'Nanning', geoCoord:[108.33, 22.84]}, //{name:'西藏', geoCoord:[91.11, 29.97]}, {name:'Yingchuan', geoCoord:[106.27, 38.47]}, {name:'Wulumuqi', geoCoord:[87.68, 43.77]}, {name:'Xianggang', geoCoord:[114.17, 22.28]}, {name:'Aomen', geoCoord:[113.54, 22.19]} """ geo_pattern = "{name:'(\w+)',\s+geoCoord:\[(\d+\.\d+),\s+(\d+\.\d+)\]}" city_location = {} for city, x, y in re.findall(geo_pattern, coordination_source): city_location[city] = (float(x), float(y)) city_location cities = {c : [] for c in city_location} nx.draw(nx.Graph(cities),city_location, with_labels=True, font_size=10, node_size=10) import math def geo_distance(origin, destination): """ Calculate the Haversine distance. Parameters ---------- origin : tuple of float (lat#纬度-北纬南纬-y, long#经度-东经西经-x) destination : tuple of float (lat, long) Returns ------- distance_in_km : float Examples -------- >>> origin = (48.1372, 11.5756) # Munich >>> destination = (52.5186, 13.4083) # Berlin >>> round(distance(origin, destination), 1) 504.2 """ #lat1, lon1 = origin lon1, lat1 = origin #lat2, lon2 = destination lon2, lat2 = destination radius = 6371 # km dlat = math.radians(lat2 - lat1) dlon = math.radians(lon2 - lon1) a = (math.sin(dlat / 2) * math.sin(dlat / 2) + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) * math.sin(dlon / 2)) c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) d = radius * c return d city_location['Beijing'] city_location['Tianjing'] city_location['Shengyang'] city_location['Shanghai'] geo_distance((116.46, 39.92), (117.2, 39.13))

    平面上: P1: [123.38, 41.8] P2: [121.48, 31.22]

    P1: [41.8, 123.38] P2: [31.22, 121.48]

    geo_distance((123.38, 41.8), (121.48, 31.22)) geo_distance((41.8, 123.38), (31.22, 121.48)) def get_city_distance(city1, city2): return geo_distance(city_location[city1], city_location[city2]) get_city_distance('Beijing', 'Shanghai') get_city_distance('Hangzhou', 'Shanghai') from collections import defaultdict threshold = 700 #city_connection = {c : [] for c in city_location} city_connection = defaultdict(list) for city1 in city_location: for city2 in city_location: if city1 is city2: continue distance = get_city_distance(city1, city2) if distance < threshold: city_connection[city1].append(city2) city_connection ## 正则表达式 import re some_text = 'this is a test text, I want some more tests; And I have some tested cases' 'test', 'tests', 'testedededed'

    + 一个或者多个字符

    * 指的是0个或者多个字符

    re == > regular expresson

    pattern = 'test\w*' pattern_2 = 'test\w+' re.findall(pattern_2, some_text) #re.findall(pattern, some_text) with_numbers_text = """ 电话:18910451234 年龄:32 性别:男 ID:90 电话:18910451234 年龄:32 性别:男 电话:18910451234 年龄:32 性别:男 电话:18910451234 年龄:32 性别:男 电话:189104513123234 年龄:32 性别:男 ID: 312 电话:18910451234 年龄:32 性别:男 电话:18989810451234 年龄:32 性别:男 STU_OD: 312 电话:01018910451234 年龄:3 性别:男 电话:18910451234 年龄:32 性别:男 电话:18910451234 年龄:32 性别:男 电话:18910451234 年龄:32 性别:男 """ with_constraint_num_pattern = '电话:(\d+) 年龄:(\d+)' re.findall(with_constraint_num_pattern, with_numbers_text) import networkx as nx relatinship = { 'Zhangsan': [], 'Li': [], 'Huang': [] } simple_graph = nx.Graph(relatinship) %matplotlib inline locations = { 'Zhangsan': (0, 0), 'Li': (1, 4), 'Wang': (9, 0), 'Liu': (19, 1), 'Gao': (10, 10), 'Huang': (11, 19), 'Hu': (18, -10) } nx.draw(simple_graph, locations, with_labels=True) city_graph = nx.Graph(city_connection) nx.draw(city_graph, city_location, with_labels=True)

    五子棋 -->

    象棋

    从郑州到嘉峪关为例

    广度优先搜索

    def bfs(graph, start, destination): # need_visit = [start] pathes = [[start]] seen = set() visited_cities = [] # while need_visit: while pathes: path = pathes.pop(0) # froniter = need_visit.pop(0) froniter = path[-1] if froniter in seen: continue for successor in graph[froniter]: if successor in path: continue new_path = path + [successor] if successor == destination: # 找到目的地的时候 visited_cities.append(froniter) visited_cities.append(successor) return new_path, visited_cities, pathes # 返回了 pathes.append(new_path) seen.add(froniter) visited_cities.append(froniter) return seen, visited_cities, pathes def best_fs(graph, start, destination, strategy): # need_visit = [start] pathes = [[start]] seen = set() visited_cities = [] # while need_visit: while pathes: path = pathes.pop(0) # froniter = need_visit.pop(0) froniter = path[-1] if froniter in seen: continue for successor in graph[froniter]: if successor in path: continue new_path = path + [successor] if successor == destination: # 找到目的地的时候 visited_cities.append(froniter) visited_cities.append(successor) return new_path, visited_cities, pathes # 返回了 pathes.append(new_path) seen.add(froniter) visited_cities.append(froniter) pathes = sorted(pathes, key=strategy) return seen, visited_cities, pathes p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', get_total_length) pretty_print(p) p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', lambda x: -len(x)) pretty_print(p) get_total_length(p) p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei') pretty_print(p) get_total_length(p) def pretty_print(cities): print('🚘->'.join(cities)) p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei') pretty_print(p) def get_total_length(path): distance = 0 for i, city in enumerate(path[:-1]): current, next_c = city, path[i+1] distance += get_city_distance(current, next_c) return distance depth_first_search(city_graph, 'Beijing', 'Hefei')

    深度优先算法

    def depth_first_search(graph, start, destination): # need_visit = [start] pathes = [[start]] seen = set() visited_cities = [] # while need_visit: while pathes: # path = pathes.pop(0) # 把以前的老路先走完 path = pathes.pop(-1) # 取最新的路径 # froniter = need_visit.pop(0) froniter = path[-1] if froniter in seen: continue for successor in graph[froniter]: if successor in path: continue new_path = path + [successor] if successor == destination: # 找到目的地的时候 visited_cities.append(froniter) visited_cities.append(successor) return new_path, visited_cities, pathes # 返回了 pathes.append(new_path) seen.add(froniter) visited_cities.append(froniter) return seen, visited_cities, pathes path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan') path pathes color_map = ['green' for _ in city_graph] def visited_procedure(graph, postion, visited_order, step): changed = visited_order[:step] if step else visited_order color_map = ['green' if c in changed else 'red' for c in graph] nx.draw(graph, postion, node_color=color_map, with_labels=True) path, visited_cities_bfs, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan') visited_procedure(city_graph, city_location, visited_cities_bfs, step=20) path, visited_cities_dfs, pathes = depth_first_search(city_connection, 'Zhengzhou', 'Jiayuguan') visited_procedure(city_graph, city_location, visited_cities_dfs, step=18) path

    找到了从郑州到嘉峪关的路

    path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan') visited_procedure(city_graph, city_location, visited_cities, step=None)

    爬虫、搜索引擎与BFS,DFS的关系

    如果你要做一个爬虫程序,你应该用BFS还是DFS?

    import requests https://github.com/Autobicycle-Never-Blocking/Beloved-Little-Moto/blob/master/crawl.py

    作业:从https://ditie.mapbar.com/beijing/开始

    爬虫正则表达式数据预处理BFSBest-First-Search -> 实现北京地铁的换乘方案 url = 'https://ditie.mapbar.com/beijing/' response = requests.get(url) pattern = """<a href="(http://ditie\.mapbar\.com/beijing/\w+/)" style="background-color:#\w+" target="_blank">(\w+)</a>""" re.findall(pattern, response.text) a_station = 'https://ditie.mapbar.com/beijing/line_ditiebatongxian/' response = requests.get(a_station) pattern_s = '<li\s*.*><a class="cl\-station" href="http://ditie\.mapbar\.com/beijing/station_\w+/" target="_blank" title="\w+">(\w+)</a></li>' re.findall(pattern_s, response.text) { '八通线'[], '一号线'[], }
    Processed: 0.021, SQL: 12