Lesson-02
深度优先与广度优先爬虫的原理机器学习的原理
OOV ?
Out of Vocabulary
Lambda
import random
"""adj* => adj* adj | adj null"""
def adj(): return random.choice('蓝色的 | 好看的 | 小小的'.split('|')).split()[0]
give_adjs = lambda : adj() + adj_star_2()
give_null = lambda : ''
def adj_star_2():
return random.choice([give_null, give_adjs])()
def adj_star():
return random.choice(['', adj() + adj_star()])
adj_star_2()
def simple_func(x, y):
return x + y
results = []
for e in [1, 3, 5, 7, 9, 10]:
results.append(operate(e))
results
for i in map(lambda x: x * 2 + 2 - 9, [1, 3, 5, 7, 9, 10]):
print(i)
operator_2 = lambda x: x * 2 + 2 - 9
def absolutly(x):
if x < 0: return -1 * x
else:
return x
def mod_3(x): return x % 3
mod_3(13)
sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=mod_3)
sorted([1, 3, 4, 1, 5, 1, 1, 4, 9, -1, 9, -19, 1], key=lambda x: x % 3)
Search Policy
coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""
geo_pattern = "{name:'(\w+)',\s+geoCoord:\[(\d+\.\d+),\s+(\d+\.\d+)\]}"
city_location = {}
for city, x, y in re.findall(geo_pattern, coordination_source):
city_location[city] = (float(x), float(y))
city_location
cities = {c : [] for c in city_location}
nx.draw(nx.Graph(cities),city_location, with_labels=True, font_size=10, node_size=10)
import math
def geo_distance(origin, destination):
"""
Calculate the Haversine distance.
Parameters
----------
origin : tuple of float
(lat#纬度-北纬南纬-y, long#经度-东经西经-x)
destination : tuple of float
(lat, long)
Returns
-------
distance_in_km : float
Examples
--------
>>> origin = (48.1372, 11.5756) # Munich
>>> destination = (52.5186, 13.4083) # Berlin
>>> round(distance(origin, destination), 1)
504.2
"""
lon1, lat1 = origin
lon2, lat2 = destination
radius = 6371
dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon / 2) * math.sin(dlon / 2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
d = radius * c
return d
city_location['Beijing']
city_location['Tianjing']
city_location['Shengyang']
city_location['Shanghai']
geo_distance((116.46, 39.92), (117.2, 39.13))
平面上: P1: [123.38, 41.8] P2: [121.48, 31.22]
P1: [41.8, 123.38] P2: [31.22, 121.48]
geo_distance((123.38, 41.8), (121.48, 31.22))
geo_distance((41.8, 123.38), (31.22, 121.48))
def get_city_distance(city1, city2):
return geo_distance(city_location[city1], city_location[city2])
get_city_distance('Beijing', 'Shanghai')
get_city_distance('Hangzhou', 'Shanghai')
from collections import defaultdict
threshold = 700
city_connection = defaultdict(list)
for city1 in city_location:
for city2 in city_location:
if city1 is city2: continue
distance = get_city_distance(city1, city2)
if distance < threshold:
city_connection[city1].append(city2)
city_connection
import re
some_text = 'this is a test text, I want some more tests; And I have some tested cases'
'test', 'tests', 'testedededed'
+ 一个或者多个字符
* 指的是0个或者多个字符
re == > regular expresson
pattern = 'test\w*'
pattern_2 = 'test\w+'
re.findall(pattern_2, some_text)
with_numbers_text = """
电话:18910451234 年龄:32 性别:男 ID:90
电话:18910451234 年龄:32 性别:男
电话:18910451234 年龄:32 性别:男
电话:18910451234 年龄:32 性别:男
电话:189104513123234 年龄:32 性别:男 ID: 312
电话:18910451234 年龄:32 性别:男
电话:18989810451234 年龄:32 性别:男 STU_OD: 312
电话:01018910451234 年龄:3 性别:男
电话:18910451234 年龄:32 性别:男
电话:18910451234 年龄:32 性别:男
电话:18910451234 年龄:32 性别:男
"""
with_constraint_num_pattern = '电话:(\d+) 年龄:(\d+)'
re.findall(with_constraint_num_pattern, with_numbers_text)
import networkx as nx
relatinship = {
'Zhangsan': [],
'Li': [],
'Huang': []
}
simple_graph = nx.Graph(relatinship)
%matplotlib inline
locations = {
'Zhangsan': (0, 0),
'Li': (1, 4),
'Wang': (9, 0),
'Liu': (19, 1),
'Gao': (10, 10),
'Huang': (11, 19),
'Hu': (18, -10)
}
nx.draw(simple_graph, locations, with_labels=True)
city_graph = nx.Graph(city_connection)
nx.draw(city_graph, city_location, with_labels=True)
五子棋 -->
象棋
从郑州到嘉峪关为例
广度优先搜索
def bfs(graph, start, destination):
pathes = [[start]]
seen = set()
visited_cities = []
while pathes:
path = pathes.pop(0)
froniter = path[-1]
if froniter in seen: continue
for successor in graph[froniter]:
if successor in path: continue
new_path = path + [successor]
if successor == destination:
visited_cities.append(froniter)
visited_cities.append(successor)
return new_path, visited_cities, pathes
pathes.append(new_path)
seen.add(froniter)
visited_cities.append(froniter)
return seen, visited_cities, pathes
def best_fs(graph, start, destination, strategy):
pathes = [[start]]
seen = set()
visited_cities = []
while pathes:
path = pathes.pop(0)
froniter = path[-1]
if froniter in seen: continue
for successor in graph[froniter]:
if successor in path: continue
new_path = path + [successor]
if successor == destination:
visited_cities.append(froniter)
visited_cities.append(successor)
return new_path, visited_cities, pathes
pathes.append(new_path)
seen.add(froniter)
visited_cities.append(froniter)
pathes = sorted(pathes, key=strategy)
return seen, visited_cities, pathes
p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', get_total_length)
pretty_print(p)
p, v, ps = best_fs(city_graph, 'Haerbing', 'Hefei', lambda x: -len(x))
pretty_print(p)
get_total_length(p)
p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei')
pretty_print(p)
get_total_length(p)
def pretty_print(cities):
print('🚘->'.join(cities))
p, v, ps = bfs(city_graph, 'Haerbing', 'Hefei')
pretty_print(p)
def get_total_length(path):
distance = 0
for i, city in enumerate(path[:-1]):
current, next_c = city, path[i+1]
distance += get_city_distance(current, next_c)
return distance
depth_first_search(city_graph, 'Beijing', 'Hefei')
深度优先算法
def depth_first_search(graph, start, destination):
pathes = [[start]]
seen = set()
visited_cities = []
while pathes:
path = pathes.pop(-1)
froniter = path[-1]
if froniter in seen: continue
for successor in graph[froniter]:
if successor in path: continue
new_path = path + [successor]
if successor == destination:
visited_cities.append(froniter)
visited_cities.append(successor)
return new_path, visited_cities, pathes
pathes.append(new_path)
seen.add(froniter)
visited_cities.append(froniter)
return seen, visited_cities, pathes
path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')
path
pathes
color_map = ['green' for _ in city_graph]
def visited_procedure(graph, postion, visited_order, step):
changed = visited_order[:step] if step else visited_order
color_map = ['green' if c in changed else 'red' for c in graph]
nx.draw(graph, postion, node_color=color_map, with_labels=True)
path, visited_cities_bfs, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')
visited_procedure(city_graph, city_location, visited_cities_bfs, step=20)
path, visited_cities_dfs, pathes = depth_first_search(city_connection, 'Zhengzhou', 'Jiayuguan')
visited_procedure(city_graph, city_location, visited_cities_dfs, step=18)
path
找到了从郑州到嘉峪关的路
path, visited_cities, pathes = search(city_connection, 'Zhengzhou', 'Jiayuguan')
visited_procedure(city_graph, city_location, visited_cities, step=None)
爬虫、搜索引擎与BFS,DFS的关系
如果你要做一个爬虫程序,你应该用BFS还是DFS?
import requests
https://github.com/Autobicycle-Never-Blocking/Beloved-Little-Moto/blob/master/crawl.py
作业:从https://ditie.mapbar.com/beijing/开始
爬虫正则表达式数据预处理BFSBest-First-Search -> 实现北京地铁的换乘方案
url = 'https://ditie.mapbar.com/beijing/'
response = requests.get(url)
pattern = """<a href="(http://ditie\.mapbar\.com/beijing/\w+/)" style="background-color:#\w+" target="_blank">(\w+)</a>"""
re.findall(pattern, response.text)
a_station = 'https://ditie.mapbar.com/beijing/line_ditiebatongxian/'
response = requests.get(a_station)
pattern_s = '<li\s*.*><a class="cl\-station" href="http://ditie\.mapbar\.com/beijing/station_\w+/" target="_blank" title="\w+">(\w+)</a></li>'
re.findall(pattern_s, response.text)
{
'八通线':[],
'一号线':[],
}
转载请注明原文地址:https://ipadbbs.8miu.com/read-4695.html