Problem
When we draw a networkx graph with Chinese labels, the Chinese characters fail to display.
Solution
Download the font SimHei.ttf from the GitHub repository, then run the following in a Jupyter notebook:

import matplotlib
print(matplotlib.__path__)

This prints matplotlib's install path. cd into that path, then continue into mpl-data/fonts/ttf, and replace DejaVuSans.ttf with the file we just downloaded:

$ mv SimHei.ttf DejaVuSans.ttf

The TTF font used here is uploaded as an attachment.
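If you would rather not overwrite matplotlib's bundled font, an alternative is to tell matplotlib to prefer SimHei directly. A minimal sketch, assuming SimHei is installed somewhere matplotlib's font manager can find it:

import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # prefer SimHei for CJK glyphs
matplotlib.rcParams['axes.unicode_minus'] = False    # avoid broken minus signs with CJK fonts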
Previous Course
What is a language model? The OOV problem? Syntax trees. lambda.
A language model assigns a probability $Pr(w)$ to a word sequence $w$.
Out of Vocabulary (OOV): $Pr(w_o) = \text{constant} < Pr(w_1)$, i.e. every unseen word $w_o$ gets a small constant probability, lower than that of any word seen in training.
Language Model: Smoothing
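As a refresher, a minimal sketch of what smoothing does. With add-one (Laplace) smoothing, an unseen word gets a small constant probability instead of zero; the toy corpus and the pr name below are illustrative, not from the course code:

from collections import Counter

corpus = '我们 在 北京 我们 在 上海'.split()
counts = Counter(corpus)

def pr(word):
    # add-one smoothing: (count + 1) / (N + V)
    return (counts[word] + 1) / (len(corpus) + len(counts))

pr('我们')  # a seen word
pr('深圳')  # an OOV word: a small constant, lower than any seen word's probability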
import random

# Grammar: adj* => adj adj* | null
def adj(): return random.choice('蓝色的 | 好看的 | 小小的'.split('|')).split()[0]

def adj_star_2():
    # Wrap both branches in lambdas so the recursive branch is only
    # evaluated if it is actually chosen; otherwise it never terminates.
    return random.choice([lambda: '', lambda: adj() + adj_star_2()])()

def adj_star():
    # The original version had a typo (ajd) and evaluated adj() + adj_star()
    # eagerly inside the list, which recurses forever; defer it as above.
    return random.choice([lambda: '', lambda: adj() + adj_star()])()

lambda: adj() + adj_star_2()
numbers = [1, -2, -4, 1, 5, 6, 9, -11]

def absolute(number):
    if number < 0: return -1 * number
    else:
        return number

absolute(-11)

def mod_5(number):
    return number % 5

sorted(numbers, key=lambda x: x % 5)

mod_5_lambda = lambda x: x % 5
mod_5
mod_5_lambda(19)
mod_5(19)
adj_star_2()
coordination_source = """
{name:'Lanzhou', geoCoord:[103.73, 36.03]},
{name:'Jiayuguan', geoCoord:[98.17, 39.47]},
{name:'Xining', geoCoord:[101.74, 36.56]},
{name:'Chengdu', geoCoord:[104.06, 30.67]},
{name:'Shijiazhuang', geoCoord:[114.48, 38.03]},
{name:'Lasa', geoCoord:[102.73, 25.04]},
{name:'Guiyang', geoCoord:[106.71, 26.57]},
{name:'Wuhan', geoCoord:[114.31, 30.52]},
{name:'Zhengzhou', geoCoord:[113.65, 34.76]},
{name:'Jinan', geoCoord:[117, 36.65]},
{name:'Nanjing', geoCoord:[118.78, 32.04]},
{name:'Hefei', geoCoord:[117.27, 31.86]},
{name:'Hangzhou', geoCoord:[120.19, 30.26]},
{name:'Nanchang', geoCoord:[115.89, 28.68]},
{name:'Fuzhou', geoCoord:[119.3, 26.08]},
{name:'Guangzhou', geoCoord:[113.23, 23.16]},
{name:'Changsha', geoCoord:[113, 28.21]},
//{name:'海口', geoCoord:[110.35, 20.02]},
{name:'Shengyang', geoCoord:[123.38, 41.8]},
{name:'Changchun', geoCoord:[125.35, 43.88]},
{name:'Haerbing', geoCoord:[126.63, 45.75]},
{name:'Taiyuan', geoCoord:[112.53, 37.87]},
{name:'Xian', geoCoord:[108.95, 34.27]},
//{name:'Taiwan', geoCoord:[121.30, 25.03]},
{name:'Beijing', geoCoord:[116.46, 39.92]},
{name:'Shanghai', geoCoord:[121.48, 31.22]},
{name:'Chongqing', geoCoord:[106.54, 29.59]},
{name:'Tianjing', geoCoord:[117.2, 39.13]},
{name:'Huhehaote', geoCoord:[111.65, 40.82]},
{name:'Nanning', geoCoord:[108.33, 22.84]},
//{name:'西藏', geoCoord:[91.11, 29.97]},
{name:'Yingchuan', geoCoord:[106.27, 38.47]},
{name:'Wulumuqi', geoCoord:[87.68, 43.77]},
{name:'Xianggang', geoCoord:[114.17, 22.28]},
{name:'Aomen', geoCoord:[113.54, 22.19]}
"""
# seed the dict; the parsing loop below fills in the rest
city_location = {
    '香港': (114.17, 22.28)
}
Input: String -> dict
test_string = "{name:'兰州', geoCoord:[103.73, 36.03]},"

import re
# Note: the unescaped dots in (\d+.\d+) also let integer coordinates such as
# 117 (Jinan) match; escaping them as \d+\.\d+ would skip those lines.
pattern = re.compile(r"name:'(\w+)',\s+geoCoord:\[(\d+.\d+),\s(\d+.\d+)\]")

for line in coordination_source.split('\n'):
    city_info = pattern.findall(line)
    if not city_info: continue
    city, long, lat = city_info[0]
    long, lat = float(long), float(lat)
    city_location[city] = (long, lat)

city_location
import math

def geo_distance(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_km : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(geo_distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # Earth radius in km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c
    return d

def get_geo_distance(city1, city2):
    # city_location stores (longitude, latitude) but geo_distance expects
    # (latitude, longitude), so swap the coordinates before calling.
    long1, lat1 = city_location[city1]
    long2, lat2 = city_location[city2]
    return geo_distance((lat1, long1), (lat2, long2))

get_geo_distance('Shanghai', 'Hangzhou')
import networkx as nx

city_graph = nx.Graph()
city_graph.add_nodes_from(list(city_location.keys()))

%matplotlib inline
import matplotlib.pyplot as plt

city_location
# city_location doubles as the layout: nx.draw accepts a dict of
# node -> (x, y) positions, so cities appear at their coordinates.
nx.draw(city_graph, city_location, with_labels=True, node_size=30)
threshold = 300  # km: connect cities closer than this

from collections import defaultdict

city_connection = defaultdict(list)
for c1 in city_location:
    for c2 in city_location:
        if c1 == c2: continue
        distance = get_geo_distance(c1, c2)
        if distance < threshold:
            # the double loop visits each pair in both orders, so appending
            # one direction here is enough to keep the connection symmetric
            city_connection[c1].append(c2)
city_connection

city_with_road = nx.Graph(city_connection)
nx.draw(city_with_road, city_location, with_labels=True, node_size=30)
from collections import defaultdict

simple_connection_info_src = {
    '北京': ['太原', '沈阳'],
    '太原': ['北京', '西安', '郑州'],
    '兰州': ['西安'],
    '郑州': ['太原'],
    '西安': ['兰州', '长沙'],
    '长沙': ['福州', '南宁'],
    '沈阳': ['北京']
}

simple_connection_info = defaultdict(list)
simple_connection_info.update(simple_connection_info_src)
def bfs(graph, start):
    """
    breadth-first search
    """
    visited = [start]   # the frontier of nodes still to expand
    seen = set()
    while visited:
        frontier = visited.pop()             # take from one end...
        if frontier in seen: continue
        for successor in graph[frontier]:
            if successor in seen: continue
            print(successor)
            visited = [successor] + visited  # ...prepend new nodes: a FIFO queue
        seen.add(frontier)
    return seen
number_graph = defaultdict(list)
number_graph.update({
    1: [2, 3],
    2: [1, 4],
    3: [1, 5],
    4: [2, 6],
    5: [3, 7],
    7: [5, 8]
})

bfs(number_graph, 1)

simple_connection_info['西安']
nx.draw(nx.Graph(simple_connection_info), city_location, with_labels=True, node_size=10)
def search(start, destination, connection_graph, sort_candidate=lambda paths: paths):
    # sort_candidate defaults to "no preference" so search() can also be
    # called without a strategy, as in the examples below.
    paths = [[start]]
    visited = set()
    while paths:
        path = paths.pop(0)
        frontier = path[-1]
        if frontier in visited: continue
        successors = connection_graph[frontier]
        for city in successors:
            if city in path: continue  # avoid loops
            new_path = path + [city]
            paths.append(new_path)
            if city == destination: return new_path
        visited.add(frontier)
        paths = sort_candidate(paths)  # the strategy decides which path to expand next
def transfer_stations_first(paths):
    # fewest stops first: expand shorter paths earlier
    return sorted(paths, key=len)

def transfer_as_much_possible(paths):
    # as many stops as possible: expand longer paths earlier
    return sorted(paths, key=len, reverse=True)

def shortest_path_first(paths):
    if len(paths) <= 1: return paths
    def get_path_distance(path):
        # total length of the path: sum the distance of each consecutive leg
        distance = 0
        for i in range(len(path) - 1):
            distance += get_geo_distance(path[i], path[i + 1])
        return distance
    return sorted(paths, key=get_path_distance)
search('兰州', '福州', simple_connection_info, sort_candidate=shortest_path_first)

def pretty_print(cities):
    print('🚗->'.join(cities))

pretty_print(search('北京', '福州', simple_connection_info))
Breadth-first search
Depth-first search
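bfs above pops nodes from one end of the list and prepends new ones, which makes the list a FIFO queue. A minimal sketch of the depth-first counterpart (a dfs name of our own, not from the course code): append instead of prepend, turning the list into a stack.

def dfs(graph, start):
    visited = [start]
    seen = set()
    while visited:
        frontier = visited.pop()           # LIFO: always expand the newest node
        if frontier in seen: continue
        for successor in graph[frontier]:
            if successor in seen: continue
            print(successor)
            visited.append(successor)      # push onto the stack
        seen.add(frontier)
    return seen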
# city_connection is keyed by the pinyin names parsed from coordination_source,
# so the queries here use the pinyin spellings
pretty_print(search('Beijing', 'Nanjing', city_connection))
pretty_print(search('Beijing', 'Changsha', city_connection))
pretty_print(search('Beijing', 'Guangzhou', city_connection, sort_candidate=transfer_stations_first))
pretty_print(search('Beijing', 'Guangzhou', city_connection, sort_candidate=transfer_as_much_possible))
Web Crawlers and BFS
import requests
from lxml import etree
from config import config
from utils.common import get_header
from utils.db_utils import insert
from collections import Counter

class LaGou(object):
    def __init__(self, keyword, city, type):
        self.keyword = keyword
        self.city = city
        self.type = type
        self.baseurl = 'https://www.lagou.com/jobs/positionAjax.json'
        self.header = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www.lagou.com/jobs/list_运维?city=成都&cl=false&fromSearch=true&labelWords=&suginput=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
    def spider(self):
        expanded_skills = []
        max_page = 10
        for i in range(1, max_page):
            # visit the listing page first to obtain valid session cookies
            s = requests.Session()
            s.get(
                url='https://www.lagou.com/jobs/list_运维?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                headers=get_header(), timeout=3)
            cookie = s.cookies
            res = requests.post(self.baseurl, headers=self.header,
                                data={'first': True, 'pn': i, 'kd': self.keyword},
                                params={'px': 'default', 'city': self.city, 'needAddtionalResult': 'false'},
                                cookies=cookie, timeout=3)
            text = res.json()
            all_data = text['content']['positionResult']['result']
            for data in all_data:
                s = requests.Session()
                s.get(
                    url='https://www.lagou.com/jobs/list_运维?city=北京&cl=false&fromSearch=true&labelWords=&suginput=',
                    headers=get_header(), timeout=3)
                cookie1 = s.cookies
                url = 'https://www.lagou.com/jobs/' + str(data.get('positionId')) + '.html'
                req1 = requests.get(url, headers=self.header, cookies=cookie1)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                detail = ''.join(html.xpath('//*[@class="job-detail"]//*/text()')).strip()
                if detail.isspace():
                    detail = ''.join(html.xpath('//*[@class="job-detail"]/text()')).strip()
                related_skills = data.get('skillLables')
                data_dict = {
                    "firstType": str(data.get('firstType')),
                    "secondType": str(data.get('secondType')),
                    "thirdType": str(data.get('thirdType')),
                    "city": str(data.get("city")),
                    "positionName": str(data.get('positionName')),
                    "district": str(data.get('district')),
                    "stationname": str(data.get('stationname')),
                    "jobNature": str(data.get('jobNature')),
                    "companyLabelList": str(data.get('companyLabelList')),
                    "industryField": str(data.get('industryField')),
                    "salary": str(data.get('salary')),
                    "companySize": str(data.get('companySize')),
                    "skillLables": str(related_skills),
                    "createTime": str(data.get('createTime')),
                    "companyFullName": str(data.get('companyFullName')),
                    "workYear": str(data.get('workYear')),
                    "education": str(data.get('education')),
                    "positionAdvantage": str(data.get('positionAdvantage')),
                    "url": str(url),
                    "detail": str(detail),
                    "type": str(self.type),
                    "latitude": str(data.get("latitude")),
                    "longitude": str(data.get("longitude")),
                    "keyword": str(self.keyword),
                }
                expanded_skills += related_skills
                if not insert('jobs', **data_dict):
                    continue
        return [s.lower() for s in expanded_skills]
def lagou_worker(city):
    _, position, init_job = config()
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)   # FIFO: breadth-first over keywords
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job in visited_jobs:
            continue
        type = ''
        for k, v in position.items():
            if search_job in v:
                type = k
        new_expanded = LaGou(keyword=search_job, city=city, type=type).spider()
        # expand the frontier with the 5 most common related skills
        expanded_counter = Counter(new_expanded).most_common(n=5)
        new_jobs = [j for j, n in expanded_counter]
        init_job += new_jobs
        visited_jobs.add(search_job)
if __name__ == '__main__':
    init_job = ['人工智能', '测试', '运维', '交互设计', '数据产品经理', '原画师', '动画师', '区块链', '产品经理', '用户运营', '数据运营']
    visited_jobs = set()
    while init_job:
        search_job = init_job.pop(0)
        print('We need to search {}, now search {}'.format(init_job, search_job))
        if search_job in visited_jobs: continue
        new_expanded = LaGou(keyword=search_job, city='全国', type='产品线').spider()
        expanded_counter = Counter(new_expanded).most_common(n=5)
        new_jobs = [j for j, n in expanded_counter]
        init_job += new_jobs
        visited_jobs.add(search_job)
        print(search_job)
Machine Learning
from sklearn.datasets import load_boston

data = load_boston()
X, y = data['data'], data['target']
X[1]
y[1]
len(X[:, 0])
len(y)

%matplotlib inline
import matplotlib.pyplot as plt

def draw_rm_and_price():
    # column 5 of the Boston dataset is RM, the average number of rooms
    plt.scatter(X[:, 5], y)

draw_rm_and_price()
import random

def price(rm, k, b):
    """f(x) = k * x + b"""
    return k * rm + b

X_rm = X[:, 5]
k = random.randint(-100, 100)
b = random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]

draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
We write the model's prediction as $\hat{y}$.

list(y)
price_by_random_k_and_b

How far are predictions such as [2, 2, 2] from targets such as [1, 1, 1]? We measure this with a loss:
$$loss = \frac{1}{n} \sum{(y_i - \hat{y_i})}^2$$
$$loss = \frac{1}{n} \sum{(y_i - (kx_i + b))}^2$$
$$\frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum(y_i - (kx_i + b))x_i$$
$$\frac{\partial{loss}}{\partial{k}} = -\frac{2}{n}\sum(y_i - \hat{y_i})x_i$$
$$\frac{\partial{loss}}{\partial{b}} = -\frac{2}{n}\sum(y_i - \hat{y_i})$$
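As a quick check of the two derivatives (a sketch that assumes sympy is available; not part of the course code), differentiate a single term of the loss symbolically:

import sympy

k, b, x_i, y_i = sympy.symbols('k b x_i y_i')
term = (y_i - (k * x_i + b))**2
sympy.diff(term, k)  # equals -2*x_i*(y_i - k*x_i - b): matches the formula above up to the 1/n factor
sympy.diff(term, b)  # equals -2*(y_i - k*x_i - b)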
def loss(y, y_hat):
    # mean squared error between targets y and predictions y_hat
    return sum((y_i - y_hat_i)**2 for y_i, y_hat_i in zip(list(y), list(y_hat))) / len(list(y))
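A quick worked check of this function: for y = [1, 1, 1] and y_hat = [2, 2, 3] the squared errors are 1, 1 and 4, so the mean is 2.0 (the same call appears again at the end of the notebook):

loss([1, 1, 1], [2, 2, 3])  # ((1-2)**2 + (1-2)**2 + (1-3)**2) / 3 == 2.0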
1st-Method: Random Generation: try random k and b, keep the best
trying_times = 2000
min_loss = float('inf')
best_k, best_b = None, None

for i in range(trying_times):
    k = random.random() * 200 - 100
    b = random.random() * 200 - 100
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    current_loss = loss(y, price_by_random_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        best_k, best_b = k, b
        print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
10 ** 0.5
X_rm = X[:, 5]
k = 15
b = -68
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
2nd-Method: Direction Adjusting
trying_times = 2000
min_loss = float('inf')
best_k = random.random() * 200 - 100
best_b = random.random() * 200 - 100

# the four possible directions in which (k, b) can move
direction = [
    (+1, -1),
    (+1, +1),
    (-1, -1),
    (-1, +1),
]
next_direction = random.choice(direction)
scalar = 0.1
update_time = 0

for i in range(trying_times):
    k_direction, b_direction = next_direction
    current_k, current_b = best_k + k_direction * scalar, best_b + b_direction * scalar
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        best_k, best_b = current_k, current_b
        next_direction = next_direction  # keep moving in the direction that worked
        update_time += 1
        if update_time % 10 == 0:
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, best_k, best_b, min_loss))
    else:
        next_direction = random.choice(direction)  # otherwise try a new random direction
If we want faster updates, i.e. better results in less time, we need one thing:
to find the right direction of change.
How do we find the right direction?
3rd-Method: supervise the change -> supervised learning
Derivatives
def partial_k(x, y, y_hat):
    n = len(y)
    gradient = 0
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2 / n * gradient

def partial_b(x, y, y_hat):
    n = len(y)
    gradient = 0
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    return -2 / n * gradient
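To convince ourselves that partial_k and partial_b implement the formulas correctly, a finite-difference check (a sketch of ours, not course code; eps is an illustrative step size):

def numeric_partial_k(x, y, k, b, eps=1e-6):
    # central-difference approximation of d loss / d k at (k, b)
    up   = loss(y, [price(x_i, k + eps, b) for x_i in x])
    down = loss(y, [price(x_i, k - eps, b) for x_i in x])
    return (up - down) / (2 * eps)

k0, b0 = 10, -30
y_hat0 = [price(x_i, k0, b0) for x_i in X_rm]
print(partial_k(X_rm, y, y_hat0), numeric_partial_k(X_rm, y, k0, b0))  # should agree closely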
from icecream import ic

trying_times = 2000
X, y = data['data'], data['target']
min_loss = float('inf')
current_k = random.random() * 200 - 100
current_b = random.random() * 200 - 100
learning_rate = 1e-04
update_time = 0

for i in range(trying_times):
    price_by_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    current_loss = loss(y, price_by_k_and_b)
    if current_loss < min_loss:
        min_loss = current_loss
        if i % 50 == 0:
            print('When time is : {}, get best_k: {} best_b: {}, and the loss is: {}'.format(i, current_k, current_b, min_loss))
    # step against the gradient, scaled by the learning rate
    k_gradient = partial_k(X_rm, y, price_by_k_and_b)
    b_gradient = partial_b(X_rm, y, price_by_k_and_b)
    current_k = current_k + (-1 * k_gradient) * learning_rate
    current_b = current_b + (-1 * b_gradient) * learning_rate
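Since X_rm and y are numpy arrays from sklearn, the same loop can be vectorized; a sketch (the learning rate lr here is an illustrative choice, not tuned in the course):

# X_rm and y are already numpy arrays, so the gradient sums vectorize directly
k_v, b_v = 0.0, 0.0
lr = 1e-4  # illustrative learning rate
for _ in range(trying_times):
    y_hat = k_v * X_rm + b_v
    k_v += lr * (2 / len(y)) * ((y - y_hat) * X_rm).sum()  # step against d loss / d k
    b_v += lr * (2 / len(y)) * (y - y_hat).sum()           # step against d loss / d b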
X_rm = X[:, 5]
# plug in the k and b found by the gradient-descent loop above
k = 11.431551629413757
b = -49.52403584539048
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
draw_rm_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
loss([1, 1, 1], [2, 2, 3])
data['feature_names']