import random
# A toy generative grammar: 句子 (sentence) = subject + verb + object.
# Each line is "symbol = alternative | alternative | ...".
sentence = """
句子 = 主 谓 宾
主 = 你 | 我 | 他
谓 = 吃 | 喝
宾 = 橘子 | 汽水 | 茶
"""
input: 根据这个语法定义,能够生成句子
def sentence():
    """Build one sentence by concatenating a random subject, verb and object."""
    return 主语() + 谓语() + 宾语()
def 主语():
    """Return a random subject word.

    BUG FIX: splitting '你 | 我 | 他' on '|' leaves the surrounding spaces
    in each token ('你 ', ' 我 ', ' 他'); strip() removes them.
    """
    return random.choice('你 | 我 | 他'.split('|')).strip()
def 谓语():
    """Return a random verb.

    BUG FIX: splitting '吃 | 喝' on '|' leaves the surrounding spaces in each
    token ('吃 ', ' 喝'); strip() removes them.
    """
    return random.choice('吃 | 喝'.split('|')).strip()
def 宾语():
    """Return a random object word.

    BUG FIX: splitting '橘子 | 汽水 | 茶' on '|' leaves the surrounding
    spaces in each token; strip() removes them.
    """
    return random.choice('橘子 | 汽水 | 茶'.split('|')).strip()
# Demo: print ten sentences generated by the hand-written grammar functions.
for _ in range(10):
    print(sentence())
# A recursive grammar for digit strings: "numbers" is one digit followed by
# more digits, or a single digit.
two_number = """
numbers = num numbers | num
num = 0 | 1 | 2 | 3
"""
def num():
    """Return one random digit as a string.

    BUG FIX: splitting ' 0 | 1 | 2 | 3' on '|' leaves surrounding spaces in
    each token (' 0 ', ' 1 ', ...); strip() removes them.
    """
    return random.choice(' 0 | 1 | 2 | 3'.split('|')).strip()
def numbers():
    """Produce a random digit string: ~30% chance to stop after each digit."""
    if random.random() < 0.3:
        return num()
    # otherwise emit one digit and recurse for the rest
    return num() + numbers()
# One warm-up call, then print ten random digit strings.
numbers()
for _ in range(10):
    print(numbers())
怎样在问题场景变化(语法变化)的情况下,让你的程序不需要重写?
# Grammar for simple arithmetic expressions over multi-digit numbers,
# using '=>' as the rule delimiter.
numbers_ops = """
expression => expression op num_op | num_op
num_op => nums op nums
nums => num nums | num
num => 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
op => + | - | * | /
"""
def generate_grammer(grammer_str: str, split='=>'):
    """Parse a grammar description into a dict {symbol: [expansion, ...]}.

    Each non-blank line of `grammer_str` has the form
    "symbol <split> alt1 | alt2 | ...", and every alternative becomes a
    list of whitespace-separated tokens.

    BUG FIXES:
    - lines containing only whitespace used to crash the unpacking; they
      are now skipped like empty lines;
    - `maxsplit=1` keeps a delimiter appearing inside the right-hand side
      from raising "too many values to unpack".
    """
    grammar = {}
    for line in grammer_str.split('\n'):
        if not line.strip():
            continue
        expr, formula = line.split(split, 1)
        grammar[expr.strip()] = [alt.split() for alt in formula.split('|')]
    return grammar
def generate_by_grammer(grammar: dict, target: str = 'expression'):
    """Recursively expand `target` with `grammar`; unknown symbols are terminals.

    BUG FIX: the original default was `target=str` — the *type object* —
    so a call without `target` returned the class `str` instead of text.
    The default is now the string 'expression'.
    """
    if target not in grammar:
        return target  # terminal symbol: emit as-is
    expr = random.choice(grammar[target])
    return ''.join(generate_by_grammer(grammar, t) for t in expr)
# BUG FIX: the original called generate_by_grammer(grammar, ...) but no
# module-level `grammar` exists at this point, raising NameError.
# Build the grammar dict from numbers_ops first.
generate_by_grammer(generate_grammer(numbers_ops), target='expression')
def generate_by_str(grammar_str, target, spliter='=>'):
    """Convenience wrapper: parse `grammar_str`, then generate one `target`."""
    parsed = generate_grammer(grammar_str, spliter)
    return generate_by_grammer(grammar=parsed, target=target)
# Same sentence grammar as before, rewritten with the '=>' delimiter so the
# generic parser can consume it.
sentence = """
句子 => 主 谓 宾
主 => 你 | 我 | 他
谓 => 吃 | 喝
宾 => 橘子 | 汽水 | 茶
"""
# Demo: one arithmetic expression, then ten sentences from the '=>' grammar.
generate_by_str(numbers_ops, target='expression')
for _ in range(10):
    print(generate_by_grammer(generate_grammer(sentence), target='句子'))
# Grammar: a person saying they are looking for something fun to do.
human = """
human = 自己 寻找 活动
自己 = 我 | 俺 | 我们
寻找 = 找找 | 想找点
活动 = 乐子 | 玩的
"""
# Grammar: conditional sentences of the form "if someone <state>, then do <something>".
假如既然 = """
句子 = if someone state , then do something
if = 既然 | 如果 | 假设
someone = one 和 someone | one
one = 小红 | 小蓝 | 小绿 | 白白
state = 饿了 | 醒了 | 醉了 | 癫狂了
then = 那么 | 就 | 可以
do = 去
something = 吃饭 | 玩耍 | 去浪 | 睡觉
"""
# Grammar: a service host's greeting — salutation, agent number, then a
# business question ending with 吗?.
host = """
host = 寒暄 报数 询问 业务相关 结尾
报数 = 我是 数字 号 ,
数字 = 单个数字 | 数字 单个数字
单个数字 = 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
寒暄 = 称谓 打招呼 | 打招呼
称谓 = 人称 ,
人称 = 先生 | 女士 | 小朋友
打招呼 = 你好 | 您好
询问 = 请问你要 | 您需要
业务相关 = 具体业务
具体业务 = 喝酒 | 打牌 | 打猎 | 赌博
结尾 = 吗?
"""
# Sample five utterances from each grammar (these use '=' as the delimiter).
for _ in range(5):
    print(generate_by_str(human, 'human', '='))
for _ in range(5):
    print(generate_by_str(假如既然, '句子', '='))
for _ in range(5):
    print(generate_by_str(host, 'host', '='))
Data Driven
# Grammar of a toy programming language (assignments, if/while, conditions).
# NOTE(review): the misspelled name `simpel_programming` is kept — later
# cells reference it.
# BUG FIX: the `op` rule started with an empty alternative ("op => | == ..."),
# which could generate conditions with a missing operator; it is removed.
simpel_programming = '''
programming => assignment logic_programming
assignment => assign change_line assignment | assign change_line
logic_programming => if_stmt | assign | while_loop
while_loop => while { cond } { change_line stmt change_line }
if_stmt => if { cond } { change_line stmt change_line } | if { cond } { change_line stmt change_line } else { change_line stmt change_line }
change_line => /N
cond => var op var
op => == | < | >= | <= | && | **
stmt => assign | if_stmt
assign => var = var
var => var _ num | words
words => words _ word | word
word => name | info | student | lib | database
nums => nums num | num
num => 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0
'''
# Generate one random program from the toy programming-language grammar.
generate_by_str(simpel_programming, 'programming', spliter='=>')
def pretty_print(line):
    """Split generated code on the '/N' line marker and indent the lines in a
    pyramid shape: indentation grows one space per line over the first half,
    then shrinks back over the second half. Returns the list of lines."""
    lines = line.split('/N')
    half = len(lines) / 2
    result = []
    for idx, stmt in enumerate(lines):
        indent = idx if idx < half else len(lines) - idx
        result.append(indent * " " + stmt)
    return result
# Collect twenty pretty-printed random programs into one flat list of lines.
generated_programming = []
for _ in range(20):
    generated_programming += pretty_print(
        generate_by_str(simpel_programming, target='programming', spliter='=>'))
generated_programming
计算机如何判断,哪句话说得最对?
10分钟之后继续~ 21:42分
形式语言 自然语言
俺想找点玩的
俺想找点玩的
我找找玩的
我们想找点玩的
俺想找点玩的
如果白白和白白醒了
,那么去睡觉
既然小绿醉了
,那么去玩耍
假设白白和白白醒了
,可以去睡觉
假设小红醒了
,就去睡觉
如果小红和白白和白白和白白和小蓝和小蓝醉了
,就去玩耍
你好我是
34758626号
,请问你要打猎吗?
女士
,您好我是
552号
,请问你要打牌吗?
女士
,您好我是
654号
,请问你要打猎吗?
你好我是
638号
,您需要打牌吗?
您好我是
22号
,您需要打猎吗?
# `!ls` is an IPython shell magic, not valid Python — kept here as a comment.
# !ls

# Load the 9k-article corpus into one string.
# NOTE(review): the misspelled name ARTILES ("articles") is kept because
# later cells reference it.
# BUG FIX: the original open() never closed the file and relied on the
# platform default encoding; a context manager and explicit UTF-8 fix both.
with open('article_9k.txt', encoding='utf-8') as f:
    ARTILES = f.read()
len(ARTILES)
ARTILES[:100]
import jieba
def cut(string):
    """Tokenize `string` into a list of words with jieba."""
    return list(jieba.cut(string))

# quick sanity check on a short sentence
cut('我是一只小狗')
# Tokenize the entire corpus, then peek at the first ten tokens.
ALL_TOKENS = cut(ARTILES)
ALL_TOKENS[:10]
from collections
import Counter
# Token frequency table; inspect the 100 most common words.
word_counts = Counter(ALL_TOKENS)
word_counts.most_common(100)
# Frequencies of the 100 most common tokens, in descending order.
frequencies = [f for w, f in word_counts.most_common(100)]
# Rank axis 0..99.
# IDIOM FIX: list(range(100)) replaces the redundant [i for i in range(100)].
x = list(range(100))
# %matplotlib inline   (IPython notebook magic — not valid plain Python,
# so it is kept as a comment)
import matplotlib.pyplot as plt

# Word-frequency curve of the top-100 tokens; roughly Zipf-distributed.
plt.plot(x, frequencies)
只要你的数据量足够大,那么出现次数第 N 多的单词,其出现频率约是出现次数最多的单词的频率的 1/N(Zipf 定律)
语言模型
$language\text{-}model = Pr(sentence)$
1-gram
# The 500 most frequent words, and the words ranked 500..49999.
most_common_500 = [w for w, _ in word_counts.most_common(500)]
most_common_to_10000_100500 = [w for w, _ in word_counts.most_common(50000)][500:]
# Ten random words from the 500 most frequent tokens.
''.join(random.sample(most_common_500, k=10))
# BUG FIX: the original referenced undefined `most_common_500_to_1000`
# (NameError); the list defined above is named `most_common_to_10000_100500`.
''.join(random.sample(most_common_to_10000_100500, k=10))
1-gram
我衡量一个句子的概率,我就简单说,这个句子的概率,就等于每个单词的概率,乘起来!
$ Pr(sentence) = Pr(w_1 w_2 \dots w_n)$
$ Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 w_3 \dots w_n)$ $ Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n)\,Pr(w_3 \dots w_n)$ $ Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n) \dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$
Pr(AB) = Pr(A|B)Pr(B)
Pr(A|B) = Pr(A) => A 和 B 两者无关
Pr(你明天上班迟到 | 南非总统今天喝牛奶) != Pr(你明天上班迟到)
Pr(你明天上班迟到 | 南非总统今天喝牛奶) == Pr(你明天上班迟到)
$ Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n) \dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$
$ one\text{-}gram = Pr(w_1 w_2 \dots w_n) = Pr(w_1)\,Pr(w_2) \dots Pr(w_{n-1})\,Pr(w_n)$
# Smallest observed token count — used to smooth unseen words in prob_1.
# IDIOM FIX: min(word_counts.values()) avoids building the whole
# most_common() list just to take a minimum.
min_frequences = min(word_counts.values())
def prob_1(word):
    """Unigram probability of `word`: observed count / corpus size.
    Unseen words are smoothed with the smallest observed count."""
    count = word_counts[word] if word in word_counts else min_frequences
    return count / len(ALL_TOKENS)
# Compare unigram probabilities of two frequent words.
prob_1('中国')
prob_1('美国')
from functools
import reduce
from operator
import mul
# Demo: reduce + mul computes the product of a list (1*2*3*4*5*6*8 == 5760).
reduce(mul, [1, 2, 3, 4, 5, 6, 8])
def _1_gram(sentence):
    """Unigram LM score: product of per-word probabilities of the cut sentence.

    ROBUSTNESS FIX: the initial value 1 makes an empty sentence return 1
    instead of reduce() raising TypeError on an empty sequence.
    """
    words = cut(sentence)
    return reduce(mul, (prob_1(w) for w in words), 1)
# A grammatical sentence should score higher than a nonsensical one.
_1_gram('我今天回家')
_1_gram('我今天西瓜')
$ Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n) \dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$
Pr(你 | 今天) = count(你今天) / count(今天)
$ two\text{-}gram = Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2)\,Pr(w_2 \mid w_3) \dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$
$ two\text{-}gram = Pr(w_1 w_2 \dots w_n) = \prod_i^n \frac{count(w_i w_{i+1})}{count(w_{i+1})}\,Pr(w_n)$
# All adjacent token pairs, each joined into a single string.
# BUG FIX: the original used range(len(ALL_TOKENS[:-2])), which (a) copied
# the whole list just to take its length and (b) stopped one pair early,
# dropping the final bigram. range(len(ALL_TOKENS) - 1) covers every
# adjacent pair exactly once.
TOKEN_2_GRAM = [''.join(ALL_TOKENS[i:i + 2]) for i in range(len(ALL_TOKENS) - 1)]
# Size sanity checks, a peek at the bigrams, then the bigram frequency table.
len(ALL_TOKENS)
len(TOKEN_2_GRAM)
TOKEN_2_GRAM[10:]
len(TOKEN_2_GRAM)
word_count_2 = Counter(TOKEN_2_GRAM)
$ two\text{-}gram = Pr(w_1 w_2 \dots w_n) = \prod_i^n \frac{count(w_i w_{i+1})}{count(w_{i+1})}\,Pr(w_n)$
def prob_2(word1, word2):
    """Estimate Pr(word1 | word2) as count(word1 + word2) / count(word2).

    Unseen bigrams fall back to a uniform 1 / vocabulary-size probability.
    ROBUSTNESS FIX: also fall back when count(word2) is 0 — bigrams are
    joined without a separator, so distinct pairs can collide and the
    original could raise ZeroDivisionError in that edge case.
    """
    combine = word1 + word2
    if combine in word_count_2 and word_counts[word2]:
        return word_count_2[combine] / word_counts[word2]
    return 1 / len(word_counts)
def _2_gram(sentence):
    """Bigram LM score: product of Pr(w_i | w_{i+1}) over adjacent word pairs,
    times the unigram probability of the last word.

    ROBUSTNESS FIX: an empty sentence now returns 1 instead of raising
    IndexError on words[-1].
    """
    words = cut(sentence)
    if not words:
        return 1
    prob = 1
    # zip pairs each word with its successor (idiomatic adjacent-pair loop)
    for word, next_word in zip(words, words[1:]):
        prob *= prob_2(word, next_word)
    return prob * prob_1(words[-1])
# A real collocation ('火箭') should score higher than an implausible one ('窗帘').
_2_gram('中国发射了一枚火箭')
_2_gram('中国发射了一枚窗帘')
# Sentence pairs: in each pair the first sentence is the more plausible one
# and should receive the higher 2-gram probability.
need_compared = [
    ('今天晚上请你吃大餐,我们一起吃日料', '今天晚上请你吃大餐,我们一起吃苹果'),
    ('真是一只好看的小猫', '真事一只好看的小猫'),
    ('今晚我去吃火锅', '今晚火锅去吃我'),
    ('洋葱奶昔来一杯', '养乐多绿来一杯')
]
# Print the 2-gram probability of both sentences in every comparison pair.
for s1, s2 in need_compared:
    print('Pr({}) = {}'.format(s1, _2_gram(s1)))
    print('Pr({}) = {}'.format(s2, _2_gram(s2)))
1. 数据思维 Data Driven
2. 数学模型 -> 编程实现
More Data, Better Result