lesson-01-course-code

    技术2022-07-10  113

    import random sentence = """ 句子 = 主 谓 宾 主 = 你 | 我 | 他 谓 = 吃 | 喝 宾 = 橘子 | 汽水 | 茶 """

    input: 根据这个语法定义,能够生成句子

    def sentence(): return 主语() + 谓语() + 宾语() def 主语(): return random.choice('你 | 我 | 他'.split('|')) def 谓语(): return random.choice('吃 | 喝'.split('|')) def 宾语(): return random.choice('橘子 | 汽水 | 茶'.split('|')) for _ in range(10): print(sentence()) two_number = """ numbers = num numbers | num num = 0 | 1 | 2 | 3 """ # def numbers_with_bug(): # return random.choice([num() + numbers_with_bug(), num()]) def num(): return random.choice(' 0 | 1 | 2 | 3'.split('|')) def numbers(): if random.random() < 0.3: return num() else: return num() + numbers() numbers() for _ in range(10): print(numbers())

    怎么样在问题场景变化(语法变化)的情况下,你的程序不需要重新写?

    numbers_ops = """ expression => expression op num_op | num_op num_op => nums op nums nums => num nums | num num => 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 op => + | - | * | / """ # 3 + 4 / 6 * 9 def generate_grammer(grammer_str: str, split='=>'): grammar = {} for line in grammer_str.split('\n'): if not line: continue expr, formula = line.split(split) formulas = formula.split('|') formulas = [f.split() for f in formulas] grammar[expr.strip()] = formulas return grammar def generate_by_grammer(grammar: dict, target=str): if target not in grammar: return target # 是一个终结符 expr = random.choice(grammar[target]) return ''.join(generate_by_grammer(grammar, t) for t in expr) generate_by_grammer(grammar, target='expression') def generate_by_str(grammar_str, target, spliter='=>'): return generate_by_grammer(grammar=generate_grammer(grammar_str, spliter), target=target) sentence = """ 句子 => 主 谓 宾 主 => 你 | 我 | 他 谓 => 吃 | 喝 宾 => 橘子 | 汽水 | 茶 """ generate_by_str(numbers_ops, target='expression') for _ in range(10): print(generate_by_grammer(generate_grammer(sentence), target='句子')) #在西部世界里,一个”人类“的语言可以定义为: human = """ human = 自己 寻找 活动 自己 = 我 | 俺 | 我们 寻找 = 找找 | 想找点 活动 = 乐子 | 玩的 """ 假如既然 = """ 句子 = if someone state , then do something if = 既然 | 如果 | 假设 someone = one 和 someone | one one = 小红 | 小蓝 | 小绿 | 白白 state = 饿了 | 醒了 | 醉了 | 癫狂了 then = 那么 | 就 | 可以 do = 去 something = 吃饭 | 玩耍 | 去浪 | 睡觉 """ #一个“接待员”的语言可以定义为 host = """ host = 寒暄 报数 询问 业务相关 结尾 报数 = 我是 数字 号 , 数字 = 单个数字 | 数字 单个数字 单个数字 = 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 寒暄 = 称谓 打招呼 | 打招呼 称谓 = 人称 , 人称 = 先生 | 女士 | 小朋友 打招呼 = 你好 | 您好 询问 = 请问你要 | 您需要 业务相关 = 具体业务 具体业务 = 喝酒 | 打牌 | 打猎 | 赌博 结尾 = 吗? """ for _ in range(5): print(generate_by_str(human, 'human', '=')) for _ in range(5): print(generate_by_str(假如既然, '句子', '=')) for _ in range(5): print(generate_by_str(host, 'host', '='))

    Data Driven

    simpel_programming = ''' programming => assignment logic_programming assignment => assign change_line assignment | assign change_line logic_programming => if_stmt | assign | while_loop while_loop => while { cond } { change_line stmt change_line } if_stmt => if { cond } { change_line stmt change_line } | if { cond } { change_line stmt change_line } else { change_line stmt change_line } change_line => /N cond => var op var op => | == | < | >= | <= | && | ** stmt => assign | if_stmt assign => var = var var => var _ num | words words => words _ word | word word => name | info | student | lib | database nums => nums num | num num => 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 ''' generate_by_str(simpel_programming, 'programming', spliter='=>') def pretty_print(line): # utility tool function lines = line.split('/N') code_lines = [] for i, sen in enumerate(lines): if i < len(lines) / 2: #print() code_lines.append(i * " " + sen) else: code_lines.append((len(lines) - i) * " " + sen) return code_lines generated_programming = [] for i in range(20): generated_programming += pretty_print(generate_by_str(simpel_programming, target='programming', spliter='=>')) generated_programming

    计算机如何判断,那句话说得最对?

    10分钟之后继续~ 21:42分

    形式语言 自然语言

    俺想找点玩的 俺想找点玩的 我找找玩的 我们想找点玩的 俺想找点玩的 如果白白和白白醒了,那么去睡觉 既然小绿醉了,那么去玩耍 假设白白和白白醒了,可以去睡觉 假设小红醒了,就去睡觉 如果小红和白白和白白和白白和小蓝和小蓝醉了,就去玩耍 你好我是34758626,请问你要打猎吗? 女士,您好我是552,请问你要打牌吗? 女士,您好我是654,请问你要打猎吗? 你好我是638,您需要打牌吗? 您好我是22,您需要打猎吗? !ls ARTILES = open('article_9k.txt').read() len(ARTILES) ARTILES[:100] import jieba def cut(string): return list(jieba.cut(string)) cut('我是一只小狗') ALL_TOKENS = cut(ARTILES) ALL_TOKENS[:10] from collections import Counter word_counts = Counter(ALL_TOKENS) word_counts.most_common(100) frequencies = [f for w, f in word_counts.most_common(100)] x = [i for i in range(100)] %matplotlib inline import matplotlib.pyplot as plt plt.plot(x, frequencies)

    只要你的数据量足够大,那么出现次数第 N 多的单词,其频率约为出现次数最多的单词频率的 1/N(Zipf 定律)

    语言模型

    $\text{language-model} = Pr(\text{sentence})$

    1-gram

    most_common_500 = [w for w, f in word_counts.most_common(500)] most_common_to_10000_100500 = [w for w, f in word_counts.most_common(50000)][500:] ''.join(random.sample(most_common_500, k=10)) ''.join(random.sample(most_common_500_to_1000, k=10))

    1-gram

    我衡量一个句子的概率,我就简单说,这个句子的概率,就等于每个单词的概率,乘起来!

    $Pr(\text{sentence}) = Pr(w_1 w_2 \dots w_n)$

    $Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 w_3 \dots w_n)$ $Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n)\,Pr(w_3 \dots w_n)$ $Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n)\dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$

    Pr(AB) = Pr(A|B)Pr(B)

    Pr(A|B) = Pr(A) => A 和 B 两者无关

    Pr(你明天上班迟到 | 南非总统今天喝牛奶) != Pr(你明天上班迟到)

    Pr(你明天上班迟到 | 南非总统今天喝牛奶) == Pr(你明天上班迟到)

    $Pr(w_1 w_2 \dots w_n) = Pr(w_1 \mid w_2 w_3 \dots w_n)\,Pr(w_2 \mid w_3 \dots w_n)\dots Pr(w_{n-1} \mid w_n)\,Pr(w_n)$

    $\text{one-gram}:\; Pr(w_1 w_2 \dots w_n) = Pr(w_1)\,Pr(w_2)\dots Pr(w_{n-1})\,Pr(w_n)$

    min_frequences= min([f for w, f in word_counts.most_common()]) def prob_1(word): # out of vocabulary if word in word_counts: return word_counts[word] / len(ALL_TOKENS) else: return min_frequences / len(ALL_TOKENS) prob_1('中国') prob_1('美国') from functools import reduce from operator import mul reduce(mul, [1, 2, 3, 4, 5, 6, 8]) def _1_gram(sentence): words = cut(sentence) return reduce(mul, [prob_1(w) for w in words]) _1_gram('我今天回家') _1_gram('我今天西瓜') $ Pr(w1 w2 .. wn) = Pr(w1 | w_2 w_3 .. w_n)Pr(w_2 | w_3 .. w_n).. Pr( w_{n-1} | w_n)Pr(w_n)$ Pr(| 今天) = count(你今天) / count(今天) $ two-gram = Pr(w1*w2 .. wn) = Pr(w1 | w_2)Pr(w_2 | w_3).. Pr( w_{n-1} | w_n)Pr(w_n)$ $ two-gram = Pr(w1*w2 .. wn) = \prod_i^n \frac{count(w_iw_{i+1})}{count(w_{i+1})} Pr(w_n)$ TOKEN_2_GRAM = [''.join(ALL_TOKENS[i:i+2]) for i in range(len(ALL_TOKENS[:-2]))] len(ALL_TOKENS) len(TOKEN_2_GRAM) TOKEN_2_GRAM[10:] len(TOKEN_2_GRAM) word_count_2 = Counter(TOKEN_2_GRAM) $ two-gram = Pr(w1*w2 .. wn) = \prod_i^n \frac{count(w_iw_{i+1})}{count(w_{i+1})} Pr(w_n)$ def prob_2(word1, word2): combine = word1 + word2 if combine in word_count_2: return word_count_2[combine] / word_counts[word2] else: # out of vocabulary return 1 / len(word_counts) def _2_gram(sentence): words = cut(sentence) prob = 1 for i in range(len(words)-1): word, next_word = words[i], words[i+1] prob *= prob_2(word, next_word) prob *= prob_1(words[-1]) return prob _2_gram('中国发射了一枚火箭') _2_gram('中国发射了一枚窗帘') need_compared = [ ('今天晚上请你吃大餐,我们一起吃日料', '今天晚上请你吃大餐,我们一起吃苹果'), ('真是一只好看的小猫', '真事一只好看的小猫'), ('今晚我去吃火锅', '今晚火锅去吃我'), ('洋葱奶昔来一杯', '养乐多绿来一杯') ] for s1, s2 in need_compared: print('Pr({}) = {}'.format(s1, _2_gram(s1))) print('Pr({}) = {}'.format(s2, _2_gram(s2)))

    1. 数据思维 Data Driven

    2. 数学模型 -> 编程实现

    More Data, Better Result

    Processed: 0.012, SQL: 9