1. re 库的三个函数:findall、sub、split
2.pandas 库
# 2. pandas demo: building DataFrames and inspecting them.
import pandas as pd
import numpy as np

# A 6x4 frame of random normals, indexed by six consecutive dates.
dates = pd.date_range('20180310', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)       # the full 6-row, 4-column table
print(df['B'])  # one column as a Series (Freq: D, Name: B, dtype: float64)

# A DataFrame from a dict of heterogeneous columns; scalar values broadcast
# to the frame's length (4 rows, taken from the Series in column 'C').
df_1 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20180310'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df_1)
# sample output:
#      A          B    C  D      E    F
# 0  1.0 2018-03-10  1.0  3   test  foo
# 1  1.0 2018-03-10  1.0  3  train  foo
# 2  1.0 2018-03-10  1.0  3   test  foo
# 3  1.0 2018-03-10  1.0  3  train  foo

print(df_1.dtypes)    # per-column dtypes: float64, datetime64[ns], float32, int32, category, object
print(df_1.index)     # row labels: Int64Index([0, 1, 2, 3], dtype='int64')
print(df_1.columns)   # column labels: Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df_1.values)    # every cell as a raw 2-D array
print(df_1.describe())  # summary statistics for the numeric columns only (A, C, D)
print(df_1.T)           # transpose: columns become rows

# axis=1 sorts by COLUMN label; ascending=False reverses the order (F E D C B A).
print(df_1.sort_index(axis=1, ascending=False))

# Sort rows by the values in column 'E' ('test' rows before 'train' rows).
print(df_1.sort_values(by='E'))

# 3. json
# 3. json: converting between Python dicts, JSON strings, and files.
import json

# JSON data: the literal syntax mirrors a Python dict.
data = {
    "第一个key": "第一个value",
    "第二个key": "第二个value"
}
print('原生json数据', data)

# dumps: object -> str, convenient for saving to a file or sending over the wire.
# Note: non-ASCII characters are escaped as \uXXXX unless ensure_ascii=False is passed.
data_str = json.dumps(data)
print('json转成str', data_str)

# loads: str -> object, convenient for use from Python code.
data_json = json.loads(data_str)
print('从str转成json', data_json)

# dump: write an object straight into an open file.
with open('json.txt', 'w') as f:
    json.dump(data_json, f)

# load: read an object straight out of an open file.
with open('json.txt', 'r') as f:
    data_json_exchange = json.load(f)
print('从文件中获得json数据', data_json_exchange)

# FIX: the sample console run below had been pasted into the source as bare
# text (a syntax error); it is preserved here as a comment.
# 原生json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
# json转成str {"\u7b2c\u4e00\u4e2akey": "\u7b2c\u4e00\u4e2avalue", "\u7b2c\u4e8c\u4e2akey": "\u7b2c\u4e8c\u4e2avalue"}
# 从str转成json {'第一个key': '第一个value', '第二个key': '第二个value'}
# 从文件中获得json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
# Process finished with exit code 0

# 4. gensim word-vector library
# 4. gensim Word2Vec demo: training, similarity lookup, clustering, 3-D plot.
# NOTE(review): this targets the gensim 3.x API — in gensim 4.x `size=` became
# `vector_size=`, `model[word]` / `model.similarity` moved to `model.wv[...]` /
# `model.wv.similarity`, and `wv.index2word` became `wv.index_to_key`.
# Confirm the installed gensim version before running.
from gensim.models import Word2Vec
from random import choice

# Two hand-made "tokenized sentences" used as templates.
temp = [
    ['用来', '测试', '的', '分词', '之后', '的', '第一', '句', '话'],
    ['我', '随便', '写', '的', '一', '句', '话'],
]

# Fake corpus: 1500 sentences, each 6 words drawn at random from one template.
ls_of_words = []
for _ in range(1500):
    ls = choice(temp)
    ls_of_words.append([choice(ls) for _ in range(6)])

# Train with default hyper-parameters; the input corpus is the main knob here.
model = Word2Vec(ls_of_words)
print(model.similar_by_word('用来'))     # words most similar to '用来'
print(model.similarity('用来', '测试'))  # pairwise similarity score

# ---- word-vector clustering and visualisation ----
ls_of_ls = [
    ['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
    ['文本', '数据', '挖掘', '分析', '做', '玩'],
    ['佛山', '广州', '南海', '天河', '吃', '玩'],
]

# Another fake corpus (a stand-in for jieba.lcut output): 2500 sentences.
ls_of_words = []
for _ in range(2500):
    words = choice(ls_of_ls)
    ls_of_words.append([choice(words) for _ in range(6)])

# 3-dimensional vectors (so they can be plotted directly), context window 7.
model = Word2Vec(ls_of_words, size=3, window=7)

# Density-based clustering of the learned word vectors.
from sklearn.cluster import DBSCAN
vectors = [model[word] for word in model.wv.index2word]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_

# 3-D scatter plot, one colour per cluster label (-1 = noise -> 'black').
import matplotlib
from mpl_toolkits import mplot3d
import matplotlib.pyplot as mp

mp.rcParams['font.sans-serif'] = ['SimHei']        # render CJK text
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign
fig = mp.figure()
ax = mplot3d.Axes3D(fig)  # create 3-D axes
colors = ['red', 'blue', 'green', 'black']
for word, vector, label in zip(model.wv.index2word, vectors, labels):
    ax.scatter(vector[0], vector[1], vector[2], c=colors[label], s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()

# 5. collections
# 5. collections: handy container types from the standard library.
import collections

# Counter: a multiset — counts occurrences of each element.
print(collections.Counter('abcdeabcdabcaba'))

# deque: a double-ended queue with O(1) appends/pops at BOTH ends.
q = collections.deque(['a', 'b', 'c'])
q.append('x')      # push on the right
q.appendleft('y')  # push on the left
print(q)           # deque(['y', 'a', 'b', 'c', 'x'])

# defaultdict: supplies a default value when a missing key is read.
dic = collections.defaultdict(lambda: 'N/A')
dic['k1'] = 'abc'
print(dic['k1'])  # 'abc'
print(dic['k2'])  # 'N/A' (missing key -> factory result)

# Plain dicts compare equal regardless of insertion order...
print('Normal Dictionary:')
d = {}
d['age'] = 'v2'
d['job'] = 'v3'
d1 = {}
d1['job'] = 'v3'
d1['age'] = 'v2'
print(d == d1)  # True

# ...but OrderedDict equality is order-sensitive.
print('OrderedDict:')
# FIX: the original called bare OrderedDict() without importing it,
# raising NameError; qualify it with the already-imported module.
d2 = collections.OrderedDict()
d2['age'] = 'v2'
d2['job'] = 'v3'
d3 = collections.OrderedDict()
d3['job'] = 'v3'
d3['age'] = 'v2'
print(d2 == d3)  # False (same items, different order)