通过使用sklearn决策树，简单练习案例分析

技术2023-03-24 98

# Predict whether to go and watch a film from its type, gross and country,
# using a scikit-learn decision tree trained on film.csv.
import csv

# Samples to classify, written as feature dicts so that the fitted
# DictVectorizer builds the one-hot row. The original hand-written vectors
# silently depended on the vectorizer's column order (four country columns,
# then two gross columns, then three type columns).
# NOTE: 'Janan' is spelled exactly as in the data set; a value the vectorizer
# has not seen during fitting is ignored and encodes to all zeros.
SAMPLE_A = {'country': 'Janan', 'gross': 'low', 'type': 'anime'}
# Alternative samples to try in place of SAMPLE_A.
SAMPLE_B = {'country': 'France', 'gross': 'low', 'type': 'anime'}
SAMPLE_C = {'country': 'America', 'gross': 'high', 'type': 'action'}


def read_samples(lines):
    """Parse CSV text into the header row, feature dicts and labels.

    The first column of every row is skipped, the last column is the label,
    and the columns in between become one ``{header: value}`` dict per row.

    Args:
        lines: An iterable of CSV lines, e.g. an open text file.

    Returns:
        A ``(headers, feature_list, result_list)`` tuple.

    Raises:
        ValueError: If ``lines`` yields no header row.
    """
    reader = csv.reader(lines)
    headers = next(reader, None)
    if headers is None:
        raise ValueError('data set is empty: missing header row')

    feature_names = headers[1:-1]
    feature_list = []
    result_list = []
    for row in reader:
        # csv.reader yields [] for blank lines (e.g. a trailing newline).
        if not row:
            continue
        result_list.append(row[-1])
        feature_list.append(dict(zip(feature_names, row[1:-1])))
    return headers, feature_list, result_list


def main():
    """Train the decision tree on film.csv and print a prediction for SAMPLE_A."""
    # Imported here so that read_samples() stays usable without scikit-learn.
    from sklearn import preprocessing
    from sklearn import tree
    from sklearn.feature_extraction import DictVectorizer

    # The context manager closes the file; newline='' is what csv recommends.
    with open('film.csv', 'rt', newline='') as film_data:
        headers, feature_list, result_list = read_samples(film_data)

    # Show the header row, then the labels and the feature dicts.
    print(headers)
    print(result_list, feature_list)

    # One-hot encode the categorical feature dicts into a dense 2-D array.
    # The array gets one column per distinct (feature, value) pair, so it
    # grows with the number of distinct values.
    vec = DictVectorizer()
    dummyX = vec.fit_transform(feature_list).toarray()

    # Binarize the labels into a column of 0/1 values.
    dummyY = preprocessing.LabelBinarizer().fit_transform(result_list)
    print(dummyX)
    print(dummyY)

    # Train the decision tree using the entropy criterion.
    clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
    clf = clf.fit(dummyX, dummyY)

    # Encode the sample with the fitted vectorizer so that its columns are
    # guaranteed to line up with the training matrix.
    sample = vec.transform([SAMPLE_A]).toarray()
    predict_result = clf.predict(sample)
    print('预测结果' + str(predict_result))


if __name__ == '__main__':
    main()

Processed: 0.019, SQL: 9