import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
# Train a decision tree on the samples in film.csv and predict the label of a
# hand-encoded sample.
#
# CSV layout as consumed below: the first row holds the column headers; in
# every data row the first column is ignored, the last column is the class
# label, and the columns in between are categorical features.

# The context manager guarantees the file is closed even if parsing fails.
# newline='' is the mode the csv module documents for reader input.
with open('film.csv', 'rt', newline='') as film_data:
    reader = csv.reader(film_data)
    headers = next(reader)
    print(headers)

    feature_list = []
    result_list = []
    for row in reader:
        # csv.reader yields [] for blank lines; indexing them would raise.
        if not row:
            continue
        result_list.append(row[-1])
        # Map header name -> cell value for the feature columns only.
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))
print(result_list, feature_list)

# One-hot encode the categorical features into a dense 2-D array.
vec = DictVectorizer()
dummyX = vec.fit_transform(feature_list).toarray()
# Note from the original author: the array has one column per distinct
# feature value, so it grows as the number of distinct values grows. For the
# original sample data the first four columns encode the country, the middle
# two the box office, and the last three the film type; rows passed to
# predict() must use the same layout.
#
# Example dummyX recorded by the original author:
# [[0. 0. 0. 1. 0. 1. 0. 1. 0.]
#  [1. 0. 0. 0. 0. 1. 0. 0. 1.]
#  [1. 0. 0. 0. 0. 1. 0. 1. 0.]
#  [1. 0. 0. 0. 1. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 1. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0. 1. 0. 1. 0.]
#  [0. 0. 1. 0. 0. 1. 0. 0. 1.]
#  [0. 1. 0. 0. 0. 1. 1. 0. 0.]]

# Binarize the class labels.
dummyY = preprocessing.LabelBinarizer().fit_transform(result_list)
# Example dummyY recorded by the original author:
# [[1]
#  [1]
#  [1]
#  [1]
#  [1]
#  [1]
#  [0]
#  [0]]
print(dummyX)
print(dummyY)

# Entropy criterion; random_state is fixed so repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
clf = clf.fit(dummyX, dummyY)

# Hand-encoded samples; each row must follow the column order of dummyX.
# Only A is predicted below; B and C are kept as alternative inputs.
A = ([[0, 0, 0, 1, 0, 1, 0, 1, 0]])
B = ([[0, 0, 1, 0, 0, 1, 0, 1, 0]])
C = ([[1, 0, 0, 0, 1, 0, 1, 0, 0]])
predict_result = clf.predict(A)
print('预测结果' + str(predict_result))
# Please credit the original source when reposting: https://ipadbbs.8miu.com/read-41069.html