参数criterion的应用:
请尝试执行下面代码框中的内容,并思考代码的运行结果:
(1)导入算法库和模块
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
(2)探索数据
# Load the wine dataset and inspect its pieces. The bare expressions below
# have no effect in a plain script; they are meant to be evaluated
# interactively (presumably one per notebook cell) to display their value.
wine = load_wine() # load the dataset
wine.data.shape
wine.feature_names # feature names
wine.target
wine.target_names # three wine classes in total
(3)构造所需数据集
# Combine features and labels into a single 2-D array.
# wine.target.reshape(-1, 1) turns the 1-D label vector into a single column
# so it can be concatenated to the right of the feature matrix.
data = np.concatenate((wine.data, wine.target.reshape(-1, 1)), axis=1)
# Build the column index on a COPY of the feature names. Appending to
# wine.feature_names in place would mutate the loaded dataset, so re-running
# this cell would add a second "label" and make the DataFrame construction
# fail with a column-count mismatch.
wine_df = pd.DataFrame(data=data, columns=[*wine.feature_names, "label"])
# Replace the English column names with their Chinese equivalents
# (13 features followed by the label column).
wine_df.columns = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸','标签']
wine_df.head()
(4)划分训练集和测试集
# Every column except the last is a feature; the last column is the label.
features = wine_df.iloc[:, :-1]
labels = wine_df.iloc[:, -1]
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    features, labels, test_size=0.3, random_state=420
)
Xtrain.shape
Xtest.shape
(5)初步建立决策树
# Create a decision-tree classifier that uses the "gini" split criterion and
# fit it on the training split in one step (fit returns the estimator).
# NOTE(review): no random_state is passed to the classifier, so the score
# below may differ between runs — confirm whether that is intended here.
clf = tree.DecisionTreeClassifier(criterion="gini").fit(Xtrain, Ytrain)
clf.score(Xtest, Ytest)  # accuracy of the predictions on the test split