[Machine Learning] Decision Tree
Decision Tree Classifier
Characteristics
- Tries a split on every feature and keeps the feature that separates the classes best.
- Greedy algorithm: chooses the best split available at each step.
Loss function
- A decision tree uses impurity as its loss function (see the sketch below).
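As a minimal illustration (not from the original notes), Gini impurity for a node with class proportions p_k is 1 - sum_k p_k^2:
import numpy as np

def gini(y):
    # Gini impurity: 1 - sum of squared class proportions
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

print(gini(np.array([0, 0, 1, 1])))  # 0.5 -> maximally impure for two classes
print(gini(np.array([0, 0, 0, 0])))  # 0.0 -> pure node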
- Iris dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data[:, :2]  # use only the first two features: sepal length, sepal width
y = iris.target
X.shape, y.shape
((150, 2), (150,))
plt.scatter(X[:,0], X[:,1], c=y)
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)
clf.score(X, y)
0.7733333333333333
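Note that clf.score(X, y) above is accuracy on the training data, which tends to be optimistic. A hedged sketch of a fairer estimate with 5-fold cross-validation:
from sklearn.model_selection import cross_val_score

# cross-validated accuracy instead of training accuracy
scores = cross_val_score(DecisionTreeClassifier(max_depth=2), X, y, cv=5)
print(scores.mean())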
Plotting Decision Trees
- using tree.plot_tree
- using graphviz library
from sklearn import tree
tree.plot_tree(clf, filled=True)  # filled=True colors each node by its majority class
Installing Graphviz
- Download and install from the graphviz.org site
- Windows version:
- https://graphviz.gitlab.io/_pages/Download/Download_windows.html
# !pip install graphviz, or
# conda install python-graphviz (in a cmd window)
# note that the above two commands install the graphviz library in different places.
- export_graphviz(): Export a decision tree in DOT format. This function generates a GraphViz representation of the decision tree, which is then written into out_file
- dot file: DOT is a graph description language. DOT graphs are typically files with the filename extension .gv or .dot; you can inspect one as shown below.
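To see what the DOT format actually looks like, one can simply print the file generated by export_graphviz further down (a usage sketch building on that export):
# inspect the generated DOT source describing the fitted tree
print(open("./iris.dot").read())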
# if Graphviz has a PATH problem when running:
# - right-click My PC -> Properties -> Advanced system settings -> Environment Variables (System variables) -> add the Graphviz path to Path
from sklearn.tree import export_graphviz
import graphviz
export_graphviz(
    clf,
    out_file="./iris.dot",
    feature_names=iris.feature_names[:2],
    class_names=iris.target_names,
    filled=True
)
with open("./iris.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
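Alternatively (a usage sketch; assumes the Graphviz binaries are on the PATH), the Source object can render straight to a file:
# write iris.pdf next to the notebook; format="png" also works
graphviz.Source(dot_graph).render("./iris", format="pdf")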
Plotting Decision Boundaries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.xlim(4, 8.5)
plt.ylim(1.5, 4.5)
markers = ['o', '+', '^']
for i in range(3):
    xs = X[:, 0][y == i]
    ys = X[:, 1][y == i]
    plt.scatter(xs, ys, marker=markers[i])
plt.legend(iris.target_names)
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")
# decision tree boundaries: solid line = root node split, dashed lines = child node splits
xx = np.linspace(5.45, 5.45, 3)
yy = np.linspace(1.5, 4.5, 3)
plt.plot(xx, yy, '-k')   # black solid line
xx = np.linspace(4, 5.45, 3)
yy = np.linspace(2.8, 2.8, 3)
plt.plot(xx, yy, '--b')  # blue dashed line
xx = np.linspace(6.15, 6.15, 3)
yy = np.linspace(1.5, 4.5, 3)
plt.plot(xx, yy, '--r')  # red dashed line
print(clf.predict([[5.5, 4]])) # prediction
[1]
print(clf.predict_proba([[5.5, 4]])) # prediction probability
[[0.11627907 0.65116279 0.23255814]]
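These probabilities are just the class proportions in the leaf the sample lands in. A sketch (using the fitted clf from above) that reproduces them from the underlying tree_ arrays:
leaf = clf.apply([[5.5, 4]])           # index of the leaf reached by the sample
counts = clf.tree_.value[leaf][0][0]   # per-class counts (or fractions, depending on sklearn version)
print(counts / counts.sum())           # matches predict_proba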
Tree Hyperparameters
Breast Cancer classification
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
cancer = load_breast_cancer()
# dir(cancer)
cancer.feature_names
array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension'], dtype='<U23')
np.random.seed(9)
# X_train, X_test, y_train, y_test = train_test_split(
# cancer.data, cancer.target, stratify=cancer.target)
# stratify: If not None, data is split in a stratified fashion, using this as the class labels.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
X_train.shape, y_train.shape
((426, 30), (426,))
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
0.9370629370629371
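To actually tune the hyperparameters this section's title refers to, a minimal sketch with GridSearchCV (the grid values below are just assumptions):
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [2, 3, 4, 5, None],
              'min_samples_leaf': [1, 5, 10]}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)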
- feature importance: The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance.
- The higher the value, the more important the feature.
- Indicates how much each feature contributed to building the decision tree.
list(zip(cancer.feature_names, clf.feature_importances_.round(4)))
[('mean radius', 0.0074),
('mean texture', 0.043),
('mean perimeter', 0.0),
('mean area', 0.0),
('mean smoothness', 0.0),
('mean compactness', 0.0),
('mean concavity', 0.0),
('mean concave points', 0.0),
('mean symmetry', 0.0),
('mean fractal dimension', 0.0),
('radius error', 0.0),
('texture error', 0.0),
('perimeter error', 0.0),
('area error', 0.0033),
('smoothness error', 0.0),
('compactness error', 0.0188),
('concavity error', 0.0),
('concave points error', 0.0),
('symmetry error', 0.0093),
('fractal dimension error', 0.0),
('worst radius', 0.7116),
('worst texture', 0.0591),
('worst perimeter', 0.0),
('worst area', 0.0),
('worst smoothness', 0.0),
('worst compactness', 0.0211),
('worst concavity', 0.0106),
('worst concave points', 0.1157),
('worst symmetry', 0.0),
('worst fractal dimension', 0.0)]
df = pd.DataFrame({'feature':cancer.feature_names,'importance':clf.feature_importances_ })
df.head()
|   | feature         | importance |
|---|-----------------|------------|
| 0 | mean radius     | 0.007389   |
| 1 | mean texture    | 0.043044   |
| 2 | mean perimeter  | 0.000000   |
| 3 | mean area       | 0.000000   |
| 4 | mean smoothness | 0.000000   |
df = df.sort_values(by='importance', ascending=False)
print(df.head(20))
feature importance
20 worst radius 0.711625
27 worst concave points 0.115708
21 worst texture 0.059071
1 mean texture 0.043044
25 worst compactness 0.021073
15 compactness error 0.018815
26 worst concavity 0.010635
18 symmetry error 0.009318
0 mean radius 0.007389
13 area error 0.003323
9 mean fractal dimension 0.000000
6 mean concavity 0.000000
28 worst symmetry 0.000000
2 mean perimeter 0.000000
3 mean area 0.000000
4 mean smoothness 0.000000
24 worst smoothness 0.000000
23 worst area 0.000000
22 worst perimeter 0.000000
5 mean compactness 0.000000
x = df.feature
y = df.importance
ypos = np.arange(len(x))
plt.figure(figsize=(10,7))
plt.barh(x, y)
plt.yticks(ypos, x)
plt.xlabel('Importance')
plt.ylabel('Variable')
plt.xlim(0, 1)
plt.ylim(-1, len(x))
plt.show()
Decision Tree Regressor
# exercise: DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
X = iris.data[:, :2]  # sepal length and sepal width as inputs
y = iris.data[:, 2]   # petal length as the regression target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
tr_reg1 = DecisionTreeRegressor(max_depth=2)
tr_reg2 = DecisionTreeRegressor(max_depth=5)
tr_reg1.fit(X_train,y_train)
tr_reg2.fit(X_train,y_train)
tr_reg1.score(X_test, y_test), tr_reg2.score(X_test, y_test)
(0.8273026760637123, 0.8239910886453354)
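A small sketch (not in the original) that sweeps max_depth to see where the test score peaks:
for depth in [1, 2, 3, 5, 8, None]:
    reg = DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    print(depth, round(reg.score(X_train, y_train), 3), round(reg.score(X_test, y_test), 3))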
tree.plot_tree(tr_reg1, filled=True)
# predicting petal width (y) from petal length (x)
X = iris.data[:,2]
y = iris.data[:,3]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
tr_reg3 = DecisionTreeRegressor(max_depth=2)
tr_reg3.fit(X_train.reshape(-1,1),y_train)
tr_reg3.score(X_test.reshape(-1,1), y_test)
from sklearn.metrics import mean_squared_error, r2_score
y_pred = tr_reg3.predict(X_train.reshape(-1,1))
plt.scatter(X_train, y_train, c='b', s = 5)
plt.scatter(X_train, y_pred, c ='r', s = 3)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
r2 = r2_score(y_train, y_pred)  # R^2, the same metric that score() returns
print('MSE: ', mse, 'R2 score: ', r2)
MSE: 0.042516993988801044 R2 score: 0.931000996993349
tree.plot_tree(tr_reg3, filled=True)