[Machine Learning] SVM
Linear Classification
Binary classification
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X2 = X[:, :2]
y2 = y.copy()       # make a copy of y
y2[(y2==2)] = 1     # relabel every 2 in y2 as 1 (setosa vs. non-setosa)
y2
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
from sklearn.model_selection import train_test_split
np.random.seed(13)
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.3)
X_train.shape, X_test.shape
((105, 2), (45, 2))
import matplotlib.pyplot as plt
markers = ['o', '+', '^']
for i in range(3):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, marker=markers[i])
binary_names = ['setosa', 'non-setosa']
plt.legend(binary_names)
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(max_iter=1000)
clf.fit(X_train, y_train)
SGDClassifier()
a = clf.coef_[0,0]
b = clf.coef_[0,1]
c = clf.intercept_
clf.score(X_train, y_train), clf.score(X_test, y_test)
(0.9809523809523809, 0.9777777777777777)
Multi-class: use all 3 classes
For each class, a separating line is learned between that class and all the others (one-vs-rest).
# use all classes
np.random.seed(17)
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
X_train.shape, y_train.shape
markers = ['o', '+', '^']
for i in range(3):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, marker=markers[i])
plt.legend(iris.target_names)
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")
clf = SGDClassifier(max_iter=1000)
clf.fit(X_train, y_train)
print("Coefficients: ", clf.coef_, clf.intercept_)
print("multi-class score: ", clf.score(X_test, y_test))
for i in range(3):
    a = clf.coef_[i, 0]
    b = clf.coef_[i, 1]
    c = clf.intercept_[i]
    xx = np.linspace(4, 9, 100)
    yy = -a/b * xx - c/b
    plt.plot(xx, yy, c='r')
Coefficients: [[-114.57251644 165.80131533]
[ 2.3557126 -69.49352179]
[ 66.21199204 -76.72634271]] [ 78.79909348 104.38733447 -178.46683519]
multi-class score: 0.5777777777777777
Let's draw the decision regions over the test set.
# contour
h = .02 # step size in the mesh
x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # np.c_ stacks the flattened grids as columns
print(Z.shape)
Z = Z.reshape(xx.shape)
print(Z.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
# plt.contourf(xx, yy, Z) # Z: height values over which the contour is drawn
# plt.axis('tight')
#-----------------------
markers = ['o', '+', '^']
colors = "rbg"
for i in range(3):
    xs = X_test[:, 0][y_test == i]
    ys = X_test[:, 1][y_test == i]
    plt.scatter(xs, ys, marker=markers[i], c=colors[i])
for i in range(3):
    a = clf.coef_[i, 0]
    b = clf.coef_[i, 1]
    c = clf.intercept_[i]
    xx = np.linspace(4, 8, 100)   # note: reuses the names xx/yy once the meshgrid is no longer needed
    yy = -a/b * xx - c/b
    plt.plot(xx, yy, c='k')
plt.plot()
(61600,)
(220, 280)
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred) # one vs. rest
array([[12, 0, 0],
[ 1, 0, 18],
[ 0, 0, 14]], dtype=int64)
- one-vs-all (one-vs-rest): one binary classifier is trained per class, and the predicted class is the one with the largest decision value, as the sketch below checks.
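As a quick check on the one-vs-rest scheme, the predicted class should be the one whose decision value is largest. A minimal, self-contained sketch (it uses its own split and seeds, so the exact numbers will differ from the run above):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
X2, y = iris.data[:, :2], iris.target
X_tr, X_te, y_tr, y_te = train_test_split(X2, y, test_size=0.3, random_state=17)

ovr = SGDClassifier(max_iter=1000, random_state=0).fit(X_tr, y_tr)
scores = ovr.decision_function(X_te)              # shape (n_samples, 3): one score per class
manual = ovr.classes_[scores.argmax(axis=1)]      # take the class with the largest score
print(np.array_equal(manual, ovr.predict(X_te)))  # expected: True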
Linear SVM Classifier
- As C increases, the decision boundary follows the training data more closely (the margin becomes harder).
- As gamma increases, more small "islands" appear in the decision regions (each point's influence reaches fewer neighbors).
SVM
SVM Optimization
Loss Function
SVM uses the hinge loss, max(0, 1 - y·f(x)): it is zero when a point is correctly classified with a functional margin of at least 1, and grows linearly with the violation otherwise.
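A minimal NumPy sketch of the hinge loss (the labels and decision values below are made up, just to show the computation):
import numpy as np

def hinge_loss(y, f):
    # hinge loss for labels y in {-1, +1} and raw decision values f
    return np.maximum(0, 1 - y * f)

y = np.array([1, -1, 1, 1])               # true labels (+1 / -1)
f = np.array([2.0, -0.5, 0.3, -1.2])      # decision function values
print(hinge_loss(y, f))                   # [0.  0.5 0.7 2.2]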
Iris data
X, y = iris.data, iris.target
X2 = X[:, :2]
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.3)
# X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
from sklearn.svm import SVC
lin_clf = SGDClassifier(max_iter=1000)
lin_clf.fit(X_train, y_train)
svm_clf = SVC(kernel="linear", C=10)
svm_clf.fit(X_train, y_train)
SVC(C=10, kernel='linear')
print(svm_clf.score(X_test, y_test), lin_clf.score(X_test, y_test))
y_pred = svm_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
y_pred = lin_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
0.9777777777777777 0.9777777777777777
[[11 1]
[ 0 33]]
[[11 1]
[ 0 33]]
How does the SVM differ from the plain linear classifier? Compare the learned coefficients and the boundaries they define.
svm_clf.coef_, svm_clf.intercept_, lin_clf.coef_, lin_clf.intercept_
(array([[ 3.33199106, -4.07243689]]),
array([-5.14593311]),
array([[ 38.06228374, -62.28373702]]),
array([-9.57793523]))
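The raw coefficients differ wildly in scale, but only the line they define matters (scaling w and the intercept together leaves the boundary unchanged). A quick sketch, reusing svm_clf and lin_clf from above, that converts each boundary to slope/intercept form so they can be compared directly:
# decision boundary: w0*x1 + w1*x2 + b = 0  ->  x2 = -(w0/w1)*x1 - b/w1
for name, w, b in [("SVC", svm_clf.coef_[0], svm_clf.intercept_[0]),
                   ("SGD", lin_clf.coef_[0], lin_clf.intercept_[0])]:
    print(name, "slope:", -w[0]/w[1], "intercept:", -b/w[1])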
plt.figure(figsize=(8,6))
plt.xlim(3.9,7.1)
plt.ylim(1.9,4.5)
w = svm_clf.coef_[0]
v = svm_clf.intercept_[0]
XX = np.linspace(4, 8, 30)
decision_boundary = -w[0]/w[1] * XX - v/w[1]
margin = 1/(np.sqrt(w[0]**2 + w[1]**2))
gutter_up = decision_boundary + margin
gutter_down = decision_boundary - margin
svs = svm_clf.support_vectors_
plt.scatter(svs[:, 0], svs[:, 1], s=180) # support vectors
# plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#AAFFAA')
# print(svs)
plt.plot(XX, decision_boundary, "k-")
plt.plot(XX, gutter_up, "k--")
plt.plot(XX, gutter_down, "k--")
markers = ['o', '+', '^']
for i in range(3):
    xs = X_train[:, 0][y_train == i]
    ys = X_train[:, 1][y_train == i]
    plt.scatter(xs, ys, marker=markers[i])
binary_names = ['setosa', 'non-setosa']
plt.legend(binary_names)
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")
# decision boundary of the linear (SGD) classifier
a = lin_clf.coef_[0,0]
b = lin_clf.coef_[0,1]
c = lin_clf.intercept_
plt.plot(XX, (-a/b * XX - c/b), "r-")
svs.shape # support vectors
(4, 2)
Non-linear SVM: Kernel Trick
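The kernel trick lets the SVM work with inner products in a high-dimensional feature space without computing the mapping explicitly. A minimal sketch of the RBF kernel, K(x, x') = exp(-gamma * ||x - x'||^2), checked against scikit-learn's implementation (the two points and the gamma value are arbitrary):
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x1 = np.array([[1.0, 2.0]])
x2 = np.array([[2.0, 0.5]])
gamma = 0.1

manual = np.exp(-gamma * np.sum((x1 - x2) ** 2))   # exp(-gamma * ||x1 - x2||^2)
library = rbf_kernel(x1, x2, gamma=gamma)[0, 0]    # same quantity via scikit-learn
print(manual, library)                             # the two values should match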
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X.shape, y.shape
((569, 30), (569,))
from sklearn.multiclass import OneVsRestClassifier
# For multi-class problems SVC itself uses one-vs-one; OneVsRestClassifier wraps it as one-vs-rest.
# probability=True enables probability estimates.
classifier = OneVsRestClassifier(SVC(kernel='rbf', C=1000, gamma=0.1, probability=True))
# Note: X_train / y_train here are still the two-feature iris split from above; the cancer data is loaded but not re-split.
classifier = classifier.fit(X_train, y_train)
classifier.score(X_train, y_train), classifier.score(X_test, y_test)
(1.0, 0.9777777777777777)
clf = SGDClassifier(max_iter=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.9777777777777777
svm_clf = SVC(kernel="linear")
svm_clf.fit(X_train, y_train)
print("SVM score:", svm_clf.score(X_test, y_test))
SVM score: 0.9777777777777777
Non-linearity via polynomial features
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
def plot_dataset(X, y, axes):
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "bs")
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "g^")
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge", random_state=42))
])
clf.fit(X, y)
Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
('scaler', StandardScaler()),
('svm_clf', LinearSVC(C=10, loss='hinge', random_state=42))])
clf.steps
[('poly_features', PolynomialFeatures(degree=3)),
('scaler', StandardScaler()),
('svm_clf', LinearSVC(C=10, loss='hinge', random_state=42))]
- Pipeline of transforms with a final estimator.
- Sequentially apply a list of transforms and a final estimator.
- Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods.
- The final estimator only needs to implement fit.
- The transformers in the pipeline can be cached using memory argument.
- The Pipeline is built using a list of (key, value) pairs, where the key is a string containing the name you want to give this step and value is an estimator object:
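For reference, a small sketch of how to reach back into the fitted pipeline by step name (it reuses clf, X, and y from the cells above; the step names are the ones given when the pipeline was built):
poly = clf.named_steps["poly_features"]   # the fitted PolynomialFeatures transformer
svm = clf.named_steps["svm_clf"]          # the final LinearSVC estimator
print(poly.n_output_features_)            # number of polynomial features generated from x1, x2
print(svm.coef_.shape)                    # one weight per polynomial feature
print(clf.score(X, y))                    # accuracy of the whole pipeline on the moons data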
def plot_predictions(clf, axes):
    x0s = np.linspace(axes[0], axes[1], 100)
    x1s = np.linspace(axes[2], axes[3], 100)
    x0, x1 = np.meshgrid(x0s, x1s)
    X = np.c_[x0.ravel(), x1.ravel()]
    y_pred = clf.predict(X).reshape(x0.shape)
    y_decision = clf.decision_function(X).reshape(x0.shape)
    plt.contourf(x0, x1, y_pred, cmap=plt.cm.brg, alpha=0.2)
    plt.contourf(x0, x1, y_decision, cmap=plt.cm.brg, alpha=0.1)
plot_predictions(clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()
SVM Classifier example
Let's see how the model's score on the train and test sets changes as the gamma and C values are adjusted.
Ex 1. Iris dataset
# for train and test data
iris = load_iris()
X = iris.data[:, [0, 1]]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
For the training set
# Training classifiers
clf1 = SVC(gamma=.1, C=1, kernel='rbf', probability=True)
clf2 = SVC(gamma=.1, C=100, kernel='rbf', probability=True)
clf3 = SVC(gamma=100, C=1, kernel='rbf', probability=True)
clf4 = SVC(gamma=100, C=100, kernel='rbf', probability=True)
clf5 = SVC(gamma=1000, C=1000, kernel='rbf', probability=True)
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)
clf4.fit(X_train, y_train)
clf5.fit(X_train, y_train)
# Plotting decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
np.arange(y_min, y_max, 0.1))
from itertools import product   # needed for the (row, col) subplot indices below

f, axarr = plt.subplots(3, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1, 2], [0, 1]),
                        [clf1, clf2, clf3, clf4, clf5],
                        ['gamma=0.1, C=1', 'gamma=0.1, C=100',
                         'gamma=100, C=1', 'gamma=100, C=100',
                         'gamma=1000, C=1000']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
# check the training-set scores
print(clf1.score(X_train, y_train),
      clf2.score(X_train, y_train),
      clf3.score(X_train, y_train),
      clf4.score(X_train, y_train),
      clf5.score(X_train, y_train))
0.7666666666666667 0.8 0.925 0.9333333333333333 0.9333333333333333
For the test set
# for test data
# Plotting decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(3, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1, 2], [0, 1]),
                        [clf1, clf2, clf3, clf4, clf5],
                        ['gamma=0.1, C=1', 'gamma=0.1, C=100',
                         'gamma=100, C=1', 'gamma=100, C=100',
                         'gamma=1000, C=1000']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
# check the test-set scores
print(clf1.score(X_test, y_test),
      clf2.score(X_test, y_test),
      clf3.score(X_test, y_test),
      clf4.score(X_test, y_test),
      clf5.score(X_test, y_test))
0.8333333333333334 0.8666666666666667 0.7 0.7333333333333333 0.4
As the results above show, making gamma and C too large causes overfitting: the training score keeps improving while the test score drops.
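Rather than guessing, C and gamma are usually tuned with cross-validated grid search. A minimal sketch reusing the iris train/test split from above (the parameter grid and cv value are illustrative, not tuned):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1, 10]}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
search.fit(X_train, y_train)

print(search.best_params_)           # best (C, gamma) found by 5-fold cross-validation
print(search.best_score_)            # mean CV accuracy of the best parameter setting
print(search.score(X_test, y_test))  # test accuracy of the refit best model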
Ex 2. XOR problem
- binary classification
- the target to predict is an XOR of the inputs
- illustrates the decision function learned by the SVC
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import svm
xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
np.linspace(-3, 3, 500))
np.random.seed(0)
X = np.random.randn(300, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
X.shape, Y.shape
((300, 2), (300,))
plt.scatter(X[:,0], X[:,1], c=Y)
# fit the model
clf = svm.SVC(gamma='auto') # gamma='auto' uses 1/n_features (newer scikit-learn versions default to gamma='scale')
clf.fit(X, Y)
# plot the decision function for each datapoint on the grid
# ravel(): Return a contiguous flattened array.
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) # evaluate the decision function
Z.shape, xx.shape
((250000,), (500, 500))
Z = Z.reshape(xx.shape) # 500 x 500
np.c_[xx.ravel(), yy.ravel()].shape
(250000, 2)
plt.imshow(Z, interpolation='nearest',
extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
linestyles='dashed')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-3, 3, -3, 3])
plt.show()
SVM Regression
Increasing epsilon widens the "street" around the fitted function, so the model becomes more tolerant of errors.
SVM Regression
Loss function
The SVM regressor uses the epsilon-insensitive loss, max(0, |y - f(x)| - epsilon): errors inside the epsilon tube are ignored, and larger errors are penalized linearly.
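A minimal NumPy sketch of the epsilon-insensitive loss (the residuals and the epsilon value below are made up for illustration):
import numpy as np

def epsilon_insensitive_loss(y_true, y_pred, epsilon=0.1):
    # zero inside the epsilon tube, linear outside it
    return np.maximum(0, np.abs(y_true - y_pred) - epsilon)

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.05, 2.5, 2.0, 4.0])
print(epsilon_insensitive_loss(y_true, y_pred, epsilon=0.1))  # [0.  0.4 0.9 0. ]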
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
np.random.seed(21)
N = 1000
def makeData(x):
    r = [a/10 for a in x]
    y = np.sin(x) + np.random.normal(0, 0.2, len(x))
    return np.array(y + r)
x = [i/100 for i in range(N)]
y = makeData(x)
x = np.array(x).reshape(-1,1)
plt.scatter(x, y, s=5, color="blue")
plt.show()
Change kernel and epsilon
Because the RBF kernel implicitly maps the data into a higher-dimensional space, it can fit this non-linear dataset.
svr1 = SVR(kernel='linear', epsilon=0.1).fit(x, y)
svr2 = SVR(epsilon=0.01).fit(x, y)
svr3 = SVR(epsilon=1.).fit(x, y)
f, axarr = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(14, 6))
for idx, svr_n, tt in zip(range(3),
                          [svr1, svr2, svr3],
                          ['linear, epsilon=0.1', 'rbf, epsilon=0.01', 'rbf, epsilon=1.0']):
    axarr[idx].scatter(x, y, s=5, color="blue", label="original")
    yfit = svr_n.predict(x)
    axarr[idx].plot(x, yfit, lw=2, color="red")
    axarr[idx].set_title(tt)
plt.show()
print (svr1.score(x, y), svr2.score(x, y), svr3.score(x, y))
0.08016199956060532 0.9272981404051117 0.5802043013218763
Comparison of many classifiers
- Decision Tree
- kNN
- SVC
- VotingClassifier (soft voting): The idea behind the VotingClassifier is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. Such a classifier can be useful for a set of equally well-performing models in order to balance out their individual weaknesses. A small sketch of what soft voting computes follows this list.
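A minimal, self-contained sketch of what soft voting computes: a (weighted) average of the classifiers' predicted probabilities, followed by an argmax. The classifiers and weights mirror the example below but are otherwise arbitrary; SVC needs probability=True for predict_proba:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

iris = load_iris()
X, y = iris.data[:, [0, 1]], iris.target
models = [DecisionTreeClassifier(max_depth=6).fit(X, y),
          KNeighborsClassifier(n_neighbors=7).fit(X, y),
          SVC(gamma=100, C=100, probability=True).fit(X, y)]
weights = [2, 1, 2]

probas = np.array([m.predict_proba(X) for m in models])  # shape (3 models, n_samples, 3 classes)
avg = np.average(probas, axis=0, weights=weights)        # weighted mean over the models
soft_vote_pred = avg.argmax(axis=1)                      # predicted class labels
print(soft_vote_pred[:10])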
xx, yy = np.meshgrid(np.linspace(0,2,3), np.linspace(0,2,3))
print(xx, '\n', yy)
xx.shape
[[0. 1. 2.]
[0. 1. 2.]
[0. 1. 2.]]
[[0. 0. 0.]
[1. 1. 1.]
[2. 2. 2.]]
(3, 3)
xx.ravel()
array([0., 1., 2., 0., 1., 2., 0., 1., 2.])
np.c_[xx.ravel(), yy.ravel()]
array([[0., 0.],
[1., 0.],
[2., 0.],
[0., 1.],
[1., 1.],
[2., 1.],
[0., 2.],
[1., 2.],
[2., 2.]])
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# Loading some example data
iris = datasets.load_iris()
X = iris.data[:, [0, 1]]
y = iris.target
# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=6)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(gamma=1e2, C=100, kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2])
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)
# Plotting decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [clf1, clf2, clf3, eclf],
                        ['Decision Tree (depth=6)', 'KNN (k=7)',
                         'Kernel SVM', 'Soft Voting']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
print(clf1.score(X, y),clf2.score(X, y),clf3.score(X, y),eclf.score(X, y))
0.8533333333333334 0.8266666666666667 0.9266666666666666 0.86