[Machine Learning] Midterm Summary 1


Week 2

Lab 1: End to End Machine Learning Process

Dataset analysis

import pandas as pd
import numpy as np

housing = pd.read_csv(csv_path) # read the CSV file into a DataFrame

housing.columns # list of the DataFrame's features (columns)
housing.shape # shape of the DataFrame
housing.info() # non-null count and dtype of each feature
housing["column_name"].value_counts() # how many times each value appears in that column
housing.describe() # numerical summary of each feature (count, mean, min, etc.)

housing["median_income"].hist() # feature의 value_counts()를 히스토그램으로 plot
housing["income_cat"] = pd.cut(housing["median_income"], # feature의 값을 특정 구간으로 매핑
                              bins = [0., 1.5, 3.0, 4.5, 6., np.inf], 
                              labels = [1,2,3,4,5])

housing.corr() # correlation matrix
housing.sort_values(by="column_name", ascending=False) # sort the DataFrame by the values of a column

# additional
data['diagnosis'] = data['diagnosis'].map({'M':1, 'B':0}) # map values
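
As a quick sketch combining corr() and sort_values(), the correlation of every numerical feature with a chosen target column (here median_house_value) can be ranked:

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False) # correlations with the target, strongest first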


Train-test split

  • train_test_split: dataset, test_size, shuffle, random_state
  • StratifiedShuffleSplit: n_splits, test_size, random_state
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) # split into train and test sets

split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42) # stratified split: preserves class proportions
for train_index, test_index in split.split(housing, housing["income_cat"]): # X, y
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
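
To check that the stratified split worked, the income_cat proportions of the full dataset and of the stratified test set can be compared (a quick sketch, not part of the lab code above):

housing["income_cat"].value_counts() / len(housing)                 # category proportions in the full dataset
strat_test_set["income_cat"].value_counts() / len(strat_test_set)   # proportions in the stratified test set (nearly identical)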


Data visualization

  • DataFrame.plot: kind, x, y, alpha, s, label, figsize, c, cmap, colorbar, sharex, sharey
import matplotlib.pyplot as plt

# dataframe.plot
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population", figsize=(10,7),
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
plt.show()

[Figure: scatter plot of longitude vs latitude; point size = population, color = median_house_value]

# seaborn
import seaborn as sns

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
sns.pairplot(housing[attributes])

[Figure: seaborn pairplot of the selected attributes]

# pandas.plotting
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

[Figure: pandas scatter_matrix of the selected attributes]


Data preprocessing

  • SimpleImputer: strategy
  • OrdinalEncoder/LabelEncoder/OneHotEncoder
    • categories_, toarray() (for OneHotEncoder)
  • StandardScaler/MinMaxScaler
    • mean_, scale_, inverse_transform()
  • Pipeline: steps(list of tuples(name, transform))
  • ColumnTransformer: transformers(list of tuples(name, transformer, columns))
housing.drop("열 이름", axis) # 0: 행 방향, 1: 열 방향 (default: 0)
housing.isnull().any(0) # 각 feature 별로 null이 있는지 검사
housing.isnull().any(1) # 각 sample 별로 null이 있는지 검사
housing.isnull().sum() # 각 feature 별로 null이 몇 개인지 검사
housing.isnull().sum().sum() # 전체 데이터프레임에 null이 몇 개인지 검사

housing_num = housing.drop("ocean_proximity", axis=1)

# handling missing values
housing.dropna(subset=["column_name"]) # drop samples that contain nulls; subset specifies which columns to check
housing.drop("column_name", axis=1) # drop a feature that contains nulls
median = housing["column_name"].median()
housing["column_name"].fillna(median) # replace nulls with another value (here, the median)


# Imputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
imputer.fit(numerical_dataframe)
numerical_dataframe_without_nan = imputer.transform(numerical_dataframe_with_nan)


# Encoding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

housing_cat = housing[["ocean_proximity"]] # the categorical feature

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
'''
array([[3.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.]])
'''
ordinal_encoder.categories_
'''
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]
'''


cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
'''
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>
'''
housing_cat_1hot.toarray()
'''
array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]])
'''
cat_encoder.categories_
'''
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]
'''
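
# LabelEncoder (listed above) works like OrdinalEncoder but is meant for a 1-D label array.
# A minimal sketch, assuming housing_cat is the single-column DataFrame defined above:
label_encoder = LabelEncoder()
housing_cat_label = label_encoder.fit_transform(housing_cat.values.ravel()) # 1-D array of integer codes
label_encoder.classes_ # the original category names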


# Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaled_num_attribs = scaler.fit_transform(num_attribs)
scaler.mean_, scaler.scale_
num_attribs = scaler.inverse_transform(scaled_num_attribs)
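
# MinMaxScaler (listed above) rescales each feature to a fixed range, [0, 1] by default.
# A minimal sketch on the same numerical data:
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(num_attribs)
minmax_scaler.data_min_, minmax_scaler.data_max_ # per-feature minimum / maximum learned from the data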


# Pipeline
col_names = ["total_rooms", "total_bedrooms", "population", "households"] # columns used to derive new features
rooms_ix, bedrooms_ix, population_ix, households_ix = [ # 3, 4, 5, 6
    housing.columns.get_loc(c) for c in col_names] 
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin): # custom transformer that adds derived features (needs fit and transform methods)
    def fit(self, X, y=None):
        return self  
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]

        X = np.delete(X, [households_ix, rooms_ix, population_ix, bedrooms_ix], 1)

        return np.c_[X, rooms_per_household, population_per_household,
                    bedrooms_per_room]
    
from sklearn.pipeline import Pipeline
# pipeline for the numerical features
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

from sklearn.compose import ColumnTransformer # apply a different transformation to each group of columns

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
# full pipeline
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs), # 수치 데이터는 수치 데이터에 대한 파이프라인 적용
        ("cat", OneHotEncoder(), cat_attribs), # 카테고리 데이터는 원-핫 인코딩 적용
    ])

housing_prepared = full_pipeline.fit_transform(housing)


Model selection and training

# linear regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# performance evaluation
from sklearn.metrics import mean_squared_error, r2_score

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_r2 = r2_score(housing_labels, housing_predictions)
lin_mse, np.sqrt(lin_mse), lin_r2, lin_reg.score(housing_prepared, housing_labels)

# decision tree model
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42, max_depth=20)
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_r2 = r2_score(housing_labels, housing_predictions)
tree_mse, np.sqrt(tree_mse), tree_r2, tree_reg.score(housing_prepared, housing_labels)



Week 3

Linear models, SVMs, and neural networks require feature scaling.
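
As a minimal sketch of this rule (my own illustration, not lab code), a scaler can be chained in front of an SVM with a Pipeline so that scaling is always applied before training and prediction:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

svm_clf = Pipeline([
    ('scaler', StandardScaler()), # scale features to zero mean / unit variance
    ('svc', SVC()),               # SVMs are sensitive to feature scales
])
svm_clf.fit(X_train, y_train)
print(svm_clf.score(X_test, y_test)) # accuracy on the (scaled) test set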

Gradient Descent Regression

Generating synthetic data

n = 100
x = np.random.randn(n)                # n samples from a standard normal distribution
y = x*20 + 10                         # w=20, b=10
y = y + np.random.randn(n) * 10       # add noise

plt.scatter(x,y)

[Figure: scatter plot of the generated noisy linear data]

Gradient descent algorithm

This could come up as a hand-coding question!!!

w=np.random.randn()   
b=np.random.randn()

lr = 0.1          # learning rate
n_epoch = 200     # number of epoch
lossHistory = []  

# 1 feature
for epoch in range(n_epoch):
    y_pred = w*x + b
    loss = ((y_pred - y)**2).mean()     # mean square error
    lossHistory.append(loss)
    
    # update parameters with the gradient of the MSE (the constant factor 2 is folded into the learning rate)
    w = w - lr* ((y_pred - y)*x).mean()
    b = b - lr* (y_pred - y).mean()
    if epoch %10 == 0:
        print('epoch=', epoch, 'loss=', loss, 'w=', w, 'b=', b)
        
print('---------------------------')
print('epoch=', epoch, 'loss=', loss, 'w=', w, 'b=', b)

'''
epoch= 0 loss= 608.135326701648 w= 0.689784764278889 b= 0.9972609240805469
epoch= 10 loss= 207.53080423004252 w= 11.670716818468396 b= 7.227075715189502
epoch= 20 loss= 152.28938253673073 w= 15.930927964152566 b= 9.20608673708662
epoch= 30 loss= 144.4693105859433 w= 17.59153654618134 b= 9.821009291157926
epoch= 40 loss= 143.33405880761802 w= 18.24161978885116 b= 10.006080890111248
epoch= 50 loss= 143.16542027501967 w= 18.497100763621827 b= 10.059105160686649
epoch= 60 loss= 143.1398621531657 w= 18.597855042631664 b= 10.073064262692903
epoch= 70 loss= 143.13592303973158 w= 18.63771339903109 b= 10.076142655288594
epoch= 80 loss= 143.13530760578266 w= 18.653524866671727 b= 10.076507439041466
epoch= 90 loss= 143.13521041460535 w= 18.659812389462125 b= 10.076353219559355
epoch= 100 loss= 143.13519493822233 w= 18.66231798888442 b= 10.076187429013352
epoch= 110 loss= 143.1351924582818 w= 18.66331833668638 b= 10.076084942824329
epoch= 120 loss= 143.1351920590203 w= 18.663718366743243 b= 10.076031356750361
epoch= 130 loss= 143.13519199451582 w= 18.66387855956513 b= 10.076005529109846
epoch= 140 loss= 143.13519198406763 w= 18.663942786892253 b= 10.075993661364555
epoch= 150 loss= 143.1351919823721 w= 18.66396856497532 b= 10.0759883752038
epoch= 160 loss= 143.13519198209656 w= 18.663978920499822 b= 10.075986071025419
epoch= 170 loss= 143.13519198205177 w= 18.66398308371867 b= 10.075985082344856
epoch= 180 loss= 143.13519198204446 w= 18.663984758562417 b= 10.075984663108663
epoch= 190 loss= 143.1351919820433 w= 18.66398543272708 b= 10.075984486949109
---------------------------
epoch= 199 loss= 143.13519198204307 w= 18.663985686793474 b= 10.075984418251208
'''

n=100
x1 = np.random.randn(n)             # randn = standard normal distribution, rand = uniform over [0, 1)
x2 = np.random.randn(n)

y = x1*30 + x2*40 + 50
y = y + np.random.randn(n)*20      # add noise

w1 = np.random.rand()               # initial guess
w2 = np.random.rand()
b = np.random.rand()

lr = 0.1                            # learning rate
n_epoch = 200                      # no of epoch
lossHistory = []

# 2 features
for epoch in range(n_epoch):
    y_pred = w1*x1 + w2*x2 + b
    error = ((y_pred - y)**2).mean()
    lossHistory.append(error)

    w1 = w1 - lr* ((y_pred - y)*x1).mean()
    w2 = w2 - lr* ((y_pred - y)*x2).mean()
    b = b - lr* (y_pred - y).mean()
    if epoch %10 == 0:
        print('epoch=', epoch, 'loss=', error, 'w1=', w1, 'w2=', w2, 'b=', b)
        
print('---------------------------')
print('epoch=', epoch, 'error=', error, 'w1=', w1.round(2), 'w2=', w2.round(2), 'b=', b.round(2))

'''
epoch= 0 loss= 98.49198209239596 w1= 3.0622983189562536 w2= 4.764356865654392 b= 5.897084897272597
epoch= 10 loss= 98.49198209239596 w1= 18.010371367108256 w2= 28.11790158719528 b= 34.84015273300073
epoch= 20 loss= 98.49198209239596 w1= 23.92513539978548 w2= 35.47955293266165 b= 44.56444221585169
epoch= 30 loss= 98.49198209239596 w1= 26.225643561017915 w2= 37.8418415003053 b= 47.81856077797792
epoch= 40 loss= 98.49198209239596 w1= 27.11094695935368 w2= 38.61417874461482 b= 48.902444027182995
epoch= 50 loss= 98.49198209239596 w1= 27.44947647216773 w2= 38.87154475680563 b= 49.26146817037565
epoch= 60 loss= 98.49198209239596 w1= 27.578477035364905 w2= 38.95893768889174 b= 49.37959563238877
epoch= 70 loss= 98.49198209239596 w1= 27.627561562799617 w2= 38.98915556957853 b= 49.418142205981276
epoch= 80 loss= 98.49198209239596 w1= 27.646236490880202 w2= 38.99978248478305 b= 49.43059025098738
epoch= 90 loss= 98.49198209239596 w1= 27.653347822725966 w2= 39.00357796550781 b= 49.43455663059089
epoch= 100 loss= 98.49198209239596 w1= 27.656059892304143 w2= 39.00495242597293 b= 49.435798183185476
epoch= 110 loss= 98.49198209239596 w1= 27.657096198309056 w2= 39.00545624717423 b= 49.43617740968662
epoch= 120 loss= 98.49198209239596 w1= 27.65749303968171 w2= 39.005642884171394 b= 49.436289198820965
epoch= 130 loss= 98.49198209239596 w1= 27.65764535248929 w2= 39.00571265085133 b= 49.4363203692288
epoch= 140 loss= 98.49198209239596 w1= 27.65770394667565 w2= 39.00573893220989 b= 49.43632824570201
epoch= 150 loss= 98.49198209239596 w1= 27.657726538440585 w2= 39.005748897521244 b= 49.43632984226368
epoch= 160 loss= 98.49198209239596 w1= 27.65773526779394 w2= 39.0057526971633 b= 49.4363299559148
epoch= 170 loss= 98.49198209239596 w1= 27.65773864763172 w2= 39.005754152737794 b= 49.43632982441512
epoch= 180 loss= 98.49198209239596 w1= 27.657739958712025 w2= 39.005754712566734 b= 49.436329712316414
epoch= 190 loss= 98.49198209239596 w1= 27.657740468179426 w2= 39.00575492861191 b= 49.43632964753197
---------------------------
epoch= 199 error= 437.5417546691009 w1= 27.66 w2= 39.01 b= 49.44
'''


Using the LinearRegression model

from sklearn.linear_model import LinearRegression

# Make it to matrix(two features)
X = np.concatenate([x1.reshape(n,1), x2.reshape(n,1)], axis=1)

model = LinearRegression()        # create model
model.fit(X,y)                    # train model
print("score: ",model.score(X,y))
print('w1=', model.coef_[0], 'w2=', model.coef_[1], 'b=', model.intercept_)

# prediction
new_X=[1,3] # x1, x2
print('Real Value: ', 1*30 + 3*40 + 50)        # y 
print('Predicted Value', *model.predict([new_X]))  # model predict(inference)

'''
score:  0.8465475643687691
w1= 27.93626338823067 w2= 40.08110377245416 b= 46.27791539580053
Real Value:  200
Predicted Value 194.4574901013937
'''


Creating linear regression data

  • make_regression: n_samples, n_features, noise, random_state
from sklearn.datasets import make_regression 

X, y = make_regression(n_samples=2000, n_features=2, noise=1.5, random_state=1)

[Figure: scatter plot of the make_regression dataset]


Gradient Descent Classification

Creating a linear classification dataset

  • make_blobs: n_samples, n_features, centers, cluster_std, random_state
from sklearn.datasets import make_blobs

N = 500
(X, y) = make_blobs(n_samples=N, n_features=2, centers=2, cluster_std=2.0, random_state=17)

[Figure: scatter plot of the two make_blobs clusters]

Gradient descent algorithm

Binary classification uses gradient descent together with the sigmoid function.

Cross entropy is used as the loss function.

[Figure: cross-entropy loss formula]

x1, x2 = X[:, 0], X[:, 1]           # split the two features of the make_blobs data

w1 = np.random.randn()
w2 = np.random.randn()
b  = np.random.randn() 

def sigmoid_activation(z):
    return 1.0 / (1 + np.exp(-z))

lossHistory = []
epochs = 500
alpha = 0.03

for epoch in np.arange(epochs):
    # prediction (linear score)
    z = w1*x1 + w2*x2 + b
    # activation function
    y_hat = sigmoid_activation(z)       # prediction
    # loss function
    loss = -((y*np.log(y_hat) + (1-y)*np.log(1-y_hat))).mean()  # loss = cross entropy
    lossHistory.append(loss)
    # compute the gradients
    dloss_dz = y_hat - y
    w1_deriv = dloss_dz * x1        # d(loss)/dw1 = d(loss)/dz * dz/dw1
    w2_deriv = dloss_dz * x2
    b_deriv = dloss_dz * 1
    # update the weights
    w1 = w1 - (alpha * w1_deriv).mean()
    w2 = w2 - (alpha * w2_deriv).mean()
    b  = b  - (alpha * b_deriv).mean()


Hinge Loss

In classification, instead of updating the weights from the output of the activation function, they can be updated directly from the raw score f(x) computed before the activation.

The loss function used in this case is called the hinge loss.

  • Hinge loss for input-output pair (x,y) is given as:
    • L = max(0, 1 - yf(x))
    • L = 0 (if yf(x) >= 1), 1-yf(x) (otherwise)
    • dL/dw1 = 0 (if yf(x) >= 1), -yx1 (otherwise)

[Figure: hinge loss as a function of y·f(x)]

With the hinge loss, the class labels are encoded as +1 and -1 instead of 1 and 0.

# convert the dataset (relabel for hinge loss)
N = 500
(X, y_org) = make_blobs(n_samples=N, n_features=2, centers=2, cluster_std=2.0, random_state=17)
x1, x2 = X[:,0], X[:,1]
y = y_org.copy()
y[y==0] = -1 # relabel class 0 as -1 (hinge loss expects labels in {-1, +1})

# initialize weights and hyperparameters
w1, w2, b = np.random.randn(), np.random.randn(), np.random.randn()
lossHistory = []
epochs = 500
alpha = 0.03
N = len(x1)

# gradient descent
for epoch in np.arange(epochs):

    w1_deriv, w2_deriv, b_deriv, loss = 0., 0., 0., 0.
    for i in range(N):
        score = y[i]*(w1*x1[i] + w2*x2[i] + b) # margin: y * f(x)
        if score <= 1: # loss is incurred (y*f(x) <= 1, i.e. 1 - y*f(x) >= 0)
            w1_deriv = w1_deriv - x1[i]*y[i]
            w2_deriv = w2_deriv - x2[i]*y[i]
            b_deriv = b_deriv - y[i]
            loss = loss + (1 - score)
        # else : derivatives are zero. loss is 0
    
    # mean
    w1_deriv /= float(N)
    w2_deriv /= float(N)
    b_deriv  /= float(N)
    loss /= float(N)
    # update parameters
    w1 = w1 - alpha * w1_deriv
    w2 = w2 - alpha * w2_deriv
    b  =  b - alpha *  b_deriv


Using the SGDClassifier/LogisticRegression models

  • SGDClassifier uses the hinge loss as its default loss function
  • LogisticRegression uses cross entropy (log loss) as its default loss function
from sklearn.linear_model import SGDClassifier, LogisticRegression

N = 500
(X, y) = make_blobs(n_samples=N, n_features=2, centers=2, cluster_std=2.0, random_state=17)

clf = SGDClassifier()      
clf.fit(X[:,:2], y)    
print("SGDClassifier: ", clf.score(X[:, :2],y))
print(clf.coef_, clf.intercept_)

log = LogisticRegression()      
log.fit(X[:,:2], y)    
print("Logistic Regression: ", log.score(X[:, :2],y))
print(log.coef_, log.intercept_)

'''
SGDClassifier:  0.984
[[ -8.56625092 -34.52822982]] [-211.3932073]
Logistic Regression:  0.994
[[-0.46621709 -1.9549905 ]] [-10.32380395]
'''



Week 4

KFold Validation

  • KFold/StratifiedKFold: n_splits, shuffle, random_state
  • cross_val_score: model, X, y, cv object
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

X = X_all[:,0]
y = X_all[:,2]

cv = KFold(n_splits=5,shuffle=True) # 5-fold cross-validation splitter; the data are shuffled before splitting
# cv = StratifiedKFold(n_splits=5, shuffle=True)
score = cross_val_score(LinearRegression(), X.reshape(-1,1), y, cv=cv)

print(score.round(2))
print(score.mean().round(2))

'''
[0.83 0.71 0.8  0.72 0.62]
0.74
'''

What is cv?

for train_index, test_index in cv.split(X):
    print("TRAIN:\n", train_index,'\n', "TEST:\n", test_index)
    # X_train, X_test = X[train_index], X[test_index]
    # y_train, y_test = y[train_index], y[test_index]
    
''' 5 splits in total
TRAIN:
 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  17  18  19
  21  22  23  24  27  28  30  31  32  33  34  35  36  37  38  39  40  43
  44  45  46  47  48  50  51  58  59  60  61  62  63  64  65  66  67  68
  70  71  74  75  76  78  79  80  83  84  85  88  89  90  91  92  93  94
  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 111 112 113
 114 115 116 117 118 119 120 121 122 123 124 125 126 128 129 130 131 132
 133 134 135 136 137 138 140 141 142 143 145 146] 
 TEST:
 [ 15  16  20  25  26  29  41  42  49  52  53  54  55  56  57  69  72  73
  77  81  82  86  87 110 127 139 144 147 148 149]
...
TRAIN:
 [  0   2   3   4   6   7   8   9  10  11  13  14  15  16  18  19  20  21
  22  23  24  25  26  27  28  29  31  32  33  34  35  36  37  38  41  42
  43  44  45  46  47  49  50  51  52  53  54  55  56  57  58  59  60  61
  63  65  67  69  70  71  72  73  76  77  78  79  80  81  82  83  84  85
  86  87  88  89  91  92  95  96  97  98  99 100 101 102 103 105 106 107
 108 109 110 111 112 116 118 120 121 122 124 127 128 129 130 132 133 134
 135 136 137 138 139 142 144 145 146 147 148 149] 
 TEST:
 [  1   5  12  17  30  39  40  48  62  64  66  68  74  75  90  93  94 104
 113 114 115 117 119 123 125 126 131 140 141 143]
'''


Regularization

Both use the alpha hyperparameter; the larger its value, the stronger the regularization.

  • Ridge: alpha

    • L2 regularization: it penalizes the squared weights, so large weights are shrunk the most.
    • Helps mitigate overfitting (improves generalization).

    [Figures: effect of Ridge (L2) regularization for different alpha values]

  • Lasso: alpha

    • L1 regularization: it shrinks the weights uniformly, so small weights are driven to zero first while large weights survive.
    • Has a feature-selection effect (see the Lasso sketch after the Ridge code below).
    • For the same alpha, its regularization effect is generally stronger than Ridge's.

    [Figures: effect of Lasso (L1) regularization for different alpha values]

# Ridge(L2)
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1000)
ridge.fit(X, y_out)
y_ridge_pred = ridge.predict(X)
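
For comparison, a minimal Lasso (L1) sketch, assuming the same X and y_out arrays used for Ridge above:

# Lasso(L1)
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=1000) # larger alpha = stronger regularization
lasso.fit(X, y_out)
y_lasso_pred = lasso.predict(X)
lasso.coef_ # uninformative coefficients are driven exactly to zero (feature selection)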


Multi-class Classification

  • LogisticRegression: multi_class, C, random_state
    • multi_class="multinomial" uses the softmax function (with cross-entropy loss) for multi-class classification.
    • multi_class="ovr" uses the One-vs-Rest strategy for multi-class classification.
    • C is the inverse of the regularization strength; smaller values mean stronger regularization (a simpler model).
softmax_reg = LogisticRegression(multi_class="multinomial", C=10, random_state=42)
ovr_clf = LogisticRegression(multi_class="ovr", C=10, random_state=42)

# making predictions (the model was trained on two features)
softmax_reg.predict([[5, 2]]) # array([2])
softmax_reg.predict_proba([[5, 2]]) # array([[6.38014896e-07, 5.74929995e-02, 9.42506362e-01]])
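
Both models must be fitted before these predictions can be made. A minimal sketch, assuming the two features are the iris petal length/width (the usual setup for this example):

from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data[:, 2:] # petal length, petal width (2 features, 3 classes)
y = iris.target

softmax_reg.fit(X, y)
ovr_clf.fit(X, y)
print(softmax_reg.score(X, y), ovr_clf.score(X, y)) # training accuracy of each strategy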



Week 5

Classification Performance

Static performance and Confusion matrix

Confusion matrix - accuracy(model.score()), precision, recall(sensitivity), f1-score

[Figures: confusion matrix layout and the accuracy / precision / recall / F1 formulas]

  • confusion_matrix: y_test, y_pred
  • classification_report: y_test, y_pred
y_pred=[1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0]
y_test=[1,1,0,1,0,1,1,1,0,0,1,0,1,1,0,1,0,0,0,0]

from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, y_pred)
'''
# columns (predicted label): 0, 1
# rows (actual label): 0, 1
array([[5, 5],
       [1, 9]], dtype=int64)
'''
classification_report(y_test, y_pred)
'''
              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.64      0.90      0.75        10

    accuracy                           0.70        20
   macro avg       0.74      0.70      0.69        20
weighted avg       0.74      0.70      0.69        20
'''
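
The values in the report can be reproduced directly from the confusion matrix; a small sketch for the positive class (label 1), using precision = TP/(TP+FP), recall = TP/(TP+FN), and F1 = 2*precision*recall/(precision+recall):

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() # [[TN, FP], [FN, TP]]
precision = tp / (tp + fp)                                # 9 / 14 ≈ 0.64
recall    = tp / (tp + fn)                                # 9 / 10 = 0.90
f1 = 2 * precision * recall / (precision + recall)        # ≈ 0.75
accuracy = (tp + tn) / (tp + tn + fp + fn)                # 14 / 20 = 0.70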


Dynamic Performance and ROC, AUC

Ranking-based or Score-based

[Figure: ranking/score-based evaluation illustration]

  • fpr, tpr, threshold = roc_curve(y_test, y_score)
  • auc(fpr, tpr)
y_score = np.linspace(99, 60, 20).round(1)
result = pd.DataFrame(list(zip(y_score, y_pred, y_test)), 
                      columns=['score', 'predict', 'real'])
result['correct'] = (result.predict == result.real)
'''

   score predict real correct
0	99.0	1	1		True
1	96.9	1	1		True
2	94.9	1	0		False
3	92.8	1	1		True
4	90.8	1	0		False
5	88.7	1	1		True
6	86.7	1	1		True
7	84.6	1	1		True
8	82.6	1	0		False
9	80.5	1	0		False
10	78.5	1	1		True
11	76.4	1	0		False
12	74.4	1	1		True
13	72.3	1	1		True
14	70.3	0	0		True
15	68.2	0	1		False
16	66.2	0	0		True
17	64.1	0	0		True
18	62.1	0	0		True
19	60.0	0	0		True
'''

from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds1 = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

pd.DataFrame([thresholds1, tpr, fpr], index=['threshold','tpr','fpr'])

[Figure: table of thresholds, TPR, and FPR]

plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")

[Figure: ROC curve with AUC]

Precision-Recall Curve

  • precision, recall, threshold = precision_recall_curve(y_test, y_score)
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve

precision, recall, thresholds2 = precision_recall_curve(y_test, y_score)

auc_score = auc(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision, label='Precision-Recall curve (area = %0.2f)' % auc_score)
plt.legend(loc="upper right")

[Figure: precision-recall curve with AUC]


In most cases a higher AUC indicates better model performance, but it is not reliable when the dataset is heavily imbalanced.
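
A quick sketch of this point (my own example, not from the lab): on a heavily imbalanced dataset, the ROC AUC can stay high while the precision-recall AUC (average precision) is much lower, which often makes the PR curve the more informative of the two.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

X, y = make_classification(n_samples=10000, n_features=20, weights=[0.99, 0.01], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
scores = clf.predict_proba(X_test)[:, 1] # probability of the positive (rare) class

print('ROC AUC:', roc_auc_score(y_test, scores))
print('PR AUC (average precision):', average_precision_score(y_test, scores))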


Regression Performance

  • MAE (mean absolute error): y_true, y_pred

  • MSE (mean square error): y_true, y_pred

  • RMSE (root mean square error)

  • R-squared score (model.score()): y_true, y_pred

    [Figure: R-squared formula]

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
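
A minimal usage sketch, where y_true and y_pred stand for the label and prediction arrays of some regression model (placeholder names for illustration):

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)             # RMSE is the square root of MSE
r2 = r2_score(y_true, y_pred)   # same value as model.score() for regressors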


