[Machine Learning] Clustering

17 minute read


Clustering (군집화)

  • data from 전력거래소
  • 전력판매량(시도별/용도별) 엑셀 파일: https://goo.gl/Cx8Rzw
  • Agglomerative, KMeans, DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

data read

!curl -L "https://goo.gl/Cx8Rzw" -o power_data.xls
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100   184    0   184    0     0    159      0 --:--:--  0:00:01 --:--:--   159

100   318  100   318    0     0    219      0  0:00:01  0:00:01 --:--:--   219

  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  1076    0  1076    0     0    455      0 --:--:--  0:00:02 --:--:--  1259

  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
100 17920  100 17920    0     0   5619      0  0:00:03  0:00:03 --:--:-- 1750k


# Load the downloaded power-sales spreadsheet (one row per region,
# one column per usage category)
df = pd.read_excel('power_data.xls')
df.head()
구분 주거용 공공용 서비스업 업무용합계 농림어업 광업 제조업 식료품제조 섬유,의류 ... 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료 산업용합계 합계
0 강원 1940933 1400421 6203749 7604170 607139 398287 6002286 546621 13027 ... 35063 2019 38062 43986 113448 108629 12872 3418 7007712 16552816
1 개성 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 경기 16587710 5533662 33434551 38968213 2371347 317263 56603327 2544420 2109963 ... 3613798 317244 1040171 24519644 2977165 67594 1833112 133041 59291937 114847859
3 경남 4260988 1427560 8667737 10095297 2141813 95989 18053778 932743 346974 ... 1902913 8070 924235 534196 2156059 2048646 262523 47662 20291580 34647864
4 경북 3302463 1578115 8487402 10065517 1747462 224568 30115601 566071 3780171 ... 782570 14468 750786 4174971 2356890 123935 60280 77104 32087631 45455611

5 rows × 28 columns


# Column names (usage categories) and the default RangeIndex of the raw table
print(df.columns)
print(df.index)
Index(['구분', '주거용', '공공용', '서비스업', '업무용합계', '농림어업', '광업', '제조업', '식료품제조',
       '섬유,의류', '목재,나무', '펄프,종이', '출판,인쇄', '석유,화확', '의료,광학', '요업', '1차금속',
       '조립금속', '기타기계', '사무기기', '전기기기', '영상,음향', '자동차', '기타운송', '가구및기타', '재생재료',
       '산업용합계', '합계'],
      dtype='object')
RangeIndex(start=0, stop=19, step=1)
df.shape   # (rows, columns)
(19, 28)
df.info()   # dtypes and non-null counts per column
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   구분      19 non-null     object
 1   주거용     19 non-null     int64 
 2   공공용     19 non-null     int64 
 3   서비스업    19 non-null     int64 
 4   업무용합계   19 non-null     int64 
 5   농림어업    19 non-null     int64 
 6   광업      19 non-null     int64 
 7   제조업     19 non-null     int64 
 8   식료품제조   19 non-null     int64 
 9   섬유,의류   19 non-null     int64 
 10  목재,나무   19 non-null     int64 
 11  펄프,종이   19 non-null     int64 
 12  출판,인쇄   19 non-null     int64 
 13  석유,화확   19 non-null     int64 
 14  의료,광학   19 non-null     int64 
 15  요업      19 non-null     int64 
 16  1차금속    19 non-null     int64 
 17  조립금속    19 non-null     int64 
 18  기타기계    19 non-null     int64 
 19  사무기기    19 non-null     int64 
 20  전기기기    19 non-null     int64 
 21  영상,음향   19 non-null     int64 
 22  자동차     19 non-null     int64 
 23  기타운송    19 non-null     int64 
 24  가구및기타   19 non-null     int64 
 25  재생재료    19 non-null     int64 
 26  산업용합계   19 non-null     int64 
 27  합계      19 non-null     int64 
dtypes: int64(27), object(1)
memory usage: 4.3+ KB
df.describe()   # summary statistics of the numeric columns
주거용 공공용 서비스업 업무용합계 농림어업 광업 제조업 식료품제조 섬유,의류 목재,나무 ... 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료 산업용합계 합계
count 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 ... 1.900000e+01 19.000000 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 1.900000e+01 19.000000 1.900000e+01 1.900000e+01
mean 6.899673e+06 2.410981e+06 1.451057e+07 1.692155e+07 1.650270e+06 1.628526e+05 2.694144e+07 1.158857e+06 1.184641e+06 2.016269e+05 ... 1.107597e+06 51397.000000 6.087239e+05 5.018716e+06 1.878618e+06 4.595993e+05 3.581517e+05 59117.789474 2.875456e+07 5.257579e+07
std 1.457381e+07 4.957221e+06 3.031208e+07 3.526148e+07 3.464035e+06 3.102484e+05 5.669154e+07 2.399623e+06 2.604338e+06 4.514169e+05 ... 2.437518e+06 127516.047494 1.300814e+06 1.190370e+07 3.981244e+06 1.058183e+06 8.481672e+05 126709.246048 6.037040e+07 1.092609e+08
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000e+00 0.000000e+00
25% 1.906912e+06 6.959615e+05 3.802654e+06 4.524926e+06 7.203850e+04 9.938500e+03 2.759556e+06 1.959545e+05 7.325600e+04 5.590500e+03 ... 7.220050e+04 3672.000000 6.083550e+04 4.510550e+04 9.622850e+04 1.154750e+04 1.313200e+04 2739.000000 2.814293e+06 1.240509e+07
50% 2.326183e+06 1.089613e+06 5.690659e+06 6.654683e+06 6.071390e+05 7.152900e+04 1.236782e+07 5.329430e+05 3.338460e+05 2.799800e+04 ... 1.988470e+05 7240.000000 1.785020e+05 4.200050e+05 6.128980e+05 6.812700e+04 4.181400e+04 19725.000000 1.258230e+07 2.451531e+07
75% 4.058920e+06 1.413990e+06 8.034786e+06 9.476781e+06 1.837764e+06 1.822120e+05 2.366853e+07 1.034889e+06 8.374750e+05 1.033945e+05 ... 8.433595e+05 14393.500000 5.898460e+05 2.614198e+06 2.256474e+06 1.775380e+05 1.976150e+05 46850.000000 2.530336e+07 4.005174e+07
max 6.457642e+07 2.220411e+07 1.347485e+08 1.569527e+08 1.537399e+07 1.347957e+06 2.529425e+08 1.073583e+07 1.124758e+07 1.905882e+06 ... 1.050464e+07 487262.000000 5.763846e+06 4.765581e+07 1.779015e+07 4.311878e+06 3.396006e+06 559909.000000 2.696645e+08 4.911936e+08

8 rows × 27 columns


df.tail()   # the last row is the nationwide '합계' (total) row
구분 주거용 공공용 서비스업 업무용합계 농림어업 광업 제조업 식료품제조 섬유,의류 ... 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료 산업용합계 합계
14 전북 2326183 1096968 4910318 6007286 1415004 85300 12965875 1459217 731651 ... 159699 7240 130692 420005 859741 70980 16175 99003 14466179 22799647
15 제주 782601 301727 2308732 2610459 1364930 14019 241537 155987 3497 ... 1167 0 771 0 773 532 1743 743 1620486 5013545
16 충남 2691823 1089613 7164439 8254052 1928066 248313 37057955 1137035 269998 ... 611925 12208 428906 10953811 2526658 33766 53804 19725 39234334 50180209
17 충북 2027281 1267140 4804638 6071778 721131 139856 15883448 1152073 333846 ... 366871 23076 1125141 4103832 603349 82496 513501 46038 16744435 24843494
18 합계 64576423 22204112 134748546 156952658 15373994 1347957 252942540 10735833 11247578 ... 10504640 487262 5763846 47655808 17790147 4311878 3396006 559909 269664491 491193571

5 rows × 28 columns

# Index by region name, then drop the nationwide total row ('합계') and
# the all-zero Gaeseong row ('개성'); errors='ignore' makes re-runs safe.
df = df.set_index('구분').drop(['합계', '개성'], errors='ignore')
df.shape
(17, 27)
df.head()   # regions are now the index
주거용 공공용 서비스업 업무용합계 농림어업 광업 제조업 식료품제조 섬유,의류 목재,나무 ... 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료 산업용합계 합계
구분
강원 1940933 1400421 6203749 7604170 607139 398287 6002286 546621 13027 19147 ... 35063 2019 38062 43986 113448 108629 12872 3418 7007712 16552816
경기 16587710 5533662 33434551 38968213 2371347 317263 56603327 2544420 2109963 529274 ... 3613798 317244 1040171 24519644 2977165 67594 1833112 133041 59291937 114847859
경남 4260988 1427560 8667737 10095297 2141813 95989 18053778 932743 346974 60160 ... 1902913 8070 924235 534196 2156059 2048646 262523 47662 20291580 34647864
경북 3302463 1578115 8487402 10065517 1747462 224568 30115601 566071 3780171 72680 ... 782570 14468 750786 4174971 2356890 123935 60280 77104 32087631 45455611
광주 1954876 565527 3174973 3740500 74608 2898 2910768 161072 295922 6782 ... 198847 5967 236622 723764 512148 5140 13392 16049 2988274 8683649

5 rows × 27 columns


Fonts for Korean Letters (한글폰트)

# Colab 에서 한글 폰트 설정 - 설중 후에 꼭 다시 runtime restart 해 주어야 함
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

'apt'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.
# Make the minus sign render correctly when a Korean font is active
import platform
import matplotlib
from matplotlib import font_manager, rc

matplotlib.rcParams['axes.unicode_minus'] = False

# Per-OS Korean font configuration
if platform.system() == 'Windows':
    # Raw string: in a normal string "\W", "\F", "\m" are invalid escape
    # sequences and raise SyntaxWarning on recent Python versions.
    path = r"c:\Windows\Fonts\malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Linux':
    rc('font', family='NanumBarunGothic')
df.head(2)   # quick check that the data survived the font setup cells
주거용 공공용 서비스업 업무용합계 농림어업 광업 제조업 식료품제조 섬유,의류 목재,나무 ... 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료 산업용합계 합계
구분
강원 1940933 1400421 6203749 7604170 607139 398287 6002286 546621 13027 19147 ... 35063 2019 38062 43986 113448 108629 12872 3418 7007712 16552816
경기 16587710 5533662 33434551 38968213 2371347 317263 56603327 2544420 2109963 529274 ... 3613798 317244 1040171 24519644 2977165 67594 1833112 133041 59291937 114847859

2 rows × 27 columns


# The grand-total column would dwarf every other bar, so remove it
# before plotting the stacked horizontal bars.
df = df.drop(columns="합계")
df.plot(kind='barh', figsize=(10, 6), stacked=True)

output_18_1

# Focus on two sectors only; see_c is reused by later cells
see_c = ['서비스업','제조업']
df[see_c].plot(kind='barh', figsize=(10,6), stacked=True)

output_19_1


# Take an explicit copy: df[see_c] is a slice of df, and later cells
# assign into df2's columns (scaling, cluster labels), which triggers
# SettingWithCopyWarning — and undefined write behavior — on a non-copy.
df2 = df[see_c].copy()
df2.head(5)
서비스업 제조업
구분
강원 6203749 6002286
경기 33434551 56603327
경남 8667737 18053778
경북 8487402 30115601
광주 3174973 2910768


scatter plot

# Scatter plot of service vs. manufacturing usage, one point per region
plt.figure(figsize=(6,6))
plt.scatter(df2['서비스업'], df2['제조업'], c='k', marker='o')
plt.xlabel('서비스업')
plt.ylabel('제조업')

# Label each point with its region name. Use .iloc for positional access:
# df2['서비스업'][n] relies on the integer-position fallback of a labeled
# Series, which is deprecated/removed in modern pandas.
for n in range(df2.shape[0]):
    plt.text(df2['서비스업'].iloc[n] * 1.03, df2['제조업'].iloc[n] * 0.98, df2.index[n])

output_22_0

# Gyeonggi and Seoul dominate the plot; drop both outlier regions so the
# remaining points become distinguishable.
df2 = df2.drop(index=['경기', '서울'])
df2.shape
(15, 2)
# Same scatter plot after removing the two outlier regions
plt.figure(figsize=(6,6))
plt.scatter(df2['서비스업'], df2['제조업'], c='k', marker='o')
plt.xlabel('서비스업')
plt.ylabel('제조업')

# .iloc for positional access (plain [n] on a labeled Series is deprecated)
for n in range(df2.shape[0]):
    plt.text(df2['서비스업'].iloc[n] * 1.03, df2['제조업'].iloc[n] * 0.98, df2.index[n])

output_24_0



Agglomerative clustering (Hierarchical) and Dendrogram

  • linkage: dataframe, metric, method
from scipy.cluster.hierarchy import dendrogram, linkage

plt.figure(figsize=(10, 5))
# Merge history: each row is [cluster_i, cluster_j, distance, new_size]
link_dist = linkage(df2, metric='euclidean', method='centroid')
link_dist
array([[0.00000000e+00, 4.00000000e+00, 7.46490444e+05, 2.00000000e+00],
       [3.00000000e+00, 5.00000000e+00, 8.37460840e+05, 2.00000000e+00],
       [7.00000000e+00, 1.20000000e+01, 2.08750703e+06, 2.00000000e+00],
       [9.00000000e+00, 1.10000000e+01, 2.32242339e+06, 2.00000000e+00],
       [6.00000000e+00, 1.50000000e+01, 2.35416537e+06, 3.00000000e+00],
       [1.60000000e+01, 1.70000000e+01, 2.81483295e+06, 4.00000000e+00],
       [1.40000000e+01, 1.80000000e+01, 3.44294208e+06, 3.00000000e+00],
       [1.00000000e+00, 1.00000000e+01, 4.51929196e+06, 2.00000000e+00],
       [1.90000000e+01, 2.00000000e+01, 6.06223563e+06, 7.00000000e+00],
       [2.10000000e+01, 2.20000000e+01, 6.21282975e+06, 5.00000000e+00],
       [2.00000000e+00, 8.00000000e+00, 6.42807846e+06, 2.00000000e+00],
       [1.30000000e+01, 2.50000000e+01, 9.12465562e+06, 3.00000000e+00],
       [2.30000000e+01, 2.40000000e+01, 1.25088770e+07, 1.20000000e+01],
       [2.60000000e+01, 2.70000000e+01, 2.21152298e+07, 1.50000000e+01]])


# method = ward, median, centroid, average, weighted, complete, single
dendrogram(link_dist, labels=list(df2.index))
plt.show()

output_27_0


Clustering after Scaling (Don’t forget!!)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardize both columns (zero mean, unit variance) so the Euclidean
# distances used by linkage() are not dominated by the larger-scale feature.
df2[['서비스업', '제조업']] = scaler.fit_transform(df2[['서비스업', '제조업']])

plt.figure(figsize=(6,6))
plt.scatter(df2['서비스업'], df2['제조업'], c='k', marker='o')
plt.xlabel('서비스업')
plt.ylabel('제조업')

# .iloc for positional access (plain [n] on a labeled Series is deprecated)
for n in range(df2.shape[0]):
    plt.text(df2['서비스업'].iloc[n] * 1.03, df2['제조업'].iloc[n] * 0.98, df2.index[n])

Z = linkage(df2, metric='euclidean', method='centroid')
plt.figure(figsize=(10, 5))
plt.title('Dendrogram')
# Pass a plain list: some scipy versions reject a pandas Index as labels
dendrogram(Z, labels=list(df2.index))
plt.show()

output_29_0

output_29_1



KMeans

  • KMeans: n_clusters
  1. Initialize k centroids.
  2. Assign each data to the nearest centroid, these step will create clusters.
  3. Recalculate centroid - which is mean of all data belonging to same cluster.
  4. Repeat steps 2 & 3 until no data point is reassigned to a different centroid.

Animation to explain algo - http://tech.nitoyon.com/en/blog/2013/11/07/k-means/

from sklearn.cluster import KMeans

# n_init made explicit (its default changed to 'auto' in sklearn 1.4) and
# a fixed random_state makes the cluster labels reproducible across runs.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df2)
print(km.n_clusters)
3
km.labels_, km.cluster_centers_   # per-row cluster ids and the 3 centroids
(array([1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 0, 0, 1, 2, 0]),
 array([[ 6245553.6       , 16144968.6       ],
        [ 4191629.42857143,  3805868.14285714],
        [ 6433742.33333333, 31018896.        ]]))


km_labels_two = km.labels_       # keep for comparison with later runs
# Attach the assignments as a column so the table shows each region's cluster
df2['클러스터'] = km.labels_
df2
서비스업 제조업 클러스터
구분
강원 6203749 6002286 1
경남 8667737 18053778 0
경북 8487402 30115601 2
광주 3174973 2910768 1
대구 5470438 5862633 1
대전 3955921 2608343 1
부산 7582169 7512588 1
세종 645424 1502922 1
울산 3649386 25883132 2
인천 7154416 12367816 0
전남 5690659 21453926 0
전북 4910318 12965875 0
제주 2308732 241537 1
충남 7164439 37057955 2
충북 4804638 15883448 0


# Remove the temporary cluster column again before further processing
df2.drop(columns='클러스터', inplace=True)
df2.head()
서비스업 제조업
구분
강원 6203749 6002286
경남 8667737 18053778
경북 8487402 30115601
광주 3174973 2910768
대구 5470438 5862633
# Cluster centers in the original (unscaled) feature space
centers = km.cluster_centers_
centers
array([[ 6245553.6       , 16144968.6       ],
       [ 4191629.42857143,  3805868.14285714],
       [ 6433742.33333333, 31018896.        ]])


# Marker/colour per cluster id (only the first n_clusters entries are used);
# my_markers and my_color are reused by a later cell.
my_markers = ['*', '^', 'o', '^', '.', ',', '1', '2']
my_color = ['r', 'c', 'g', 'b', 'g', 'k', 'r', 'y']

plt.figure(figsize=(10, 8))
plt.xlabel('서비스업')
plt.ylabel('제조업')
for n in range(df2.shape[0]):
    label = km.labels_[n]
    # .iloc for positional access (plain [n] on a labeled Series is deprecated)
    plt.scatter(df2['서비스업'].iloc[n], df2['제조업'].iloc[n],
                c=my_color[label], marker=my_markers[label], s=100)
    plt.text(df2['서비스업'].iloc[n] * 1.03, df2['제조업'].iloc[n] * 0.98, df2.index[n])

# Overlay the cluster centers
for i in range(km.n_clusters):
    plt.scatter(centers[i][0], centers[i][1], c='b', s=50)

output_37_0


Clustering after Scaling (Don’t forget!!)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardize so both features contribute equally to Euclidean distance
df2[['서비스업', '제조업']] = scaler.fit_transform(df2[['서비스업', '제조업']])
df2.head(3)
서비스업 제조업
구분
강원 0.393992 -0.676282
경남 1.498349 0.431200
경북 1.417523 1.539632


# Re-fit on the scaled features; explicit n_init + random_state for
# reproducibility (n_init's default changed in sklearn 1.4).
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df2)
centers = km.cluster_centers_
km_labels_two_scaled = km.labels_        # keep for comparison later
# (the original plt.clf() here was a no-op: plt.figure() opens a new figure)
plt.figure(figsize=(8, 6))
plt.xlabel('서비스업')
plt.ylabel('제조업')

for n in range(df2.shape[0]):
    label = km.labels_[n]
    # .iloc for positional access (plain [n] on a labeled Series is deprecated)
    plt.scatter(df2['서비스업'].iloc[n], df2['제조업'].iloc[n],
                c=my_color[label], marker=my_markers[label], s=100)
    plt.text(df2['서비스업'].iloc[n] * 1.05, df2['제조업'].iloc[n] * 0.99, df2.index[n])

for i in range(km.n_clusters):
    plt.scatter(centers[i][0], centers[i][1], c='k', s=50)

output_42_1



Let’s use all features for clustering (instead of two)

df.head().T   # transpose so the many columns read as rows
구분 강원 경기 경남 경북 광주
주거용 1940933 16587710 4260988 3302463 1954876
공공용 1400421 5533662 1427560 1578115 565527
서비스업 6203749 33434551 8667737 8487402 3174973
업무용합계 7604170 38968213 10095297 10065517 3740500
농림어업 607139 2371347 2141813 1747462 74608
광업 398287 317263 95989 224568 2898
제조업 6002286 56603327 18053778 30115601 2910768
식료품제조 546621 2544420 932743 566071 161072
섬유,의류 13027 2109963 346974 3780171 295922
목재,나무 19147 529274 60160 72680 6782
펄프,종이 24382 1917458 817685 361772 41827
출판,인쇄 7727 731348 28486 44402 22038
석유,화확 175323 6881775 1865583 3653665 391151
의료,광학 84397 1336390 178498 217771 27221
요업 3695776 1728379 429920 1269917 20728
1차금속 1038913 2020196 3809547 10874970 75702
조립금속 39477 2302355 1699879 933178 156396
기타기계 35063 3613798 1902913 782570 198847
사무기기 2019 317244 8070 14468 5967
전기기기 38062 1040171 924235 750786 236622
영상,음향 43986 24519644 534196 4174971 723764
자동차 113448 2977165 2156059 2356890 512148
기타운송 108629 67594 2048646 123935 5140
가구및기타 12872 1833112 262523 60280 13392
재생재료 3418 133041 47662 77104 16049
산업용합계 7007712 59291937 20291580 32087631 2988274


# Remove the aggregate columns (they are sums of the others) and the two
# outlier regions; errors='ignore' keeps the cell re-runnable.
df.drop(columns=['업무용합계', '산업용합계', '합계'], inplace=True, errors='ignore')
df.drop(index=['경기', '서울'], inplace=True, errors='ignore')
df.head()
주거용 공공용 서비스업 농림어업 광업 제조업 식료품제조 섬유,의류 목재,나무 펄프,종이 출판,인쇄 석유,화확 의료,광학 요업 1차금속 조립금속 기타기계 사무기기 전기기기 영상,음향 자동차 기타운송 가구및기타 재생재료
구분
강원 1940933 1400421 6203749 607139 398287 6002286 546621 13027 19147 24382 7727 175323 84397 3695776 1038913 39477 35063 2019 38062 43986 113448 108629 12872 3418
경남 4260988 1427560 8667737 2141813 95989 18053778 932743 346974 60160 817685 28486 1865583 178498 429920 3809547 1699879 1902913 8070 924235 534196 2156059 2048646 262523 47662
경북 3302463 1578115 8487402 1747462 224568 30115601 566071 3780171 72680 361772 44402 3653665 217771 1269917 10874970 933178 782570 14468 750786 4174971 2356890 123935 60280 77104
광주 1954876 565527 3174973 74608 2898 2910768 161072 295922 6782 41827 22038 391151 27221 20728 75702 156396 198847 5967 236622 723764 512148 5140 13392 16049
대구 3151904 826396 5470438 69142 5858 5862633 212626 1057342 16215 445646 46804 418485 85871 68137 317580 661307 516493 58446 180189 252662 1381273 68127 41814 33616


# Keep the labels aside: scaling below returns a bare numpy array,
# which loses the index and column names.
index_ = df.index
column_ = df.columns
index_
Index(['강원', '경남', '경북', '광주', '대구', '대전', '부산', '세종', '울산', '인천', '전남', '전북',
       '제주', '충남', '충북'],
      dtype='object', name='구분')
column_   # remaining feature columns after dropping the aggregates
Index(['주거용', '공공용', '서비스업', '농림어업', '광업', '제조업', '식료품제조', '섬유,의류', '목재,나무',
       '펄프,종이', '출판,인쇄', '석유,화확', '의료,광학', '요업', '1차금속', '조립금속', '기타기계',
       '사무기기', '전기기기', '영상,음향', '자동차', '기타운송', '가구및기타', '재생재료'],
      dtype='object')
type(column_)   # a pandas Index, not a plain list
pandas.core.indexes.base.Index
list(column_)   # converted to a plain Python list
['주거용',
 '공공용',
 '서비스업',
 '농림어업',
 '광업',
 '제조업',
 '식료품제조',
 '섬유,의류',
 '목재,나무',
 '펄프,종이',
 '출판,인쇄',
 '석유,화확',
 '의료,광학',
 '요업',
 '1차금속',
 '조립금속',
 '기타기계',
 '사무기기',
 '전기기기',
 '영상,음향',
 '자동차',
 '기타운송',
 '가구및기타',
 '재생재료']
list(column_).index('제조업'), list(column_).index('서비스업')   # column positions
(5, 2)


Scaling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)    # returns a numpy array (not a DataFrame)
print(type(df), type(df_scaled))
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


Agglomerative

# Hierarchical clustering on all (scaled) features
Z = linkage(df_scaled, metric='euclidean', method='centroid')
plt.figure(figsize=(10, 5))
plt.title('Dendrogram for all features (scaled)')
# Pass a plain list: some scipy versions reject a pandas Index as labels
dendrogram(Z, labels=list(index_))
plt.show()

output_55_0


KMeans

# KMeans on all scaled features; explicit n_init (its default changed in
# sklearn 1.4) and a fixed seed make the labels reproducible.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df_scaled)
print(km.cluster_centers_)
print(km.labels_)
[[ 0.78279298  0.44601929  0.49612314 -0.14792601 -0.2202511  -0.11522065
   0.61689344 -0.07050385  0.56693391  0.41413847  0.54521244 -0.34444738
   0.42201991 -0.08911986 -0.28490127  0.72185836  0.69386845  0.50846817
   0.37057962 -0.15995802  0.12890552  0.25516914  0.7424765   0.59481401]
 [-0.80670038 -0.65022296 -0.74554771 -0.15444205 -0.1766458  -0.43227338
  -0.72564872 -0.39191158 -0.50458273 -0.47379294 -0.79242599  0.16406657
  -0.45929592 -0.02806164 -0.39192638 -0.78715114 -0.73512937 -0.48012602
  -0.5501468  -0.46830075 -0.56935008 -0.10680832 -0.56171066 -0.71096136]
 [ 0.47507237  0.93772248  1.12104758  0.98432519  1.2790136   1.85861879
   0.68909021  1.58320209  0.06523783  0.4158599   1.13785365  0.45910913
   0.34147599  0.3655753   2.22644616  0.5894539   0.49134744  0.15503658
   0.81377495  2.11892669  1.6060087  -0.39167828 -0.26144221  0.70392275]]
[1 0 2 1 0 1 0 1 1 0 1 0 1 2 0]
print("all features, scaled: ", km.labels_)
print("two features:         ", km_labels_two)
print("two features, scaled: ", km_labels_two_scaled)
all features, scaled:  [1 0 2 1 0 1 0 1 1 0 1 0 1 2 0]
two features:          [2 0 1 2 2 2 2 2 1 0 0 0 2 1 0]
two features, scaled:  [0 1 1 2 0 2 0 2 0 0 0 0 2 1 0]



DBSCAN

  • DBSCAN: eps, min_samples, metric

  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
  • metric: The metric to use when calculating distance between instances in a feature array.
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
# Two interleaving half-moons; fixed seed for reproducibility
X, y = make_moons(n_samples=300, noise=0.1, random_state=11)   # X: samples (300, 2), y: true labels

X[:5],y[:5]
(array([[ 0.07466556, -0.13809597],
        [-0.96718677,  0.77131154],
        [-0.211739  ,  1.01774394],
        [ 1.22822953, -0.34209166],
        [ 0.99307628,  0.13267972]]), array([1, 0, 0, 1, 0]))
# Raw samples, single colour
plt.scatter(X[:,0], X[:,1], c='b')
plt.show()

output_60_0


# KMeans partitions by distance to centroids, so it cannot follow
# the curved moon shapes
kmeans = KMeans(n_clusters=2)
predict = kmeans.fit_predict(X)   # returns the predicted cluster-label array

predict
array([0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int32)
# Colour points by the KMeans assignment
plt.scatter(X[:,0], X[:,1],c=predict)

output_62_1


# Density-based clustering: eps is the neighbourhood radius, min_samples
# the density threshold for a core point
dbscan = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
predict = dbscan.fit_predict(X)
plt.scatter(X[:,0], X[:,1],c=predict)

output_63_1


predict   # DBSCAN labels; -1 would mark noise (every point got a cluster here)
array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0])


# Smaller eps shrinks each neighbourhood — presumably the moons fragment
# into more clusters/noise; confirm from the plot
dbscan = DBSCAN(eps=0.1, min_samples=5, metric='euclidean')
predict = dbscan.fit_predict(X)
plt.scatter(X[:,0], X[:,1],c=predict)

output_65_1


# Small eps combined with a high min_samples: almost every point fails the
# core-point test and is labelled -1 (noise), as the printout shows
dbscan = DBSCAN(eps=0.1, min_samples=10, metric='euclidean')
predict = dbscan.fit_predict(X)
print(predict)
plt.scatter(X[:,0], X[:,1],c=predict)
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1  0 -1
 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]

output_66_2


# Larger eps — neighbourhoods likely bridge the gap between the two moons
# (presumably merging them into one cluster; confirm from the plot)
dbscan = DBSCAN(eps=0.5, min_samples=10, metric='euclidean')
predict = dbscan.fit_predict(X)
plt.scatter(X[:,0], X[:,1],c=predict)

output_67_1


# Same eps as the first DBSCAN run, slightly stricter density requirement
dbscan = DBSCAN(eps=0.2, min_samples=6, metric='euclidean')
predict = dbscan.fit_predict(X)
plt.scatter(X[:,0], X[:,1],c=predict)

output_68_1


# Much higher min_samples at the same eps demands denser core regions
dbscan = DBSCAN(eps=0.2, min_samples=15, metric='euclidean')
predict = dbscan.fit_predict(X)
plt.scatter(X[:,0], X[:,1],c=predict)

output_69_1



Leave a comment