  • 감성분석
    • 네이버 영화평점 (Naver sentiment movie corpus v.1.0) 데이터(https://github.com/e9t/nsmc)
    • 영화 리뷰 20만건이 저장됨. 각 평가 데이터는 0(부정), 1(긍정)으로 label 됨.
  • 한글 자연어 처리
    • KoNLPy(“코엔엘파이”라고 읽습니다)는 한국어 정보처리를 위한 파이썬 패키지입니다.
    • konlpy 패키지에서 제공하는 Twitter라는 문서 분석 라이브러리 사용 (트위터 분석 뿐 아니라 한글 텍스트 처리도 가능)
    • colab 사용 권장
!pip install konlpy
# 패키지 설치
import konlpy
import pandas as pd
import numpy as np
from konlpy.tag import Twitter # Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn import model_selection, metrics

# 토큰 파서(품사별)
_twitter_tagger = None  # created on first use, then shared by every call


def twitter_tokenizer(text):
    """Split *text* into morphemes with KoNLPy's Twitter tagger.

    The tagger is built once and reused: TfidfVectorizer calls this function
    once per document, so constructing a new ``Twitter()`` on every call
    repeats the tagger set-up thousands of times for no benefit.

    Args:
        text: The sentence or document to tokenize.

    Returns:
        Whatever ``Twitter().morphs(text)`` returns (the morphemes of *text*).
    """
    global _twitter_tagger
    if _twitter_tagger is None:
        _twitter_tagger = Twitter()
    return _twitter_tagger.morphs(text)
twitter_tokenizer("Welcome to data science world!...한국말도 똑 같아요...")

!curl -L https://bit.ly/2X9Owwr -o ratings_train.txt
!curl -L https://bit.ly/2WuLd5I -o ratings_test.txt
# 데이터 로드
# Both NSMC splits share the same layout: tab-separated, and empty cells are
# kept as empty strings rather than being converted to NaN.
_read_opts = dict(delimiter='\t', keep_default_na=False)
df_train = pd.read_csv('ratings_train.txt', **_read_opts)
df_test = pd.read_csv('ratings_test.txt', **_read_opts)





df_train['document'].values == df_train['document'].to_numpy()
array([ True,  True,  True, ...,  True,  True,  True])
# Pull the review text and the 0/1 label out of each frame as NumPy arrays.
text_train = df_train['document'].to_numpy()
y_train = df_train['label'].to_numpy()
text_test = df_test['document'].to_numpy()
y_test = df_test['label'].to_numpy()
text_train.shape, y_train.shape, text_test.shape, y_test.shape
((150000,), (150000,), (50000,), (50000,))

# The full corpus is too large for a quick demo, so keep only the first
# 2000 training reviews and the first 1000 test reviews.
text_train = text_train[:2000]
y_train = y_train[:2000]
text_test = text_test[:1000]
y_test = y_test[:1000]
y_train.shape, y_test.shape
((2000,), (1000,))
y_train.mean(), y_test.mean()    # check distribution of classes 1 and 0
(0.4945, 0.508)

# token_pattern=None because the custom tokenizer replaces the regex-based
# one; leaving the default pattern in place only makes scikit-learn warn
# that token_pattern will not be used.
cv = TfidfVectorizer(tokenizer=twitter_tokenizer, token_pattern=None,
                     max_features=3000)
# Learn the vocabulary and IDF weights from the training text only.
X_train = cv.fit_transform(text_train)
# Reuse the fitted vocabulary for the test text. Calling fit_transform here
# would build a different vocabulary, so the columns would no longer line up.
X_test = cv.transform(text_test)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((2000, 3000), (2000,), (1000, 3000), (1000,))
['????', '???', '???', '?????', '????']

Linear Classification

from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent (default settings).
clf = SGDClassifier()
result = clf.fit(X_train, y_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and returns the vocabulary terms.
feature_names = cv.get_feature_names_out()
print("Training : ", result.score(X_train, y_train))
print("Testing : ", result.score(X_test, y_test))
Training :  0.985
Testing :  0.739

Logistic Regression

from sklearn.linear_model import LogisticRegression
# Logistic regression baseline on the same TF-IDF features (default settings).
lr = LogisticRegression()
result = lr.fit(X_train, y_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and returns the vocabulary terms.
feature_names = cv.get_feature_names_out()
print("Training : ", result.score(X_train, y_train))
print("Testing : ", result.score(X_test, y_test))
Training :  0.916
Testing :  0.771


# Use a one-hot encoded target (the 2-class problem treated as multi-class).
from tensorflow.keras.utils import to_categorical
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)
max_words = X_train.shape[1]
batch_size = 100
nb_epoch = 5

# Two hidden ReLU layers followed by a 2-way softmax output.
model = Sequential()
model.add(Dense(64, input_shape=(max_words,), activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(2, activation='softmax'))
# NOTE(review): the compile() and fit() calls were cut off mid-statement
# (unbalanced parentheses). The closing arguments are reconstructed:
# metrics=['accuracy'] as in the other compile() calls in this file, and
# batch_size from the variable defined above. Confirm against the original.
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Keras Dense layers need dense input, hence .toarray() on the sparse matrix.
model.fit(X_train.toarray(), y_train_ohe, epochs=nb_epoch,
          batch_size=batch_size)
print("Training : ", model.evaluate(X_train.toarray(), y_train_ohe))
print("Testing : ", model.evaluate(X_test.toarray(), y_test_ohe))
# Use the raw 0/1 target (binary classification).
max_words = X_train.shape[1]
batch_size = 100
nb_epoch = 5

model = Sequential()
# NOTE(review): unlike the one-hot variant, this first layer has no
# activation (it is purely linear). Left as is; confirm it is intentional.
model.add(Dense(64, input_shape=(max_words,)))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation='sigmoid'))
# NOTE(review): the compile() and fit() calls were cut off mid-statement
# (unbalanced parentheses). The closing arguments are reconstructed from the
# training log: it reports an "accuracy" metric and 20 steps per epoch for
# 2000 samples, i.e. a batch size of 100. Confirm against the original.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Keras Dense layers need dense input, hence .toarray() on the sparse matrix.
model.fit(X_train.toarray(), y_train, epochs=nb_epoch,
          batch_size=batch_size)
print("Training : ", model.evaluate(X_train.toarray(), y_train))
print("Testing : ", model.evaluate(X_test.toarray(), y_test))
Epoch 1/5
20/20 [==============================] - 1s 3ms/step - loss: 0.6910 - accuracy: 0.5560
Epoch 2/5
20/20 [==============================] - 0s 3ms/step - loss: 0.6697 - accuracy: 0.7800
Epoch 3/5
20/20 [==============================] - 0s 4ms/step - loss: 0.5963 - accuracy: 0.8635
Epoch 4/5
20/20 [==============================] - 0s 3ms/step - loss: 0.4505 - accuracy: 0.8945
Epoch 5/5
20/20 [==============================] - 0s 3ms/step - loss: 0.2864 - accuracy: 0.9310
63/63 [==============================] - 1s 3ms/step - loss: 0.2006 - accuracy: 0.9540
Training :  [0.20061413943767548, 0.9539999961853027]
32/32 [==============================] - 0s 3ms/step - loss: 0.4924 - accuracy: 0.7530
Testing :  [0.4923769235610962, 0.753000020980835]


# Just for checking: compares row 0 obtained through the `.A` shorthand with
# row 0 obtained through `.toarray()`, element by element.
# NOTE(review): recent SciPy releases deprecate/remove `.A`, so this check
# presumably only runs on older versions; prefer `.toarray()` elsewhere.
X_train.A[0] == X_train.toarray()[0]
array([ True,  True,  True, ...,  True,  True,  True])
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((2000, 3000), (1000, 3000), (2000,), (1000,))
# Reshape the features into the 3-D layout that recurrent layers take as
# input: (samples, timesteps, features), with a single timestep per review.
# .toarray() replaces the `.A` shorthand, which SciPy deprecated (1.11) and
# later removed for sparse containers.
X_train_rnn = X_train.toarray().reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = X_test.toarray().reshape((X_test.shape[0], 1, X_test.shape[1]))

print(X_train_rnn.shape, X_test_rnn.shape)
(2000, 1, 3000) (1000, 1, 3000)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential()
# NOTE(review): this cell was truncated (only fragments of the add()/compile()
# calls survived). Layer order, unit counts and output shapes are rebuilt
# from the printed model summary that follows this cell. The activation
# function and the dropout rate cannot be recovered from that summary and
# are assumptions. TODO: confirm against the original notebook.
model.add(SimpleRNN(128,
                    input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2]),
                    return_sequences=True))
# return_sequences: return the last output in the output sequence, or the full sequence
# By this, it is possible to access the hidden state output for each input time step.
model.add(Activation("relu"))   # assumed activation
model.add(Dropout(0.2))         # assumed rate
# The last recurrent layer returns only its final output so that the Dense
# classifier receives a 2-D (batch, units) tensor.
model.add(SimpleRNN(128))
model.add(Activation("relu"))   # assumed activation
model.add(Dropout(0.2))         # assumed rate
model.add(Dense(2, activation="softmax"))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential_2"
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 1, 128)            400512    
activation_1 (Activation)    (None, 1, 128)            0         
dropout_4 (Dropout)          (None, 1, 128)            0         
simple_rnn_1 (SimpleRNN)     (None, 128)               32896     
activation_2 (Activation)    (None, 128)               0         
dropout_5 (Dropout)          (None, 128)               0         
dense_6 (Dense)              (None, 2)                 258       
Total params: 433,666
Trainable params: 433,666
Non-trainable params: 0

# The call was cut off after batch_size; epochs=5 matches the "Epoch n/5"
# training log printed for this cell.
model.fit(X_train_rnn, y_train_ohe, batch_size=100, epochs=5)
Epoch 1/5
20/20 [==============================] - 2s 24ms/step - loss: 0.6887 - accuracy: 0.5465
Epoch 2/5
20/20 [==============================] - 0s 6ms/step - loss: 0.6325 - accuracy: 0.7965
Epoch 3/5
20/20 [==============================] - 0s 6ms/step - loss: 0.4334 - accuracy: 0.8890
Epoch 4/5
20/20 [==============================] - 0s 6ms/step - loss: 0.2157 - accuracy: 0.9240
Epoch 5/5
20/20 [==============================] - 0s 7ms/step - loss: 0.1026 - accuracy: 0.9690

# Predicted class = position of the larger of the two softmax outputs.
test_probs = model.predict(X_test_rnn)
y_pred = np.argmax(test_probs, axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
# evaluate() reports [loss, accuracy] for the compiled model.
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.765
63/63 [==============================] - 1s 4ms/step - loss: 0.0617 - accuracy: 0.9870
Training :  [0.061724983155727386, 0.9869999885559082]
32/32 [==============================] - 0s 3ms/step - loss: 0.5956 - accuracy: 0.7650
Testing :  [0.5956090688705444, 0.7649999856948853]


  • https://colah.github.io/posts/2015-08-Understanding-LSTMs/
from keras.layers import LSTM
model = Sequential()
# NOTE(review): the LSTM add() call was truncated (only its input_shape line
# survived) and the fit() call was cut off. The unit count (128) is assumed
# to mirror the SimpleRNN/GRU models in this file. TODO: confirm.
# The layer returns only its final output so that the Dense classifier gets
# a 2-D (batch, units) tensor matching the one-hot targets.
model.add(LSTM(128,
               input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# epochs=5 matches the "Epoch n/5" training log printed for this cell.
model.fit(X_train_rnn, y_train_ohe, batch_size=100, epochs=5)
Epoch 1/5
20/20 [==============================] - 6s 10ms/step - loss: 0.6931 - accuracy: 0.5375
Epoch 2/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6903 - accuracy: 0.8005
Epoch 3/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6703 - accuracy: 0.8725
Epoch 4/5
20/20 [==============================] - 0s 10ms/step - loss: 0.5868 - accuracy: 0.8930
Epoch 5/5
20/20 [==============================] - 0s 10ms/step - loss: 0.4041 - accuracy: 0.9105

# Predicted class = position of the larger of the two softmax outputs.
lstm_probs = model.predict(X_test_rnn)
y_pred = np.argmax(lstm_probs, axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
# evaluate() reports [loss, accuracy] for the compiled model.
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.774
63/63 [==============================] - 1s 5ms/step - loss: 0.2751 - accuracy: 0.9315
Training :  [0.27514004707336426, 0.9315000176429749]
32/32 [==============================] - 0s 5ms/step - loss: 0.4868 - accuracy: 0.7740
Testing :  [0.4867841601371765, 0.7739999890327454]


from keras.layers import GRU
model = Sequential()
# The GRU feeds the Dense classifier directly, so it must emit only its final
# output. With return_sequences=True the model output would be (batch, 1, 2),
# which matches neither the (batch, 2) one-hot targets nor the
# argmax(axis=1) used for prediction.
# NOTE(review): if stacked recurrent layers were dropped from this cell,
# restore them and keep return_sequences=True on all but the last one.
model.add(GRU(128, input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The call was cut off after batch_size; epochs=5 matches the "Epoch n/5"
# training log printed for this cell.
model.fit(X_train_rnn, y_train_ohe, batch_size=100, epochs=5)
Epoch 1/5
20/20 [==============================] - 4s 10ms/step - loss: 0.6921 - accuracy: 0.5170
Epoch 2/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6716 - accuracy: 0.7150
Epoch 3/5
20/20 [==============================] - 0s 10ms/step - loss: 0.5628 - accuracy: 0.8495
Epoch 4/5
20/20 [==============================] - 0s 11ms/step - loss: 0.3357 - accuracy: 0.9130
Epoch 5/5
20/20 [==============================] - 0s 10ms/step - loss: 0.1801 - accuracy: 0.9390

# Predicted class = position of the larger of the two softmax outputs.
gru_probs = model.predict(X_test_rnn)
y_pred = np.argmax(gru_probs, axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
# evaluate() reports [loss, accuracy] for the compiled model.
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.766
63/63 [==============================] - 1s 5ms/step - loss: 0.1123 - accuracy: 0.9670
Training :  [0.11229105293750763, 0.9670000076293945]
32/32 [==============================] - 0s 5ms/step - loss: 0.5520 - accuracy: 0.7660
Testing :  [0.552036702632904, 0.765999972820282]


한국어 불용어 처리

  • 한국어 불용어 확인은 형태소 분석 라이브러리인 KoNLPy 를 이용하면 됨.
  • (예) 한국어 품사 중 조사를 추출하는 예
  • pos (part-of-speech): 품사 (명사, 동사, …)
from konlpy.tag import Twitter
Twitter().morphs("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제")

Twitter().pos("텍스트 데이터를 이용해서 불용어 사잔을 구축하기 위한 간단 예제")
[('텍스트', 'Noun'),
 ('데이터', 'Noun'),
 ('를', 'Josa'),
 ('이용', 'Noun'),
 ('해서', 'Verb'),
 ('불', 'Noun'),
 ('용어', 'Noun'),
 ('사잔', 'Noun'),
 ('을', 'Josa'),
 ('구축', 'Noun'),
 ('하기', 'Verb'),
 ('위', 'Noun'),
 ('한', 'Josa'),
 ('간단', 'Noun'),
 ('예제', 'Noun')]

# norm=True asks the tagger to normalize tokens before tagging.
# NOTE(review): this input already spells '사전' correctly, whereas the call
# above used the misspelling '사잔'. As written, this call therefore does not
# itself demonstrate a correction; verify what norm actually changes.
Twitter().pos("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제", norm=True)   # norm - token normalization
[('텍스트', 'Noun'),
 ('데이터', 'Noun'),
 ('를', 'Josa'),
 ('이용', 'Noun'),
 ('해서', 'Verb'),
 ('불', 'Noun'),
 ('용어', 'Noun'),
 ('사전', 'Noun'),
 ('을', 'Josa'),
 ('구축', 'Noun'),
 ('하기', 'Verb'),
 ('위', 'Noun'),
 ('한', 'Josa'),
 ('간단', 'Noun'),
 ('예제', 'Noun')]

Twitter().nouns("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제")
['텍스트', '데이터', '이용', '불', '용어', '사전', '구축', '위', '간단', '예제']
  • norm: 오타수정, stem: 어근 찾기

from konlpy.tag import Twitter

# Tag every token (with normalization and stemming enabled), then collect
# the tokens whose part-of-speech tag is "Josa" as the stop-word list.
word_tags = Twitter().pos("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제", norm=True, stem=True)
stop_words = []
for token, pos_tag in word_tags:
    if pos_tag == "Josa":
        stop_words.append(token)
print (stop_words)
[('텍스트', 'Noun'), ('데이터', 'Noun'), ('를', 'Josa'), ('이용', 'Noun'), ('하다', 'Verb'), ('불', 'Noun'), ('용어', 'Noun'), ('사전', 'Noun'), ('을', 'Josa'), ('구축', 'Noun'), ('하다', 'Verb'), ('위', 'Noun'), ('한', 'Josa'), ('간단', 'Noun'), ('예제', 'Noun')]
['를', '을', '한']


  • “Pickling” is the process whereby a Python object hierarchy is converted into a byte stream, and “unpickling” is the inverse operation, whereby a byte stream (from a binary file or bytes-like object) is converted back into an object hierarchy.
  • Pickling (and unpickling) is alternatively known as “serialization”, “marshalling,” or “flattening”; however, to avoid confusion, the terms “pickling” and “unpickling” are being mostly used.
  • Comparison with json
    • There are fundamental differences between the pickle protocols and JSON (JavaScript Object Notation):

    • JSON is a text serialization format (it outputs unicode text, although most of the time it is then encoded to utf-8), while pickle is a binary serialization format;
    • JSON is human-readable, while pickle is not;
    • JSON is interoperable and widely used outside of the Python ecosystem, while pickle is Python-specific;
    • JSON, by default, can only represent a subset of the Python built-in types, and no custom classes; pickle can represent an extremely large number of Python types (many of them automatically, by clever usage of Python’s introspection facilities; complex cases can be tackled by implementing specific object APIs);
    • Unlike pickle, deserializing untrusted JSON does not in itself create an arbitrary code execution vulnerability.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
cv_ = TfidfVectorizer(tokenizer=twitter_tokenizer, max_features=10)
import os
import pickle
# Compute the matrix unconditionally: the comparison at the end of this cell
# needs X_data_pre, which used to be undefined (NameError) whenever the
# pickle file already existed from an earlier run.
X_data_pre = cv_.fit_transform(text_train)
if not os.path.isfile("X_data.pickle"):
    print('file does not exist')
    # `with` closes the file handle; the bare open() call leaked it.
    with open("X_data.pickle", "wb") as f:
        pickle.dump(X_data_pre, f)

# Load the stored tfidf matrix back from disk.
# Only unpickle files you created yourself: unpickling untrusted data can
# execute arbitrary code.
with open('X_data.pickle', 'rb') as f:
    X_data_post = pickle.load(f)
# Element-wise check that the round trip preserved the matrix.
# NOTE(review): a pickle left over from a run on different data will not
# match; delete X_data.pickle to regenerate it.
X_data_pre.toarray() == X_data_post.toarray()
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])


  • BoW (Bag of Words)
    • document-term matrix
    • tfidf matrix
from sklearn.feature_extraction.text import CountVectorizer
# Four toy documents used to illustrate the document-term matrix.
corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
]  # the closing bracket was missing, which made this cell a syntax error
# Raw term counts: one row per document, one column per vocabulary term.
vectorizer1 = CountVectorizer()
X = vectorizer1.fit_transform(corpus)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

vectorizer2 = TfidfVectorizer()
X = vectorizer2.fit_transform(corpus)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0.   0.47 0.58 0.38 0.   0.   0.38 0.   0.38]
 [0.   0.69 0.   0.28 0.   0.54 0.28 0.   0.28]
 [0.51 0.   0.   0.27 0.51 0.   0.27 0.51 0.27]
 [0.   0.47 0.58 0.38 0.   0.   0.38 0.   0.38]]

