[Deep Learning] Naver Movie Sentiment Analysis


  • Sentiment analysis
    • Naver sentiment movie corpus v1.0 (https://github.com/e9t/nsmc)
    • 200,000 movie reviews; each review is labeled 0 (negative) or 1 (positive).
  • Korean natural language processing
    • KoNLPy (pronounced "ko-en-el-py") is a Python package for Korean-language processing.
    • We use the Twitter analyzer provided by the konlpy package (despite the name, it handles general Korean text, not just tweets).
    • Running this notebook on Colab is recommended.
!pip install konlpy
# import packages
import konlpy
import pandas as pd
import numpy as np
from konlpy.tag import Twitter  # renamed to Okt in recent konlpy versions
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn import model_selection, metrics

# morpheme-level token parser
def twitter_tokenizer(text):
    return Twitter().morphs(text)
twitter_tokenizer("Welcome to data science world!...한국말도 똑 같아요...")
['Welcome',
 'to',
 'data',
 'science',
 'world',
 '!...',
 '한국말',
 '도',
 '똑',
 '같아요',
 '...']
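
A side note: in recent konlpy releases the Twitter class has been renamed Okt (Open Korean Text), and the old name is kept only as a deprecated alias. A minimal sketch of the same tokenizer written against the newer name, reusing a single analyzer instance (constructing Twitter() inside every call, as above, is noticeably slower):

from konlpy.tag import Okt   # newer name for the Twitter analyzer

okt = Okt()                  # build the analyzer once and reuse it

def okt_tokenizer(text):
    # split the text into morphemes, exactly like twitter_tokenizer above
    return okt.morphs(text)

okt_tokenizer("Welcome to data science world!...한국말도 똑 같아요...")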


!curl -L https://bit.ly/2X9Owwr -o ratings_train.txt
!curl -L https://bit.ly/2WuLd5I -o ratings_test.txt
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   152  100   152    0     0   1831      0 --:--:-- --:--:-- --:--:--  1831
100   148    0   148    0     0    452      0 --:--:-- --:--:-- --:--:--   452
100   318  100   318    0     0    624      0 --:--:-- --:--:-- --:--:--  1939
100 14.0M  100 14.0M    0     0  12.5M      0  0:00:01  0:00:01 --:--:-- 12.5M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   151  100   151    0     0   3081      0 --:--:-- --:--:-- --:--:--  3081
100   147    0   147    0     0    578      0 --:--:-- --:--:-- --:--:--   907
100   318  100   318    0     0    757      0 --:--:-- --:--:-- --:--:--   757
100 4827k  100 4827k    0     0  5510k      0 --:--:-- --:--:-- --:--:-- 5510k
# load the data
df_train = pd.read_csv('ratings_train.txt', delimiter='\t', keep_default_na=False)
df_test = pd.read_csv('ratings_test.txt', delimiter='\t', keep_default_na=False)

df_train[:5]

(output: first five rows of df_train, with columns id, document, label)

df_test[:5]

(output: first five rows of df_test, with columns id, document, label)


df_train['document'].values == df_train['document'].to_numpy()   # sanity check: .values and .to_numpy() return the same array
array([ True,  True,  True, ...,  True,  True,  True])
text_train, y_train = df_train['document'].to_numpy(), df_train['label'].to_numpy()
text_test, y_test = df_test['document'].to_numpy(), df_test['label'].to_numpy()
text_train.shape, y_train.shape, text_test.shape, y_test.shape
((150000,), (150000,), (50000,), (50000,))


# 150,000 training reviews is too many for a quick demo; keep a small subset
text_train, y_train = text_train[:2000], y_train[:2000]
text_test, y_test = text_test[:1000], y_test[:1000]
y_train.shape, y_test.shape
((2000,), (1000,))
y_train.mean(), y_test.mean()    # check distribution of classes 1 and 0
(0.4945, 0.508)
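
Slicing off the head of the data happens to give a roughly balanced subset here (means near 0.5), but if you want to guarantee the class ratio when subsampling, a stratified draw is safer. A small sketch using the model_selection module imported above; the sizes and random_state are just illustrative:

# hypothetical alternative: draw a class-balanced 2,000-review subset instead of slicing the head
sub_text, _, sub_y, _ = model_selection.train_test_split(
    df_train['document'].to_numpy(), df_train['label'].to_numpy(),
    train_size=2000, stratify=df_train['label'].to_numpy(), random_state=0)
print(sub_y.mean())   # should land very close to 0.5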


cv = TfidfVectorizer(tokenizer=twitter_tokenizer, max_features=3000)
X_train = cv.fit_transform(text_train)
X_test = cv.transform(text_test)  # transform only; never fit_transform on the test set
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((2000, 3000), (2000,), (1000, 3000), (1000,))
cv.get_feature_names()[-5:]
['????', '???', '???', '?????', '????']



Linear Classification

from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
result = clf.fit(X_train,y_train)
feature_names = cv.get_feature_names()
print("Training : ", result.score(X_train, y_train))
print("Testing : ", result.score(X_test, y_test))
Training :  0.985
Testing :  0.739
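
Since SGDClassifier fits a linear model, its weights are directly interpretable: large positive coefficients push a review toward label 1 (positive), large negative ones toward 0. A quick sketch for listing the most influential tokens, assuming the clf and feature_names fitted above:

coefs = result.coef_.ravel()          # one weight per tf-idf feature
order = np.argsort(coefs)
print("most negative tokens:", [feature_names[i] for i in order[:10]])
print("most positive tokens:", [feature_names[i] for i in order[-10:]])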



Logistic Regression

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
result = lr.fit(X_train,y_train)
feature_names = cv.get_feature_names()
print("Training : ", result.score(X_train, y_train))
print("Testing : ", result.score(X_test, y_test))
Training :  0.916
Testing :  0.771
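
Both linear models overfit noticeably (training around 0.92 to 0.99 versus testing around 0.74 to 0.77), so tuning the regularization strength C with cross-validation may help. A sketch using the model_selection module imported above; the C grid is an illustrative choice, not a recommendation:

param_grid = {'C': [0.01, 0.1, 1, 10]}
grid = model_selection.GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid.fit(X_train, y_train)
print("best params :", grid.best_params_, " cv accuracy :", grid.best_score_)
print("Testing : ", grid.score(X_test, y_test))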



MLP

# use a one-hot encoded target (treat it as 2-class softmax classification)
from tensorflow.keras.utils import to_categorical
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)
max_words = X_train.shape[1]
batch_size = 100
nb_epoch = 5

model = Sequential()
model.add(Dense(64, input_shape=(max_words,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train.toarray(), y_train_ohe, epochs=nb_epoch, 
          batch_size=batch_size) 
print("Training : ", model.evaluate(X_train.toarray(), y_train_ohe))
print("Testing : ", model.evaluate(X_test.toarray(), y_test_ohe))
# use a binary target (binary classification with a sigmoid output)
max_words = X_train.shape[1]
batch_size = 100
nb_epoch = 5

model = Sequential()
model.add(Dense(64, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train.toarray(), y_train, epochs=nb_epoch, 
          batch_size=batch_size) 
print("Training : ", model.evaluate(X_train.toarray(), y_train))
print("Testing : ", model.evaluate(X_test.toarray(), y_test))
Epoch 1/5
20/20 [==============================] - 1s 3ms/step - loss: 0.6910 - accuracy: 0.5560
Epoch 2/5
20/20 [==============================] - 0s 3ms/step - loss: 0.6697 - accuracy: 0.7800
Epoch 3/5
20/20 [==============================] - 0s 4ms/step - loss: 0.5963 - accuracy: 0.8635
Epoch 4/5
20/20 [==============================] - 0s 3ms/step - loss: 0.4505 - accuracy: 0.8945
Epoch 5/5
20/20 [==============================] - 0s 3ms/step - loss: 0.2864 - accuracy: 0.9310
63/63 [==============================] - 1s 3ms/step - loss: 0.2006 - accuracy: 0.9540
Training :  [0.20061413943767548, 0.9539999961853027]
32/32 [==============================] - 0s 3ms/step - loss: 0.4924 - accuracy: 0.7530
Testing :  [0.4923769235610962, 0.753000020980835]
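
To score a brand-new review, push it through the same fitted tf-idf vectorizer before calling the model; never refit the vectorizer on new text. A sketch against the binary MLP above, with made-up example sentences:

new_reviews = ["정말 재미있고 감동적인 영화였어요", "시간이 아깝다... 최악"]
new_X = cv.transform(new_reviews).toarray()   # reuse the vectorizer fitted on the training data
probs = model.predict(new_X).ravel()          # sigmoid output = estimated P(label == 1)
for review, p in zip(new_reviews, probs):
    print(f"{p:.2f}  {review}")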



RNN

# just checking: .A is shorthand for .toarray()
X_train.A[0] == X_train.toarray()[0]
array([ True,  True,  True, ...,  True,  True,  True])
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((2000, 3000), (1000, 3000), (2000,), (1000,))
# reshape to (samples, timesteps=1, features) so each review enters the RNN as a single time step
X_train_rnn = X_train.A.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = X_test.A.reshape((X_test.shape[0], 1, X_test.shape[1]))

print(X_train_rnn.shape, X_test_rnn.shape)
(2000, 1, 3000) (1000, 1, 3000)


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential()
model.add(SimpleRNN(128, 
                    input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2]), 
                    return_sequences=True))
# return_sequences=True returns the hidden state for every time step (the full sequence)
# instead of only the last output, so the next recurrent layer can consume each step.
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(SimpleRNN(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation="softmax"))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy'])   
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
simple_rnn (SimpleRNN)       (None, 1, 128)            400512    
_________________________________________________________________
activation_1 (Activation)    (None, 1, 128)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 1, 128)            0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               32896     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 258       
=================================================================
Total params: 433,666
Trainable params: 433,666
Non-trainable params: 0
_________________________________________________________________
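
The parameter counts in the summary can be verified by hand: a SimpleRNN layer holds units * (input_dim + units + 1) weights (input kernel, recurrent kernel, and bias).

units, input_dim = 128, 3000
print(units * (input_dim + units + 1))   # 400512 -> first SimpleRNN
print(units * (units + units + 1))       # 32896  -> second SimpleRNN
print(128 * 2 + 2)                       # 258    -> final Dense layer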


model.fit(X_train_rnn, y_train_ohe, batch_size = 100,
          epochs=nb_epoch)
Epoch 1/5
20/20 [==============================] - 2s 24ms/step - loss: 0.6887 - accuracy: 0.5465
Epoch 2/5
20/20 [==============================] - 0s 6ms/step - loss: 0.6325 - accuracy: 0.7965
Epoch 3/5
20/20 [==============================] - 0s 6ms/step - loss: 0.4334 - accuracy: 0.8890
Epoch 4/5
20/20 [==============================] - 0s 6ms/step - loss: 0.2157 - accuracy: 0.9240
Epoch 5/5
20/20 [==============================] - 0s 7ms/step - loss: 0.1026 - accuracy: 0.9690


y_pred = np.argmax(model.predict(X_test_rnn), axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.765
63/63 [==============================] - 1s 4ms/step - loss: 0.0617 - accuracy: 0.9870
Training :  [0.061724983155727386, 0.9869999885559082]
32/32 [==============================] - 0s 3ms/step - loss: 0.5956 - accuracy: 0.7650
Testing :  [0.5956090688705444, 0.7649999856948853]



LSTM

  • https://colah.github.io/posts/2015-08-Understanding-LSTMs/
from keras.layers import LSTM
model = Sequential()
model.add(LSTM(128, 
               input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2]), 
               return_sequences=True))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_rnn, y_train_ohe, batch_size = 100,
          epochs=nb_epoch)
Epoch 1/5
20/20 [==============================] - 6s 10ms/step - loss: 0.6931 - accuracy: 0.5375
Epoch 2/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6903 - accuracy: 0.8005
Epoch 3/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6703 - accuracy: 0.8725
Epoch 4/5
20/20 [==============================] - 0s 10ms/step - loss: 0.5868 - accuracy: 0.8930
Epoch 5/5
20/20 [==============================] - 0s 10ms/step - loss: 0.4041 - accuracy: 0.9105


y_pred = np.argmax(model.predict(X_test_rnn), axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.774
63/63 [==============================] - 1s 5ms/step - loss: 0.2751 - accuracy: 0.9315
Training :  [0.27514004707336426, 0.9315000176429749]
32/32 [==============================] - 0s 5ms/step - loss: 0.4868 - accuracy: 0.7740
Testing :  [0.4867841601371765, 0.7739999890327454]



GRU

from keras.layers import GRU
model = Sequential()
model.add(GRU(128, input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2]), return_sequences=True))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(GRU(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_rnn, y_train_ohe, batch_size = 100,
          epochs=nb_epoch)
Epoch 1/5
20/20 [==============================] - 4s 10ms/step - loss: 0.6921 - accuracy: 0.5170
Epoch 2/5
20/20 [==============================] - 0s 10ms/step - loss: 0.6716 - accuracy: 0.7150
Epoch 3/5
20/20 [==============================] - 0s 10ms/step - loss: 0.5628 - accuracy: 0.8495
Epoch 4/5
20/20 [==============================] - 0s 11ms/step - loss: 0.3357 - accuracy: 0.9130
Epoch 5/5
20/20 [==============================] - 0s 10ms/step - loss: 0.1801 - accuracy: 0.9390


y_pred = np.argmax(model.predict(X_test_rnn), axis=1)
print("accuracy score: ", metrics.accuracy_score(y_test, y_pred))
print("Training : ", model.evaluate(X_train_rnn, y_train_ohe))
print("Testing : ", model.evaluate(X_test_rnn, y_test_ohe))
accuracy score:  0.766
63/63 [==============================] - 1s 5ms/step - loss: 0.1123 - accuracy: 0.9670
Training :  [0.11229105293750763, 0.9670000076293945]
32/32 [==============================] - 0s 5ms/step - loss: 0.5520 - accuracy: 0.7660
Testing :  [0.552036702632904, 0.765999972820282]
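
Summing up the test accuracies seen above: SGDClassifier 0.739, LogisticRegression 0.771, MLP 0.753, SimpleRNN 0.765, LSTM 0.774, GRU 0.766. The recurrent models barely improve on the linear baselines, which is expected here: each review enters the network as a single tf-idf vector, i.e. one time step, so there is no real sequence for the recurrence to exploit. Feeding word-index sequences through an Embedding layer would be the usual next step if you want the RNN family to pull ahead.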



Exercise

Korean stop-word handling

  • Korean stop words can be identified with the morphological analysis library KoNLPy.
  • (Example) extracting josa (postpositions) among the Korean parts of speech
  • pos (part-of-speech): word class (noun, verb, ...)
from konlpy.tag import Twitter
Twitter().morphs("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제")
['텍스트',
 '데이터',
 '를',
 '이용',
 '해서',
 '불',
 '용어',
 '사전',
 '을',
 '구축',
 '하기',
 '위',
 '한',
 '간단',
 '예제']


Twitter().pos("텍스트 데이터를 이용해서 불용어 사잔을 구축하기 위한 간단 예제")
[('텍스트', 'Noun'),
 ('데이터', 'Noun'),
 ('를', 'Josa'),
 ('이용', 'Noun'),
 ('해서', 'Verb'),
 ('불', 'Noun'),
 ('용어', 'Noun'),
 ('사잔', 'Noun'),
 ('을', 'Josa'),
 ('구축', 'Noun'),
 ('하기', 'Verb'),
 ('위', 'Noun'),
 ('한', 'Josa'),
 ('간단', 'Noun'),
 ('예제', 'Noun')]


Twitter().pos("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제", norm=True)   # norm - 오타 수정 (사잔->사전)
[('텍스트', 'Noun'),
 ('데이터', 'Noun'),
 ('를', 'Josa'),
 ('이용', 'Noun'),
 ('해서', 'Verb'),
 ('불', 'Noun'),
 ('용어', 'Noun'),
 ('사전', 'Noun'),
 ('을', 'Josa'),
 ('구축', 'Noun'),
 ('하기', 'Verb'),
 ('위', 'Noun'),
 ('한', 'Josa'),
 ('간단', 'Noun'),
 ('예제', 'Noun')]


Twitter().nouns("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제")
['텍스트', '데이터', '이용', '불', '용어', '사전', '구축', '위', '간단', '예제']
  • norm: normalize spelling and typos, stem: reduce words to their stem (e.g. 해서 -> 하다)


from konlpy.tag import Twitter

word_tags = Twitter().pos("텍스트 데이터를 이용해서 불용어 사전을 구축하기 위한 간단 예제", norm=True, stem=True)
print(word_tags)
stop_words = [word[0] for word in word_tags if word[1]=="Josa"]
print (stop_words)
[('텍스트', 'Noun'), ('데이터', 'Noun'), ('를', 'Josa'), ('이용', 'Noun'), ('하다', 'Verb'), ('불', 'Noun'), ('용어', 'Noun'), ('사전', 'Noun'), ('을', 'Josa'), ('구축', 'Noun'), ('하다', 'Verb'), ('위', 'Noun'), ('한', 'Josa'), ('간단', 'Noun'), ('예제', 'Noun')]
['를', '을', '한']
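
Putting the pieces together, one possible approach to the exercise is a tokenizer that drops josa (and any other tags you treat as stop words) before vectorizing. A sketch; the tag set to exclude is a hypothetical choice you would tune yourself:

STOP_TAGS = {"Josa", "Punctuation"}   # hypothetical set of POS tags to discard

def stopword_tokenizer(text):
    # keep only surface forms whose POS tag is not in STOP_TAGS
    return [word for word, tag in Twitter().pos(text, norm=True, stem=True)
            if tag not in STOP_TAGS]

cv_sw = TfidfVectorizer(tokenizer=stopword_tokenizer, max_features=3000)
X_train_sw = cv_sw.fit_transform(text_train)
X_test_sw = cv_sw.transform(text_test)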


Pickling

  • “Pickling” is the process whereby a Python object hierarchy is converted into a byte stream, and “unpickling” is the inverse operation, whereby a byte stream (from a binary file or bytes-like object) is converted back into an object hierarchy.
  • Pickling (and unpickling) is alternatively known as “serialization”, “marshalling,” or “flattening”; to avoid confusion, the terms “pickling” and “unpickling” are used here.
  • Comparison with json
    • There are fundamental differences between the pickle protocols and JSON (JavaScript Object Notation):

    • JSON is a text serialization format (it outputs unicode text, although most of the time it is then encoded to utf-8), while pickle is a binary serialization format;
    • JSON is human-readable, while pickle is not;
    • JSON is interoperable and widely used outside of the Python ecosystem, while pickle is Python-specific;
    • JSON, by default, can only represent a subset of the Python built-in types, and no custom classes; pickle can represent an extremely large number of Python types (many of them automatically, by clever usage of Python’s introspection facilities; complex cases can be tackled by implementing specific object APIs);
    • Unlike pickle, deserializing untrusted JSON does not in itself create an arbitrary code execution vulnerability.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
cv_ = TfidfVectorizer(tokenizer=twitter_tokenizer, max_features=10)
import os
import pickle
X_data_pre = cv_.fit_transform(text_train)         # compute the tf-idf matrix
if not os.path.isfile("X_data.pickle"):
    print('file does not exist; saving the matrix')
    with open("X_data.pickle", "wb") as f:          # context manager so the file is closed
        pickle.dump(X_data_pre, f)


# load the pickled tf-idf matrix back and verify it matches the original
with open('X_data.pickle', 'rb') as f:
    X_data_post = pickle.load(f)
X_data_pre.toarray() == X_data_post.toarray()
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])
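
The same idea extends to the fitted vectorizer and a fitted scikit-learn model: pickling them lets you score new text later without re-running the slow Twitter tokenization over the whole corpus. A minimal sketch with arbitrary file names; note that the custom twitter_tokenizer function must be importable (or defined) wherever the vectorizer is unpickled:

# persist the fitted vectorizer and the logistic-regression model
with open("tfidf_vectorizer.pickle", "wb") as f:
    pickle.dump(cv, f)
with open("lr_model.pickle", "wb") as f:
    pickle.dump(lr, f)

# later / elsewhere: reload and score a new review
with open("tfidf_vectorizer.pickle", "rb") as f:
    cv_loaded = pickle.load(f)
with open("lr_model.pickle", "rb") as f:
    lr_loaded = pickle.load(f)
print(lr_loaded.predict(cv_loaded.transform(["이 영화 정말 최고"])))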


Vectorizing

  • BoW (Bag of Words)
    • document-term matrix
    • tfidf matrix
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
]
vectorizer1 = CountVectorizer()
X = vectorizer1.fit_transform(corpus)
print(vectorizer1.get_feature_names())
print(X.toarray())
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


vectorizer2 = TfidfVectorizer()
X = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names())
print(X.toarray().round(2))
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0.   0.47 0.58 0.38 0.   0.   0.38 0.   0.38]
 [0.   0.69 0.   0.28 0.   0.54 0.28 0.   0.28]
 [0.51 0.   0.   0.27 0.51 0.   0.27 0.51 0.27]
 [0.   0.47 0.58 0.38 0.   0.   0.38 0.   0.38]]
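
As a sanity check, TfidfVectorizer is equivalent to running CountVectorizer followed by TfidfTransformer; the sketch below should reproduce the matrix printed above.

from sklearn.feature_extraction.text import TfidfTransformer

counts = vectorizer1.fit_transform(corpus)               # raw term counts from the cell above
X_two_step = TfidfTransformer().fit_transform(counts)
print(np.allclose(X_two_step.toarray(), X.toarray()))    # expect True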
