[Recommendation System] TMDB 5000 데이터를 이용한 콘텐츠 기반 필터링 실습

4 minute read


TMDB 5000 데이터를 이용한 콘텐츠 기반 필터링 실습

TMDB 5000은 캐글의 영화 데이터 세트입니다.

# 필요 라이브러리 import
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')


1. 데이터 전처리

from ast import literal_eval

# Load the raw TMDB 5000 movie table and keep only the columns used below.
movies = pd.read_csv('assets/tmdb_5000_movies.csv')
# .copy() makes movies_df an independent frame, so the column assignments
# below write to it directly instead of to a slice of `movies`
# (which is what raises pandas' SettingWithCopyWarning).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count',
                    'popularity', 'keywords', 'overview']].copy()
pd.set_option('max_colwidth', 100)
movies_df[['genres', 'keywords']][:1]

# The 'genres' and 'keywords' cells are string literals; literal_eval parses
# each one into the Python object it spells out.
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

# Keep only the 'name' entry of every parsed element, giving a list of names.
movies_df['genres'] = movies_df['genres'].apply(lambda x: [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [y['name'] for y in x])

movies_df[['genres', 'keywords']].head(5)
genres keywords
0 [Action, Adventure, Fantasy, Science Fiction] [culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...
1 [Adventure, Fantasy, Action] [ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, ship...
2 [Action, Adventure, Crime] [spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]
3 [Action, Crime, Drama, Thriller] [dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham...
4 [Action, Adventure, Science Fiction] [based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edg...


# Convert each genre list into a string whose words are separated by spaces
# so that CountVectorizer can be applied to it.
from sklearn.feature_extraction.text import CountVectorizer

# Build the genres_literal column by joining each row's genre names with spaces.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: ' '.join(x))
# min_df=1 keeps every term (each term occurs in at least one document), which
# is what min_df=0 selected; newer scikit-learn releases validate that an
# integer min_df is >= 1 and reject 0.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the genre count vectors of all movies.
genre_sim = cosine_similarity(genre_mat, genre_mat)
# Per row: column indices ordered from highest to lowest similarity
# (argsort is ascending, so each row is reversed).
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
# Similarity-ordered indices for the first movie.
print(genre_sim_sorted_ind[:1])
[[   0 3494  813 ... 3038 3037 2401]]


2. 코사인 유사도가 높은 영화 검색

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` most similar to the movie titled ``title_name``.

    Args:
        df: DataFrame with a 'title' column; row i corresponds to row i of
            ``sorted_ind``.
        sorted_ind: 2-D integer array; row i holds row positions of ``df``
            ordered from most to least similar to row i.
        title_name: Title to look up (exact match on the 'title' column).
        top_n: Number of positions to take per matching row.

    Returns:
        DataFrame of the selected rows in similarity order. It is empty when
        no row has the requested title.
    """
    # Work with row *positions*, not index labels: both sorted_ind and
    # df.iloc are positional, so this stays correct for any DataFrame index.
    title_positions = np.flatnonzero((df['title'] == title_name).values)

    # One row of sorted_ind per matching title -> 2-D array of positions.
    similar_indexes = sorted_ind[title_positions, :top_n]

    # Show the selected positions, then flatten them to 1-D for iloc.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]
# Look up the ten movies most similar to 'The Godfather' by genre.
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_ind, title_name='The Godfather', top_n=10
)

similar_movies
id title genres vote_average vote_count popularity keywords overview genres_literal weighted_vote
2731 240 The Godfather: Part II [Drama, Crime] 8.3 3338 105.792936 [italo-american, cuba, vororte, melancholy, praise, revenge, mafia, lawyer, blood, corrupt polit... In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily an... Drama Crime 8.079586
1847 769 GoodFellas [Drama, Crime] 8.2 3128 63.654244 [prison, based on novel, florida, 1970s, mass murder, irish-american, drug traffic, biography, b... The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbo... Drama Crime 7.976937
3866 598 City of God [Drama, Crime] 8.1 1814 44.356711 [male nudity, street gang, brazilian, photographer, 1970s, puberty, ghetto, gang war, coming of ... Cidade de Deus is a shantytown that started during the 1960s and became one of Rio de Janeiro’s ... Drama Crime 7.759693
1663 311 Once Upon a Time in America [Drama, Crime] 8.2 1069 49.336397 [life and death, corruption, street gang, rape, sadistic, lovesickness, sexual abuse, money laun... A former Prohibition-era Jewish gangster returns to the Lower East Side of Manhattan over thirty... Drama Crime 7.657811
883 640 Catch Me If You Can [Drama, Crime] 7.7 3795 73.944049 [con man, biography, fbi agent, overhead camera shot, attempted jailbreak, engagement party, mis... A true story about Frank Abagnale Jr. who, before his 19th birthday, successfully conned million... Drama Crime 7.557097
281 4982 American Gangster [Drama, Crime] 7.4 1502 42.361215 [underdog, black people, drug traffic, drug smuggle, society, ambition, rise and fall, cop, drug... Following the death of his employer and mentor, Bumpy Johnson, Frank Lucas establishes himself a... Drama Crime 7.141396
4041 11798 This Is England [Drama, Crime] 7.4 363 8.395624 [holiday, skinhead, england, vandalism, independent film, gang, racism, summer, youth, violence,... A story about a troubled boy growing up in England, set in 1983. He comes across a few skinheads... Drama Crime 6.739664
1149 168672 American Hustle [Drama, Crime] 6.8 2807 49.664128 [con artist, scam, mobster, fbi agent] A con man, Irving Rosenfeld, along with his seductive partner Sydney Prosser, is forced to work ... Drama Crime 6.717525
1243 203 Mean Streets [Drama, Crime] 7.2 345 17.002096 [epilepsy, protection money, secret love, money, redemption] A small-time hood must choose from among love, friendship and the chance to rise within the mob. Drama Crime 6.626569
2839 10220 Rounders [Drama, Crime] 6.9 439 18.422008 [gambling, law, compulsive gambling, roulette, gain] A young man is a reformed gambler who must return to playing big stakes poker to help a friend p... Drama Crime 6.530427


# Preview the ten movies with the highest raw vote_average.
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]

# Constants used by the weighted rating, each computed once:
#   C - mean vote_average over all movies
#   m - vote_count value at the `percentile` quantile
percentile = 0.6
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(percentile)


3. 가중 평점(Weighted Rating) 계산

def weighted_vote_average(record, min_votes=None, mean_vote=None):
    """Return the vote-count-weighted rating for one movie record.

    The result blends the record's own rating R with a prior mean C:
    ``(v / (v + m)) * R + (m / (v + m)) * C``. The larger the vote count v
    is relative to m, the closer the result is to R.

    Args:
        record: Mapping-like row providing 'vote_count' and 'vote_average'.
        min_votes: Vote-count weight m. Defaults to the module-level ``m``.
        mean_vote: Prior mean rating C. Defaults to the module-level ``C``.

    Returns:
        The weighted rating, or the prior mean when ``v + m`` is zero.
    """
    v = record['vote_count']
    R = record['vote_average']

    # Fall back to the module-level constants so existing one-argument calls
    # (e.g. DataFrame.apply(weighted_vote_average, axis=1)) keep working.
    weight = m if min_votes is None else min_votes
    prior = C if mean_vote is None else mean_vote

    total = v + weight
    if total == 0:
        # No votes and no weight: the formula would be 0/0, so use the prior.
        return prior

    return ((v / total) * R) + ((weight / total) * prior)
# Add the vote-count-weighted rating of every movie as a new feature column.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

# Show the ten movies with the highest weighted rating.
movies_df.sort_values(by='weighted_vote', ascending=False)[
    ['title', 'vote_average', 'weighted_vote', 'vote_count']
].head(10)
title vote_average weighted_vote vote_count
1881 The Shawshank Redemption 8.5 8.396052 8205
3337 The Godfather 8.4 8.263591 5893
662 Fight Club 8.3 8.216455 9413
3232 Pulp Fiction 8.3 8.207102 8428
65 The Dark Knight 8.2 8.136930 12002
1818 Schindler's List 8.3 8.126069 4329
3865 Whiplash 8.3 8.123248 4254
809 Forrest Gump 8.2 8.105954 7927
2294 Spirited Away 8.3 8.105867 3840
2731 The Godfather: Part II 8.3 8.079586 3338


4. 장르 유사성이 높은 영화 추천

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Recommend up to ``top_n`` similar movies, ranked by 'weighted_vote'.

    Takes the ``top_n * 2`` most similar candidates for the movie titled
    ``title_name``, drops the movie itself, and returns the candidates with
    the highest 'weighted_vote'.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns; row i
            corresponds to row i of ``sorted_ind``.
        sorted_ind: 2-D integer array; row i holds row positions of ``df``
            ordered from most to least similar to row i.
        title_name: Title to look up (exact match on the 'title' column).
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame of at most ``top_n`` rows sorted by 'weighted_vote' in
        descending order. It is empty when no row has the requested title.
    """
    # Work with row *positions*, not index labels: both sorted_ind and
    # df.iloc are positional, so this stays correct for any DataFrame index.
    title_positions = np.flatnonzero((df['title'] == title_name).values)

    # Candidate pool: twice top_n of the most similar positions per match.
    candidates = sorted_ind[title_positions, :(top_n * 2)].reshape(-1)

    # Exclude the reference movie(s). isin also works when several rows share
    # the title, where an elementwise `!=` would have mismatched shapes.
    candidates = candidates[~np.isin(candidates, title_positions)]

    # With several matching rows the pools can overlap; keep each candidate
    # once, preserving the order of first appearance.
    _, first_seen = np.unique(candidates, return_index=True)
    candidates = candidates[np.sort(first_seen)]

    # From the candidate pool, return the top_n rows by weighted_vote.
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]
# Recommend ten genre-similar movies for 'The Godfather' and show their ratings.
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_ind, title_name='The Godfather', top_n=10
)
similar_movies[['title', 'vote_average', 'weighted_vote']]
title vote_average weighted_vote
2731 The Godfather: Part II 8.3 8.079586
1847 GoodFellas 8.2 7.976937
3866 City of God 8.1 7.759693
1663 Once Upon a Time in America 8.2 7.657811
883 Catch Me If You Can 7.7 7.557097
281 American Gangster 7.4 7.141396
4041 This Is England 7.4 6.739664
1149 American Hustle 6.8 6.717525
1243 Mean Streets 7.2 6.626569
2839 Rounders 6.9 6.530427

Leave a comment