[AITech] 20220121 - Pandas

15 minute read


강의 복습 내용

Pandas I

Data loading

import pandas as pd

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
df_data = pd.read_csv(data_url, sep='\s+', header=None)

df_data.head()
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2

Series

  • Series(data, index, dtype): Series 객체를 생성
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
'''Series 객체 생성하기'''
# 리스트로 만들기
list_data = [1,2,3,4,5]
list_name = ["a", "b", "c", "d", "e"] # 리스트로 생성 시 data 길이와 index 길이 같아야 함
# list=name = ["a", "b", "c", "d", "e", "f", "g", "h"] -> error!!!
example_obj = Series(data=list_data, index=list_name)
print(example_obj, type(example_obj))
a    1
b    2
c    3
d    4
e    5
dtype: int64 <class 'pandas.core.series.Series'>
# 딕셔너리로 만들기
dict_data = {"a":1, "b":2, "c":3, "d":4, "e":5}
indices = ["a", "b", "c", "d", "e", "f", "g", "h"] # data 길이와 달라도 index 기준
example_obj = Series(data=dict_data, index=indices, dtype=np.float32, name="example_data")
print(example_obj)
a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: example_data, dtype: float32
'''인덱스 접근'''
print(example_obj["a"])
example_obj["a"] = 3.5
print(example_obj)
1.0
a    3.5
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: example_data, dtype: float32
'''Series 프로퍼티'''
# values
print(example_obj.values)
# index
print(example_obj.index)
# name
example_obj.name = "number"
example_obj.index.name = "alphabet"
print(example_obj)
[3.5 2.  3.  4.  5.  nan nan nan]
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype='object')
alphabet
a    3.5
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: number, dtype: float32

DataFrame

  • DataFrame(data, columns, index)
'''딕셔너리 만들기'''
# 딕셔너리로 데이터프레임 생성
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
           'last_name': ['Miller', 'Jacobson', 'All', 'Milner', 'Cooze'], 
           'age': [42, 52, 36, 24, 73], 
           'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(data=raw_data)
# df = pd.DataFrame(data=raw_data, columns=['first_name', 'last_name', 'age', 'city'])
print(df)
  first_name last_name  age           city
0      Jason    Miller   42  San Francisco
1      Molly  Jacobson   52      Baltimore
2       Tina       All   36          Miami
3       Jake    Milner   24        Douglas
4        Amy     Cooze   73         Boston
# column 설정
print(DataFrame(raw_data, columns=["age", "city"]))
print(DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city','debt']))
   age           city
0   42  San Francisco
1   52      Baltimore
2   36          Miami
3   24        Douglas
4   73         Boston
  first_name last_name  age           city debt
0      Jason    Miller   42  San Francisco  NaN
1      Molly  Jacobson   52      Baltimore  NaN
2       Tina       All   36          Miami  NaN
3       Jake    Milner   24        Douglas  NaN
4        Amy     Cooze   73         Boston  NaN
'''인덱싱'''
# column 추출(Series 추출)
print(df.first_name) # df["first_name"]

# loc: 인덱스 이름
df.index = [3,4,0,1,2]
print(df)
print(df.loc[:2])

# iloc: 인덱스 순서
print(df.iloc[:2])
0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
1       Jake    Milner   24        Douglas
2        Amy     Cooze   73         Boston
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
1       Jake    Milner   24        Douglas
2        Amy     Cooze   73         Boston
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
'''데이터프레임 핸들링'''
# 새로운 column 추가
df["debt"] = df.age > 40 # df.debt를 사용하려면 이미 그 열이 있어야만 함
print(df)

# column 삭제
del df["debt"] # df.drop(debt, axis=1, inplace=True)
print(df)
  first_name last_name  age           city   debt
3      Jason    Miller   42  San Francisco   True
4      Molly  Jacobson   52      Baltimore   True
0       Tina       All   36          Miami  False
1       Jake    Milner   24        Douglas  False
2        Amy     Cooze   73         Boston   True
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
1       Jake    Milner   24        Douglas
2        Amy     Cooze   73         Boston
# transpose
print(df.T)
                        3          4      0        1       2
first_name          Jason      Molly   Tina     Jake     Amy
last_name          Miller   Jacobson    All   Milner   Cooze
age                    42         52     36       24      73
city        San Francisco  Baltimore  Miami  Douglas  Boston
# 값 출력
print(df.values, type(df.values))
[['Jason' 'Miller' 42 'San Francisco']
 ['Molly' 'Jacobson' 52 'Baltimore']
 ['Tina' 'All' 36 'Miami']
 ['Jake' 'Milner' 24 'Douglas']
 ['Amy' 'Cooze' 73 'Boston']] <class 'numpy.ndarray'>
# csv 변환
print(df.to_csv())
,first_name,last_name,age,city
3,Jason,Miller,42,San Francisco
4,Molly,Jacobson,52,Baltimore
0,Tina,All,36,Miami
1,Jake,Milner,24,Douglas
2,Amy,Cooze,73,Boston

selection & drop

'''Selection with column names'''
# column 한 개
print(df["age"].head(3))
# column 여러 개
print(df[["first_name", "last_name", "age"]].head(3))
3    42
4    52
0    36
Name: age, dtype: int64
  first_name last_name  age
3      Jason    Miller   42
4      Molly  Jacobson   52
0       Tina       All   36
'''Selection with index number'''
# 인덱스 이름
print(df.loc[:2])
# 인덱스 순서
print(df.iloc[:2]) # df[:2]
print(df.iloc[[0,2,4]])
# 열과 함께 선택
print(df["first_name"][:2])
# boolean 인덱스
print(df[df.age > 40])
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
1       Jake    Milner   24        Douglas
2        Amy     Cooze   73         Boston
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
0       Tina       All   36          Miami
2        Amy     Cooze   73         Boston
3    Jason
4    Molly
Name: first_name, dtype: object
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
2        Amy     Cooze   73         Boston
'''Selection with column&index'''
# column name & index order
print(df[["first_name", "last_name"]][:3])
# index name & column name
print(df.loc[[3,4],["first_name", "last_name"]])
# index order & column order
print(df.iloc[:2,:3])
  first_name last_name
3      Jason    Miller
4      Molly  Jacobson
0       Tina       All
  first_name last_name
3      Jason    Miller
4      Molly  Jacobson
  first_name last_name  age
3      Jason    Miller   42
4      Molly  Jacobson   52
'''data drop'''
print(df)
# index name으로 drop
print(df.drop(1, axis=0))
print(df.drop([0,1,2,3],axis=0))
# column name으로 drop
print(df.drop("city", axis=1))
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
1       Jake    Milner   24        Douglas
2        Amy     Cooze   73         Boston
  first_name last_name  age           city
3      Jason    Miller   42  San Francisco
4      Molly  Jacobson   52      Baltimore
0       Tina       All   36          Miami
2        Amy     Cooze   73         Boston
  first_name last_name  age       city
4      Molly  Jacobson   52  Baltimore
  first_name last_name  age
3      Jason    Miller   42
4      Molly  Jacobson   52
0       Tina       All   36
1       Jake    Milner   24
2        Amy     Cooze   73

dataframe operations

'''Series operation'''
# index 기준으로 연산수행, 겹치는 index가 없을 경우 NaN값으로 반환
s1 = Series(range(1,6), index=list("abcde"))
s2 = Series(range(5,11), index=list("bcedef"))
print(s1, s2, sep='\n')
print(s1.add(s2)) # s1+s2
a    1
b    2
c    3
d    4
e    5
dtype: int64
b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64
a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64
'''dataframe operation'''
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc"))
df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
print(df1, df2, sep='\n')
print(df1.add(df2, fill_value=0)) # fill_value
   a  b  c
0  0  1  2
1  3  4  5
2  6  7  8
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
      a     b     c     d
0   0.0   2.0   4.0   3.0
1   7.0   9.0  11.0   7.0
2  14.0  16.0  18.0  11.0
3  12.0  13.0  14.0  15.0
'''series+dataframe'''
df = DataFrame(np.arange(16).reshape(4,4), columns=list('abcd'))
s2 = Series(np.arange(10,14))
print(df, s2, sep='\n')
print(df.add(s2, axis=0))
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
0    10
1    11
2    12
3    13
dtype: int32
    a   b   c   d
0  10  11  12  13
1  15  16  17  18
2  20  21  22  23
3  25  26  27  28

lambda, map, apply

'''lambda, map'''
# 연산 적용
s1 = Series(np.arange(10))
print(s1.head(5))
print(s1.map(lambda x:x**2).head())
# dict type으로 데이터 치환(없는 값은 NaN)
z = {1:'A', 2:'B', 3:'C'}
print(s1.map(z).head())
# 같은 위치의 데이터를 s2로 교체
s2 = Series(np.arange(10, 20))
print(s1.map(s2).head())
0    0
1    1
2    2
3    3
4    4
dtype: int32
0     0
1     1
2     4
3     9
4    16
dtype: int64
0    NaN
1      A
2      B
3      C
4    NaN
dtype: object
0    10
1    11
2    12
3    13
4    14
dtype: int32
'''replace'''
# map 함수의 기능 중 '데이터 변환' 기능만 담당
raw_data = {'earn': [79571, 96396, 48710], 
           'height': [73.89, 66.23, 63.77], 
           'sex': ["male", "female", "female"], 
           'race': ['white', 'white', 'black'], 
           'age': [49, 62, 33]}
df = pd.DataFrame(data=raw_data)
print(df.sex.replace({"male":0, "female":1}))
0    0
1    1
2    1
Name: sex, dtype: int64
'''apply'''
# map과 달리, series 전체(column)에 해당 함수를 적용
df_info = df[["earn", "height", "age"]]
print(df_info.head())
f = lambda x: x.max() - x.min()
print(df_info.apply(f)) # 각 column 별로 결과값 반환

# sum, mean, std 등의 내장 연산 함수도 사용 가능
print(df_info.sum())
print(df_info.apply(sum))

# Series값으로 반환도 가능
def f(x):
    return Series([x.min(), x.max()], index=["min", "max"])
print(df_info.apply(f))
    earn  height  age
0  79571   73.89   49
1  96396   66.23   62
2  48710   63.77   33
earn      47686.00
height       10.12
age          29.00
dtype: float64
earn      224677.00
height       203.89
age          144.00
dtype: float64
earn      224677.00
height       203.89
age          144.00
dtype: float64
      earn  height  age
min  48710   63.77   33
max  96396   73.89   62

Pandas built-in functions

# describe 
print(df.describe()) # object type은 제외
               earn     height        age
count      3.000000   3.000000   3.000000
mean   74892.333333  67.963333  48.000000
std    24184.831824   5.277967  14.525839
min    48710.000000  63.770000  33.000000
25%    64140.500000  65.000000  41.000000
50%    79571.000000  66.230000  49.000000
75%    87983.500000  70.060000  55.500000
max    96396.000000  73.890000  62.000000
# unique
print(df.sex.unique())
['male' 'female']
# 기본 연산
print(df.sum(axis=0))
print(df.mean(axis=1))
earn                224677
height              203.89
sex       malefemalefemale
race       whitewhiteblack
age                    144
dtype: object
0    26564.630000
1    32174.743333
2    16268.923333
dtype: float64
# isnull
print(df.isnull())
print(df.isnull().sum())
    earn  height    sex   race    age
0  False   False  False  False  False
1  False   False  False  False  False
2  False   False  False  False  False
earn      0
height    0
sex       0
race      0
age       0
dtype: int64
# sort_values
print(df.sort_values(["age", "earn"], ascending=True))
    earn  height     sex   race  age
2  48710   63.77  female  black   33
0  79571   73.89    male  white   49
1  96396   66.23  female  white   62
# correlation & covariance
print(df.age.corr(df.earn))
print(df.age.cov(df.earn))
print(df.corrwith(df.earn))
print(df.corr())
0.9941006402346648
349232.5
earn      1.000000
height    0.392674
age       0.994101
dtype: float64
            earn    height       age
earn    1.000000  0.392674  0.994101
height  0.392674  1.000000  0.290608
age     0.994101  0.290608  1.000000

###


2. Pandas II

groupby I

  • split -> apply -> combine
df = {"Points": [876,789,863,673,741], 
     "Rank": [1,2,2,3,3], 
     "Team": ["Riders", "Riders", "Devils", "Devils", "Kings"], 
     "Year": [2014, 2015, 2014, 2015, 2014]}
df = pd.DataFrame(df)
print(df)
   Points  Rank    Team  Year
0     876     1  Riders  2014
1     789     2  Riders  2015
2     863     2  Devils  2014
3     673     3  Devils  2015
4     741     3   Kings  2014
print(df.groupby("Team"))
print(pd.DataFrame(df.groupby("Team")))
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EEA6650F40>
        0                                                  1
0  Devils     Points  Rank    Team  Year
2     863     2 ...
1   Kings     Points  Rank   Team  Year
4     741     3  ...
2  Riders     Points  Rank    Team  Year
0     876     1 ...
print(df.groupby("Team")["Points"])
print(pd.Series(df.groupby("Team")["Points"]))
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001EEA6647D60>
0    (Devils, [863, 673])
1          (Kings, [741])
2    (Riders, [876, 789])
dtype: object
df.groupby("Team")["Points"].sum() # 묶을 기준, 연산을 적용할 컬럼, 적용할 연산
Team
Devils    1536
Kings      741
Riders    1665
Name: Points, dtype: int64
df.groupby(["Team", "Year"])["Points"].sum()
Team    Year
Devils  2014    863
        2015    673
Kings   2014    741
Riders  2014    876
        2015    789
Name: Points, dtype: int64
df.groupby(["Year", "Team"])["Points"].sum() 
Year  Team  
2014  Devils    863
      Kings     741
      Riders    876
2015  Devils    673
      Riders    789
Name: Points, dtype: int64
grouped_df = df.groupby(["Team", "Year"])["Points"].sum()
grouped_df.index
MultiIndex([('Devils', 2014),
            ('Devils', 2015),
            ( 'Kings', 2014),
            ('Riders', 2014),
            ('Riders', 2015)],
           names=['Team', 'Year'])
# 인덱싱
grouped_df["Devils":"Kings"]
Team    Year
Devils  2014    863
        2015    673
Kings   2014    741
Name: Points, dtype: int64
# 묶여진 데이터를 matrix 형태로 전환
grouped_df.unstack()
Year 2014 2015
Team
Devils 863.0 673.0
Kings 741.0 NaN
Riders 876.0 789.0
# 인덱스 변경
grouped_df.swaplevel()
Year  Team  
2014  Devils    863
2015  Devils    673
2014  Kings     741
      Riders    876
2015  Riders    789
Name: Points, dtype: int64
print(grouped_df.sum(level=0))
print(grouped_df.sum(level=1))
Team
Devils    1536
Kings      741
Riders    1665
Name: Points, dtype: int64
Year
2014    2480
2015    1462
Name: Points, dtype: int64

groupby II

grouped_df = df.groupby("Team") 

for name, group in grouped_df:
    print(name)
    print(group)
Devils
   Points  Rank    Team  Year
2     863     2  Devils  2014
3     673     3  Devils  2015
Kings
   Points  Rank   Team  Year
4     741     3  Kings  2014
Riders
   Points  Rank    Team  Year
0     876     1  Riders  2014
1     789     2  Riders  2015
grouped_df.get_group("Devils")
# grouped_df["Devils"] -> 컬럼 추출 용도(에러 발생)
Points Rank Team Year
2 863 2 Devils 2014
3 673 3 Devils 2015
  • 추출된 group 정보에는 3가지 유형의 apply가 가능함
    • Aggregation: 요약된 통계정보를 추출
    • Transformation: 해당 정보를 변환
    • Filtraion: 특정 정보를 제거하여 보여주는 필터링 기능
# Aggregation
print(grouped_df.agg(sum))

import numpy as np
print(grouped_df.agg(np.mean))

print(grouped_df["Points"].agg([np.sum, np.mean, np.std]))
        Points  Rank  Year
Team                      
Devils    1536     5  4029
Kings      741     3  2014
Riders    1665     3  4029
        Points  Rank    Year
Team                        
Devils   768.0   2.5  2014.5
Kings    741.0   3.0  2014.0
Riders   832.5   1.5  2014.5
         sum   mean         std
Team                           
Devils  1536  768.0  134.350288
Kings    741  741.0         NaN
Riders  1665  832.5   61.518290
  • transform 메서드는 key 값 별이 아닌 개별 데이터 별로 변환
# Transformation
score = lambda x: x * 2
grouped_df.transform(score)
Points Rank Year
0 1752 2 4028
1 1578 4 4030
2 1726 4 4028
3 1346 6 4030
4 1482 6 4028
  • 단, max나 min 처럼 Series 데이터에 적용되는 연산들은 key값을 기준으로 적용
score = lambda x: max(x)
grouped_df.transform(score)
Points Rank Year
0 876 2 2015
1 876 2 2015
2 863 3 2015
3 863 3 2015
4 741 3 2014
# Filtration
grouped_df.filter(lambda x: len(x) >= 2) # 데이터가 2개 이상인 그룹
Points Rank Year
0 876 1 2014
1 789 2 2015
2 863 2 2014
3 673 3 2015
grouped_df.filter(lambda x: x["Points"].sum() > 1600)
Points Rank Year
0 876 1 2014
1 789 2 2015

Pivot table & Crosstab

  • Excel에서 사용
  • Index 축은 groupby와 동일
  • column에 추가로 labeling 값을 추가하여,
  • value에 numeric type 값을 aggregation 하는 형태
import dateutil
# Pivot table
df_phone = pd.read_csv("./phone_data.csv")
df_phone['date'] = df_phone['date'].apply(dateutil.parser.parse, dayfirst=True)
df_phone.head()
index date duration item month network network_type
0 0 2014-10-15 06:58:00 34.429 data 2014-11 data data
1 1 2014-10-15 06:58:00 13.000 call 2014-11 Vodafone mobile
2 2 2014-10-15 14:46:00 23.000 call 2014-11 Meteor mobile
3 3 2014-10-15 14:48:00 4.000 call 2014-11 Tesco mobile
4 4 2014-10-15 17:27:00 4.000 call 2014-11 Tesco mobile
df_phone.pivot_table(["duration"], 
                    index=[df_phone.month, df_phone.item], 
                    columns=df_phone.network, 
                    aggfunc="sum", 
                    fill_value=0)
duration
network Meteor Tesco Three Vodafone data landline special voicemail world
month item
2014-11 call 1521 4045 12458 4316 0.000 2906 0 301 0
data 0 0 0 0 998.441 0 0 0 0
sms 10 3 25 55 0.000 0 1 0 0
2014-12 call 2010 1819 6316 1302 0.000 1424 0 690 0
data 0 0 0 0 1032.870 0 0 0 0
sms 12 1 13 18 0.000 0 0 0 4
2015-01 call 2207 2904 6445 3626 0.000 1603 0 285 0
data 0 0 0 0 1067.299 0 0 0 0
sms 10 3 33 40 0.000 0 0 0 0
2015-02 call 1188 4087 6279 1864 0.000 730 0 268 0
data 0 0 0 0 1067.299 0 0 0 0
sms 1 2 11 23 0.000 0 2 0 0
2015-03 call 274 973 4966 3513 0.000 11770 0 231 0
data 0 0 0 0 998.441 0 0 0 0
sms 0 4 5 13 0.000 0 0 0 3

Crosstab

  • 두 컬럼의 교차 빈도, 비율, 덧셈 등을 구할 때 사용
  • Pivot table의 특수한 형태
  • User-item Rating Matrix 등을 만들 때 사용 가능
df_movie = pd.read_csv("./movie_rating.csv")
df_movie.head()
critic title rating
0 Jack Matthews Lady in the Water 3.0
1 Jack Matthews Snakes on a Plane 4.0
2 Jack Matthews You Me and Dupree 3.5
3 Jack Matthews Superman Returns 5.0
4 Jack Matthews The Night Listener 3.0
pd.crosstab(index=df_movie.critic, 
           columns=df_movie.title, 
           values=df_movie.rating, 
           aggfunc="first").fillna(0)
title Just My Luck Lady in the Water Snakes on a Plane Superman Returns The Night Listener You Me and Dupree
critic
Claudia Puig 3.0 0.0 3.5 4.0 4.5 2.5
Gene Seymour 1.5 3.0 3.5 5.0 3.0 3.5
Jack Matthews 0.0 3.0 4.0 5.0 3.0 3.5
Lisa Rose 3.0 2.5 3.5 3.5 3.0 2.5
Mick LaSalle 2.0 3.0 4.0 3.0 3.0 2.0
Toby 0.0 0.0 4.5 4.0 0.0 1.0
df_movie.pivot_table(["rating"], 
               index=df_movie.critic, 
               columns=df_movie.title, 
               aggfunc="first", fill_value=0)
rating
title Just My Luck Lady in the Water Snakes on a Plane Superman Returns The Night Listener You Me and Dupree
critic
Claudia Puig 3.0 0.0 3.5 4.0 4.5 2.5
Gene Seymour 1.5 3.0 3.5 5.0 3.0 3.5
Jack Matthews 0.0 3.0 4.0 5.0 3.0 3.5
Lisa Rose 3.0 2.5 3.5 3.5 3.0 2.5
Mick LaSalle 2.0 3.0 4.0 3.0 3.0 2.0
Toby 0.0 0.0 4.5 4.0 0.0 1.0

Merge & Concat

merge

df_a = pd.DataFrame({"subject_id": [1,2,3,4,5,6], 
                   "test_score": [51,15,15,61,16,14]})
df_b = pd.DataFrame({"subject_id":[4,5,6,7,8], 
                    "first_name":["Billy", "Brian", "Bran", "Bryce", "Betty"], 
                    "last_name":["Bonder", "Black", "Balwner", "Brice", "Btisan"]})

pd.merge(df_a, df_b, on="subject_id")
# 두 데이터프레임의 컬럼명이 다를 경우
pd.merge(df_a, df_b, left_on="subject_id", right_on="subject_id")
subject_id test_score first_name last_name
0 4 61 Billy Bonder
1 5 16 Brian Black
2 6 14 Bran Balwner
print(pd.merge(df_a, df_b, on="subject_id", how="left")) # left join
print(pd.merge(df_a, df_b, on="subject_id", how="right")) # right join
print(pd.merge(df_a, df_b, on="subject_id", how="outer")) # full join
print(pd.merge(df_a, df_b, on="subject_id", how="inner")) # inner join
   subject_id  test_score first_name last_name
0           1          51        NaN       NaN
1           2          15        NaN       NaN
2           3          15        NaN       NaN
3           4          61      Billy    Bonder
4           5          16      Brian     Black
5           6          14       Bran   Balwner
   subject_id  test_score first_name last_name
0           4        61.0      Billy    Bonder
1           5        16.0      Brian     Black
2           6        14.0       Bran   Balwner
3           7         NaN      Bryce     Brice
4           8         NaN      Betty    Btisan
   subject_id  test_score first_name last_name
0           1        51.0        NaN       NaN
1           2        15.0        NaN       NaN
2           3        15.0        NaN       NaN
3           4        61.0      Billy    Bonder
4           5        16.0      Brian     Black
5           6        14.0       Bran   Balwner
6           7         NaN      Bryce     Brice
7           8         NaN      Betty    Btisan
   subject_id  test_score first_name last_name
0           4          61      Billy    Bonder
1           5          16      Brian     Black
2           6          14       Bran   Balwner
# index based join
pd.merge(df_a, df_b, right_index=True, left_index=True)
subject_id_x test_score subject_id_y first_name last_name
0 1 51 4 Billy Bonder
1 2 15 5 Brian Black
2 3 15 6 Bran Balwner
3 4 61 7 Bryce Brice
4 5 16 8 Betty Btisan

concat

print(pd.concat([df_a, df_b]))
print(pd.concat([df_a, df_b], axis=1))
   subject_id  test_score first_name last_name
0           1        51.0        NaN       NaN
1           2        15.0        NaN       NaN
2           3        15.0        NaN       NaN
3           4        61.0        NaN       NaN
4           5        16.0        NaN       NaN
5           6        14.0        NaN       NaN
0           4         NaN      Billy    Bonder
1           5         NaN      Brian     Black
2           6         NaN       Bran   Balwner
3           7         NaN      Bryce     Brice
4           8         NaN      Betty    Btisan
   subject_id  test_score  subject_id first_name last_name
0           1          51         4.0      Billy    Bonder
1           2          15         5.0      Brian     Black
2           3          15         6.0       Bran   Balwner
3           4          61         7.0      Bryce     Brice
4           5          16         8.0      Betty    Btisan
5           6          14         NaN        NaN       NaN

Persistence

  • pickle persistence
  • 가장 일반적인 python 파일 persistence
  • to_pickle, read_pickle 함수 사용
df = pd.DataFrame({"Points": [876,789,863,673,741], 
                 "Rank": [1,2,2,3,3], 
                 "Team": ["Riders", "Riders", "Devils", "Devils", "Kings"], 
                 "Year": [2014, 2015, 2014, 2015, 2014]})
print(df)
   Points  Rank    Team  Year
0     876     1  Riders  2014
1     789     2  Riders  2015
2     863     2  Devils  2014
3     673     3  Devils  2015
4     741     3   Kings  2014
df.to_pickle("./df_example.pickle")
df_pickle = pd.read_pickle("./df_example.pickle")
df_pickle
Points Rank Team Year
0 876 1 Riders 2014
1 789 2 Riders 2015
2 863 2 Devils 2014
3 673 3 Devils 2015
4 741 3 Kings 2014


Categories: ,

Updated:

Leave a comment