[AITech] 20220121 - Pandas

15 minute read

강의 복습 내용

Pandas I

Data loading

import pandas as pd

# Load the whitespace-delimited housing data, which has no header row.
# The separator is a raw string so that "\s" stays a regex escape; in a plain
# literal it is an invalid string escape (SyntaxWarning on Python 3.12+).
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

df_data.head()

	0	1	2	4	5	6	7	8	9	10	11	12	13
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222.0	18.7	396.90	5.33	36.2

Series

Series(data, index, dtype): Series 객체를 생성

from pandas import Series, DataFrame
import pandas as pd
import numpy as np
'''Series 객체 생성하기'''
# Build a Series from a list
list_data = [1,2,3,4,5]
list_name = ["a", "b", "c", "d", "e"] # with list data, the index length must equal the data length
# list_name = ["a", "b", "c", "d", "e", "f", "g", "h"] -> error (index length differs from list_data)
example_obj = Series(data=list_data, index=list_name)
print(example_obj, type(example_obj))

a    1
b    2
c    3
d    4
e    5
dtype: int64 <class 'pandas.core.series.Series'>

# 딕셔너리로 만들기
dict_data = {"a":1, "b":2, "c":3, "d":4, "e":5}
indices = ["a", "b", "c", "d", "e", "f", "g", "h"] # data 길이와 달라도 index 기준
example_obj = Series(data=dict_data, index=indices, dtype=np.float32, name="example_data")
print(example_obj)

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: example_data, dtype: float32

'''인덱스 접근'''
print(example_obj["a"])
example_obj["a"] = 3.5
print(example_obj)

1.0
a    3.5
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: example_data, dtype: float32

'''Series 프로퍼티'''
# values
print(example_obj.values)
# index
print(example_obj.index)
# name
example_obj.name = "number"
example_obj.index.name = "alphabet"
print(example_obj)

[3.5 2.  3.  4.  5.  nan nan nan]
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype='object')
alphabet
a    3.5
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
Name: number, dtype: float32

DataFrame

DataFrame(data, columns, index)

'''딕셔너리 만들기'''
# 딕셔너리로 데이터프레임 생성
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
           'last_name': ['Miller', 'Jacobson', 'All', 'Milner', 'Cooze'], 
           'age': [42, 52, 36, 24, 73], 
           'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(data=raw_data)
# df = pd.DataFrame(data=raw_data, columns=['first_name', 'last_name', 'age', 'city'])
print(df)

  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston

# column 설정
print(DataFrame(raw_data, columns=["age", "city"]))
print(DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city','debt']))

   age           city
 42  San Francisco
 52      Baltimore
 36          Miami
 24        Douglas
 73         Boston
  first_name last_name  age           city debt
    Jason    Miller   42  San Francisco  NaN
    Molly  Jacobson   52      Baltimore  NaN
     Tina       All   36          Miami  NaN
     Jake    Milner   24        Douglas  NaN
      Amy     Cooze   73         Boston  NaN

'''인덱싱'''
# column 추출(Series 추출)
print(df.first_name) # df["first_name"]

# loc: 인덱스 이름
df.index = [3,4,0,1,2]
print(df)
print(df.loc[:2])

# iloc: 인덱스 순서
print(df.iloc[:2])

  Jason
  Molly
   Tina
   Jake
    Amy
Name: first_name, dtype: object
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore

'''데이터프레임 핸들링'''
# Add a new column
df["debt"] = df.age > 40 # attribute access (df.debt) only works once the column already exists
print(df)

# Delete a column
del df["debt"] # alternative: df.drop("debt", axis=1, inplace=True)
print(df)

  first_name last_name  age           city   debt
    Jason    Miller   42  San Francisco   True
    Molly  Jacobson   52      Baltimore   True
     Tina       All   36          Miami  False
     Jake    Milner   24        Douglas  False
      Amy     Cooze   73         Boston   True
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston

# transpose
print(df.T)

                        3          4      0        1       2
first_name          Jason      Molly   Tina     Jake     Amy
last_name          Miller   Jacobson    All   Milner   Cooze
age                    42         52     36       24      73
city        San Francisco  Baltimore  Miami  Douglas  Boston

# 값 출력
print(df.values, type(df.values))

[['Jason' 'Miller' 42 'San Francisco']
 ['Molly' 'Jacobson' 52 'Baltimore']
 ['Tina' 'All' 36 'Miami']
 ['Jake' 'Milner' 24 'Douglas']
 ['Amy' 'Cooze' 73 'Boston']] <class 'numpy.ndarray'>

# csv 변환
print(df.to_csv())

,first_name,last_name,age,city
3,Jason,Miller,42,San Francisco
4,Molly,Jacobson,52,Baltimore
0,Tina,All,36,Miami
1,Jake,Milner,24,Douglas
2,Amy,Cooze,73,Boston

selection & drop

'''Selection with column names'''
# column 한 개
print(df["age"].head(3))
# column 여러 개
print(df[["first_name", "last_name", "age"]].head(3))

  42
  52
  36
Name: age, dtype: int64
  first_name last_name  age
    Jason    Miller   42
    Molly  Jacobson   52
     Tina       All   36

'''Selection with index number'''
# 인덱스 이름
print(df.loc[:2])
# 인덱스 순서
print(df.iloc[:2]) # df[:2]
print(df.iloc[[0,2,4]])
# 열과 함께 선택
print(df["first_name"][:2])
# boolean 인덱스
print(df[df.age > 40])

  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
     Tina       All   36          Miami
      Amy     Cooze   73         Boston
  Jason
  Molly
Name: first_name, dtype: object
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
      Amy     Cooze   73         Boston

'''Selection with column&index'''
# column name & index order
print(df[["first_name", "last_name"]][:3])
# index name & column name
print(df.loc[[3,4],["first_name", "last_name"]])
# index order & column order
print(df.iloc[:2,:3])

  first_name last_name
    Jason    Miller
    Molly  Jacobson
     Tina       All
  first_name last_name
    Jason    Miller
    Molly  Jacobson
  first_name last_name  age
    Jason    Miller   42
    Molly  Jacobson   52

'''data drop'''
print(df)
# index name으로 drop
print(df.drop(1, axis=0))
print(df.drop([0,1,2,3],axis=0))
# column name으로 drop
print(df.drop("city", axis=1))

  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
     Jake    Milner   24        Douglas
      Amy     Cooze   73         Boston
  first_name last_name  age           city
    Jason    Miller   42  San Francisco
    Molly  Jacobson   52      Baltimore
     Tina       All   36          Miami
      Amy     Cooze   73         Boston
  first_name last_name  age       city
    Molly  Jacobson   52  Baltimore
  first_name last_name  age
    Jason    Miller   42
    Molly  Jacobson   52
     Tina       All   36
     Jake    Milner   24
      Amy     Cooze   73

dataframe operations

'''Series operation'''
# index 기준으로 연산수행, 겹치는 index가 없을 경우 NaN값으로 반환
s1 = Series(range(1,6), index=list("abcde"))
s2 = Series(range(5,11), index=list("bcedef"))
print(s1, s2, sep='\n')
print(s1.add(s2)) # s1+s2

a    1
b    2
c    3
d    4
e    5
dtype: int64
b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64
a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

'''dataframe operation'''
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc"))
df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
print(df1, df2, sep='\n')
print(df1.add(df2, fill_value=0)) # fill_value

   a  b  c
0  1  2
3  4  5
6  7  8
    a   b   c   d
 0   1   2   3
 4   5   6   7
 8   9  10  11
12  13  14  15
      a     b     c     d
 0.0   2.0   4.0   3.0
 7.0   9.0  11.0   7.0
14.0  16.0  18.0  11.0
12.0  13.0  14.0  15.0

'''series+dataframe'''
df = DataFrame(np.arange(16).reshape(4,4), columns=list('abcd'))
s2 = Series(np.arange(10,14))
print(df, s2, sep='\n')
print(df.add(s2, axis=0))

    a   b   c   d
 0   1   2   3
 4   5   6   7
 8   9  10  11
12  13  14  15
  10
  11
  12
  13
dtype: int32
    a   b   c   d
10  11  12  13
15  16  17  18
20  21  22  23
25  26  27  28

lambda, map, apply

'''lambda, map'''
# 연산 적용
s1 = Series(np.arange(10))
print(s1.head(5))
print(s1.map(lambda x:x**2).head())
# dict type으로 데이터 치환(없는 값은 NaN)
z = {1:'A', 2:'B', 3:'C'}
print(s1.map(z).head())
# 같은 위치의 데이터를 s2로 교체
s2 = Series(np.arange(10, 20))
print(s1.map(s2).head())

  0
  1
  2
  3
  4
dtype: int32
   0
   1
   4
   9
  16
dtype: int64
  NaN
    A
    B
    C
  NaN
dtype: object
  10
  11
  12
  13
  14
dtype: int32

'''replace'''
# map 함수의 기능 중 '데이터 변환' 기능만 담당
raw_data = {'earn': [79571, 96396, 48710], 
           'height': [73.89, 66.23, 63.77], 
           'sex': ["male", "female", "female"], 
           'race': ['white', 'white', 'black'], 
           'age': [49, 62, 33]}
df = pd.DataFrame(data=raw_data)
print(df.sex.replace({"male":0, "female":1}))

  0
  1
  1
Name: sex, dtype: int64

'''apply'''
# map과 달리, series 전체(column)에 해당 함수를 적용
df_info = df[["earn", "height", "age"]]
print(df_info.head())
f = lambda x: x.max() - x.min()
print(df_info.apply(f)) # 각 column 별로 결과값 반환

# sum, mean, std 등의 내장 연산 함수도 사용 가능
print(df_info.sum())
print(df_info.apply(sum))

# Series값으로 반환도 가능
def f(x):
    """Return a two-entry Series holding the minimum and maximum of ``x``.

    The result is labelled "min" and "max", so applying this to a DataFrame
    produces one row per label and one column per input column.
    """
    labels = ["min", "max"]
    lowest = x.min()
    highest = x.max()
    return Series([lowest, highest], index=labels)
print(df_info.apply(f))

    earn  height  age
0  79571   73.89   49
1  96396   66.23   62
2  48710   63.77   33
earn      47686.00
height       10.12
age          29.00
dtype: float64
earn      224677.00
height       203.89
age          144.00
dtype: float64
earn      224677.00
height       203.89
age          144.00
dtype: float64
      earn  height  age
min  48710   63.77   33
max  96396   73.89   62

Pandas built-in functions

# describe 
print(df.describe()) # object type은 제외

               earn     height        age
count      3.000000   3.000000   3.000000
mean   74892.333333  67.963333  48.000000
std    24184.831824   5.277967  14.525839
min    48710.000000  63.770000  33.000000
25%    64140.500000  65.000000  41.000000
50%    79571.000000  66.230000  49.000000
75%    87983.500000  70.060000  55.500000
max    96396.000000  73.890000  62.000000

# unique
print(df.sex.unique())

['male' 'female']

# Basic reductions
print(df.sum(axis=0))  # column-wise; string columns are concatenated
# Row-wise mean over the numeric columns only. pandas >= 2.0 no longer drops
# the string columns (sex, race) silently, so numeric_only must be explicit.
print(df.mean(axis=1, numeric_only=True))

earn                224677
height              203.89
sex       malefemalefemale
race       whitewhiteblack
age                    144
dtype: object
0    26564.630000
1    32174.743333
2    16268.923333
dtype: float64

# isnull
print(df.isnull())
print(df.isnull().sum())

    earn  height    sex   race    age
0  False   False  False  False  False
1  False   False  False  False  False
2  False   False  False  False  False
earn      0
height    0
sex       0
race      0
age       0
dtype: int64

# sort_values
print(df.sort_values(["age", "earn"], ascending=True))

    earn  height     sex   race  age
48710   63.77  female  black   33
79571   73.89    male  white   49
96396   66.23  female  white   62

# correlation & covariance
print(df.age.corr(df.earn))
print(df.age.cov(df.earn))
# The frame also holds string columns (sex, race). Since pandas 2.0 the
# DataFrame-level methods default to numeric_only=False and fail on them,
# so restrict the computation to the numeric columns explicitly.
print(df.corrwith(df.earn, numeric_only=True))
print(df.corr(numeric_only=True))

0.9941006402346648
349232.5
earn      1.000000
height    0.392674
age       0.994101
dtype: float64
            earn    height       age
earn    1.000000  0.392674  0.994101
height  0.392674  1.000000  0.290608
age     0.994101  0.290608  1.000000

###

2. Pandas II

groupby I

split -> apply -> combine

df = {"Points": [876,789,863,673,741], 
     "Rank": [1,2,2,3,3], 
     "Team": ["Riders", "Riders", "Devils", "Devils", "Kings"], 
     "Year": [2014, 2015, 2014, 2015, 2014]}
df = pd.DataFrame(df)
print(df)

   Points  Rank    Team  Year
   876     1  Riders  2014
   789     2  Riders  2015
   863     2  Devils  2014
   673     3  Devils  2015
   741     3   Kings  2014

print(df.groupby("Team"))
print(pd.DataFrame(df.groupby("Team")))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EEA6650F40>
        0                                                  1
Devils     Points  Rank    Team  Year
   863     2 ...
 Kings     Points  Rank   Team  Year
   741     3  ...
Riders     Points  Rank    Team  Year
   876     1 ...

print(df.groupby("Team")["Points"])
print(pd.Series(df.groupby("Team")["Points"]))

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001EEA6647D60>
0    (Devils, [863, 673])
1          (Kings, [741])
2    (Riders, [876, 789])
dtype: object

df.groupby("Team")["Points"].sum() # 묶을 기준, 연산을 적용할 컬럼, 적용할 연산

Team
Devils    1536
Kings      741
Riders    1665
Name: Points, dtype: int64

df.groupby(["Team", "Year"])["Points"].sum()

Team    Year
Devils  2014    863
        2015    673
Kings   2014    741
Riders  2014    876
        2015    789
Name: Points, dtype: int64

df.groupby(["Year", "Team"])["Points"].sum() 

Year  Team  
2014  Devils    863
      Kings     741
      Riders    876
2015  Devils    673
      Riders    789
Name: Points, dtype: int64

grouped_df = df.groupby(["Team", "Year"])["Points"].sum()
grouped_df.index

MultiIndex([('Devils', 2014),
            ('Devils', 2015),
            ( 'Kings', 2014),
            ('Riders', 2014),
            ('Riders', 2015)],
           names=['Team', 'Year'])

# 인덱싱
grouped_df["Devils":"Kings"]

Team    Year
Devils  2014    863
        2015    673
Kings   2014    741
Name: Points, dtype: int64

# 묶여진 데이터를 matrix 형태로 전환
grouped_df.unstack()

Year	2014	2015
Team
Devils	863.0	673.0
Kings	741.0	NaN
Riders	876.0	789.0

# 인덱스 변경
grouped_df.swaplevel()

Year  Team  
2014  Devils    863
2015  Devils    673
2014  Kings     741
      Riders    876
2015  Riders    789
Name: Points, dtype: int64

# Sum per MultiIndex level (0 = Team, 1 = Year). The level= argument of
# Series.sum was removed in pandas 2.0; grouping on the level is the
# supported equivalent and prints the same result.
print(grouped_df.groupby(level=0).sum())
print(grouped_df.groupby(level=1).sum())

Team
Devils    1536
Kings      741
Riders    1665
Name: Points, dtype: int64
Year
2014    2480
2015    1462
Name: Points, dtype: int64

groupby II

grouped_df = df.groupby("Team") 

for name, group in grouped_df:
    print(name)
    print(group)

Devils
   Points  Rank    Team  Year
2     863     2  Devils  2014
3     673     3  Devils  2015
Kings
   Points  Rank   Team  Year
4     741     3  Kings  2014
Riders
   Points  Rank    Team  Year
0     876     1  Riders  2014
1     789     2  Riders  2015

grouped_df.get_group("Devils")
# grouped_df["Devils"] -> 컬럼 추출 용도(에러 발생)

	Points	Rank	Team	Year
2	863	2	Devils	2014
3	673	3	Devils	2015

추출된 group 정보에는 3가지 유형의 apply가 가능함
- Aggregation: 요약된 통계정보를 추출
- Transformation: 해당 정보를 변환
- Filtration: 특정 정보를 제거하여 보여주는 필터링 기능

# Aggregation
# Aggregations are named by string: passing the builtin sum or NumPy callables
# (np.sum, np.mean, np.std) to agg() is deprecated since pandas 2.1. The string
# forms give the same values and the same column labels.
print(grouped_df.agg("sum"))

import numpy as np
print(grouped_df.agg("mean"))

print(grouped_df["Points"].agg(["sum", "mean", "std"]))

        Points  Rank  Year
Team                      
Devils    1536     5  4029
Kings      741     3  2014
Riders    1665     3  4029
        Points  Rank    Year
Team                        
Devils   768.0   2.5  2014.5
Kings    741.0   3.0  2014.0
Riders   832.5   1.5  2014.5
         sum   mean         std
Team                           
Devils  1536  768.0  134.350288
Kings    741  741.0         NaN
Riders  1665  832.5   61.518290

transform 메서드는 key 값 별이 아닌 개별 데이터 별로 변환

# Transformation
score = lambda x: x * 2
grouped_df.transform(score)

	Points	Rank	Year
0	1752	2	4028
1	1578	4	4030
2	1726	4	4028
3	1346	6	4030
4	1482	6	4028

단, max나 min 처럼 Series 데이터에 적용되는 연산들은 key값을 기준으로 적용

score = lambda x: max(x)
grouped_df.transform(score)

	Points	Rank	Year
0	876	2	2015
1	876	2	2015
2	863	3	2015
3	863	3	2015
4	741	3	2014

# Filtration
grouped_df.filter(lambda x: len(x) >= 2) # 데이터가 2개 이상인 그룹

	Points	Rank	Year
0	876	1	2014
1	789	2	2015
2	863	2	2014
3	673	3	2015

grouped_df.filter(lambda x: x["Points"].sum() > 1600)

	Points	Rank	Year
0	876	1	2014
1	789	2	2015

Pivot table & Crosstab

Excel에서 사용
Index 축은 groupby와 동일
column에 추가로 labeling 값을 추가하여,
value에 numeric type 값을 aggregation 하는 형태

# Import the submodule explicitly: a bare "import dateutil" does not by itself
# guarantee that dateutil.parser has been loaded.
import dateutil.parser
# Pivot table
df_phone = pd.read_csv("./phone_data.csv")
# Parse the date strings day-first (e.g. 15/10/14 -> 2014-10-15).
df_phone['date'] = df_phone['date'].apply(dateutil.parser.parse, dayfirst=True)
df_phone.head()

	index	date	duration	item	month	network	network_type
0	0	2014-10-15 06:58:00	34.429	data	2014-11	data	data
1	1	2014-10-15 06:58:00	13.000	call	2014-11	Vodafone	mobile
2	2	2014-10-15 14:46:00	23.000	call	2014-11	Meteor	mobile
3	3	2014-10-15 14:48:00	4.000	call	2014-11	Tesco	mobile
4	4	2014-10-15 17:27:00	4.000	call	2014-11	Tesco	mobile

df_phone.pivot_table(["duration"], 
                    index=[df_phone.month, df_phone.item], 
                    columns=df_phone.network, 
                    aggfunc="sum", 
                    fill_value=0)

		duration
	network	Meteor	Tesco	Three	Vodafone	data	landline	special	voicemail	world
month	item
2014-11	call	1521	4045	12458	4316	0.000	2906	0	301	0
	data	0	0	0	0	998.441	0	0	0	0
	sms	10	3	25	55	0.000	0	1	0	0
2014-12	call	2010	1819	6316	1302	0.000	1424	0	690	0
	data	0	0	0	0	1032.870	0	0	0	0
	sms	12	1	13	18	0.000	0	0	0	4
2015-01	call	2207	2904	6445	3626	0.000	1603	0	285	0
	data	0	0	0	0	1067.299	0	0	0	0
	sms	10	3	33	40	0.000	0	0	0	0
2015-02	call	1188	4087	6279	1864	0.000	730	0	268	0
	data	0	0	0	0	1067.299	0	0	0	0
	sms	1	2	11	23	0.000	0	2	0	0
2015-03	call	274	973	4966	3513	0.000	11770	0	231	0
	data	0	0	0	0	998.441	0	0	0	0
	sms	0	4	5	13	0.000	0	0	0	3

Crosstab

두 컬럼의 교차 빈도, 비율, 덧셈 등을 구할 때 사용
Pivot table의 특수한 형태
User-item Rating Matrix 등을 만들 때 사용 가능

df_movie = pd.read_csv("./movie_rating.csv")
df_movie.head()

	critic	title	rating
0	Jack Matthews	Lady in the Water	3.0
1	Jack Matthews	Snakes on a Plane	4.0
2	Jack Matthews	You Me and Dupree	3.5
3	Jack Matthews	Superman Returns	5.0
4	Jack Matthews	The Night Listener	3.0

pd.crosstab(index=df_movie.critic, 
           columns=df_movie.title, 
           values=df_movie.rating, 
           aggfunc="first").fillna(0)

title	Just My Luck	Lady in the Water	Snakes on a Plane	Superman Returns	The Night Listener	You Me and Dupree
critic
Claudia Puig	3.0	0.0	3.5	4.0	4.5	2.5
Gene Seymour	1.5	3.0	3.5	5.0	3.0	3.5
Jack Matthews	0.0	3.0	4.0	5.0	3.0	3.5
Lisa Rose	3.0	2.5	3.5	3.5	3.0	2.5
Mick LaSalle	2.0	3.0	4.0	3.0	3.0	2.0
Toby	0.0	0.0	4.5	4.0	0.0	1.0

df_movie.pivot_table(["rating"], 
               index=df_movie.critic, 
               columns=df_movie.title, 
               aggfunc="first", fill_value=0)

	rating
title	Just My Luck	Lady in the Water	Snakes on a Plane	Superman Returns	The Night Listener	You Me and Dupree
critic
Claudia Puig	3.0	0.0	3.5	4.0	4.5	2.5
Gene Seymour	1.5	3.0	3.5	5.0	3.0	3.5
Jack Matthews	0.0	3.0	4.0	5.0	3.0	3.5
Lisa Rose	3.0	2.5	3.5	3.5	3.0	2.5
Mick LaSalle	2.0	3.0	4.0	3.0	3.0	2.0
Toby	0.0	0.0	4.5	4.0	0.0	1.0

Merge & Concat

merge

df_a = pd.DataFrame({"subject_id": [1,2,3,4,5,6], 
                   "test_score": [51,15,15,61,16,14]})
df_b = pd.DataFrame({"subject_id":[4,5,6,7,8], 
                    "first_name":["Billy", "Brian", "Bran", "Bryce", "Betty"], 
                    "last_name":["Bonder", "Black", "Balwner", "Brice", "Btisan"]})

pd.merge(df_a, df_b, on="subject_id")
# 두 데이터프레임의 컬럼명이 다를 경우
pd.merge(df_a, df_b, left_on="subject_id", right_on="subject_id")

	subject_id	test_score	first_name	last_name
0	4	61	Billy	Bonder
1	5	16	Brian	Black
2	6	14	Bran	Balwner

print(pd.merge(df_a, df_b, on="subject_id", how="left")) # left join
print(pd.merge(df_a, df_b, on="subject_id", how="right")) # right join
print(pd.merge(df_a, df_b, on="subject_id", how="outer")) # full join
print(pd.merge(df_a, df_b, on="subject_id", how="inner")) # inner join

   subject_id  test_score first_name last_name
         1          51        NaN       NaN
         2          15        NaN       NaN
         3          15        NaN       NaN
         4          61      Billy    Bonder
         5          16      Brian     Black
         6          14       Bran   Balwner
   subject_id  test_score first_name last_name
         4        61.0      Billy    Bonder
         5        16.0      Brian     Black
         6        14.0       Bran   Balwner
         7         NaN      Bryce     Brice
         8         NaN      Betty    Btisan
   subject_id  test_score first_name last_name
         1        51.0        NaN       NaN
         2        15.0        NaN       NaN
         3        15.0        NaN       NaN
         4        61.0      Billy    Bonder
         5        16.0      Brian     Black
         6        14.0       Bran   Balwner
         7         NaN      Bryce     Brice
         8         NaN      Betty    Btisan
   subject_id  test_score first_name last_name
         4          61      Billy    Bonder
         5          16      Brian     Black
         6          14       Bran   Balwner

# index based join
pd.merge(df_a, df_b, right_index=True, left_index=True)

	subject_id_x	test_score	subject_id_y	first_name	last_name
0	1	51	4	Billy	Bonder
1	2	15	5	Brian	Black
2	3	15	6	Bran	Balwner
3	4	61	7	Bryce	Brice
4	5	16	8	Betty	Btisan

concat

print(pd.concat([df_a, df_b]))
print(pd.concat([df_a, df_b], axis=1))

   subject_id  test_score first_name last_name
         1        51.0        NaN       NaN
         2        15.0        NaN       NaN
         3        15.0        NaN       NaN
         4        61.0        NaN       NaN
         5        16.0        NaN       NaN
         6        14.0        NaN       NaN
         4         NaN      Billy    Bonder
         5         NaN      Brian     Black
         6         NaN       Bran   Balwner
         7         NaN      Bryce     Brice
         8         NaN      Betty    Btisan
   subject_id  test_score  subject_id first_name last_name
         1          51         4.0      Billy    Bonder
         2          15         5.0      Brian     Black
         3          15         6.0       Bran   Balwner
         4          61         7.0      Bryce     Brice
         5          16         8.0      Betty    Btisan
         6          14         NaN        NaN       NaN

Persistence

pickle persistence
가장 일반적인 python 파일 persistence
to_pickle, read_pickle 함수 사용

df = pd.DataFrame({"Points": [876,789,863,673,741], 
                 "Rank": [1,2,2,3,3], 
                 "Team": ["Riders", "Riders", "Devils", "Devils", "Kings"], 
                 "Year": [2014, 2015, 2014, 2015, 2014]})
print(df)

   Points  Rank    Team  Year
   876     1  Riders  2014
   789     2  Riders  2015
   863     2  Devils  2014
   673     3  Devils  2015
   741     3   Kings  2014

df.to_pickle("./df_example.pickle")
df_pickle = pd.read_pickle("./df_example.pickle")
df_pickle

	Points	Rank	Team	Year
0	876	1	Riders	2014
1	789	2	Riders	2015
2	863	2	Devils	2014
3	673	3	Devils	2015
4	741	3	Kings	2014

Share on

Twitter Facebook LinkedIn

wowo0709

[AITech] 20220121 - Pandas

강의 복습 내용

Pandas I

Data loading

Series

DataFrame

selection & drop

dataframe operations

lambda, map, apply

Pandas built-in functions

2. Pandas II

groupby I

groupby II

Pivot table & Crosstab

Merge & Concat

Persistence

Share on

Leave a comment

You may also enjoy

[Python] Effective Python CH 2. 리스트와 딕셔너리 - 1

[Python] Effective Python CH 1. 파이썬답게 생각하기 - 2

[Python] Effective Python CH 1. 파이썬답게 생각하기 - 1

[Python] Effective Python 전체 목차