[AITech] 20220121 - Pandas
강의 복습 내용
Pandas I
Data loading
import pandas as pd
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
df_data = pd.read_csv(data_url, sep='\s+', header=None)
df_data.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
Series
Series(data, index, dtype)
: Series 객체를 생성
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
'''Series 객체 생성하기'''
# 리스트로 만들기
list_data = [1,2,3,4,5]
list_name = ["a", "b", "c", "d", "e"] # 리스트로 생성 시 data 길이와 index 길이 같아야 함
# list=name = ["a", "b", "c", "d", "e", "f", "g", "h"] -> error!!!
example_obj = Series(data=list_data, index=list_name)
print(example_obj, type(example_obj))
a 1
b 2
c 3
d 4
e 5
dtype: int64 <class 'pandas.core.series.Series'>
# 딕셔너리로 만들기
dict_data = {"a":1, "b":2, "c":3, "d":4, "e":5}
indices = ["a", "b", "c", "d", "e", "f", "g", "h"] # data 길이와 달라도 index 기준
example_obj = Series(data=dict_data, index=indices, dtype=np.float32, name="example_data")
print(example_obj)
a 1.0
b 2.0
c 3.0
d 4.0
e 5.0
f NaN
g NaN
h NaN
Name: example_data, dtype: float32
'''인덱스 접근'''
print(example_obj["a"])
example_obj["a"] = 3.5
print(example_obj)
1.0
a 3.5
b 2.0
c 3.0
d 4.0
e 5.0
f NaN
g NaN
h NaN
Name: example_data, dtype: float32
'''Series 프로퍼티'''
# values
print(example_obj.values)
# index
print(example_obj.index)
# name
example_obj.name = "number"
example_obj.index.name = "alphabet"
print(example_obj)
[3.5 2. 3. 4. 5. nan nan nan]
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype='object')
alphabet
a 3.5
b 2.0
c 3.0
d 4.0
e 5.0
f NaN
g NaN
h NaN
Name: number, dtype: float32
DataFrame
- DataFrame(data, columns, index)
'''딕셔너리 만들기'''
# 딕셔너리로 데이터프레임 생성
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'All', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(data=raw_data)
# df = pd.DataFrame(data=raw_data, columns=['first_name', 'last_name', 'age', 'city'])
print(df)
first_name last_name age city
0 Jason Miller 42 San Francisco
1 Molly Jacobson 52 Baltimore
2 Tina All 36 Miami
3 Jake Milner 24 Douglas
4 Amy Cooze 73 Boston
# column 설정
print(DataFrame(raw_data, columns=["age", "city"]))
print(DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city','debt']))
age city
0 42 San Francisco
1 52 Baltimore
2 36 Miami
3 24 Douglas
4 73 Boston
first_name last_name age city debt
0 Jason Miller 42 San Francisco NaN
1 Molly Jacobson 52 Baltimore NaN
2 Tina All 36 Miami NaN
3 Jake Milner 24 Douglas NaN
4 Amy Cooze 73 Boston NaN
'''인덱싱'''
# column 추출(Series 추출)
print(df.first_name) # df["first_name"]
# loc: 인덱스 이름
df.index = [3,4,0,1,2]
print(df)
print(df.loc[:2])
# iloc: 인덱스 순서
print(df.iloc[:2])
0 Jason
1 Molly
2 Tina
3 Jake
4 Amy
Name: first_name, dtype: object
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
1 Jake Milner 24 Douglas
2 Amy Cooze 73 Boston
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
1 Jake Milner 24 Douglas
2 Amy Cooze 73 Boston
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
'''데이터프레임 핸들링'''
# 새로운 column 추가
df["debt"] = df.age > 40 # df.debt를 사용하려면 이미 그 열이 있어야만 함
print(df)
# column 삭제
del df["debt"] # df.drop(debt, axis=1, inplace=True)
print(df)
first_name last_name age city debt
3 Jason Miller 42 San Francisco True
4 Molly Jacobson 52 Baltimore True
0 Tina All 36 Miami False
1 Jake Milner 24 Douglas False
2 Amy Cooze 73 Boston True
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
1 Jake Milner 24 Douglas
2 Amy Cooze 73 Boston
# transpose
print(df.T)
3 4 0 1 2
first_name Jason Molly Tina Jake Amy
last_name Miller Jacobson All Milner Cooze
age 42 52 36 24 73
city San Francisco Baltimore Miami Douglas Boston
# 값 출력
print(df.values, type(df.values))
[['Jason' 'Miller' 42 'San Francisco']
['Molly' 'Jacobson' 52 'Baltimore']
['Tina' 'All' 36 'Miami']
['Jake' 'Milner' 24 'Douglas']
['Amy' 'Cooze' 73 'Boston']] <class 'numpy.ndarray'>
# csv 변환
print(df.to_csv())
,first_name,last_name,age,city
3,Jason,Miller,42,San Francisco
4,Molly,Jacobson,52,Baltimore
0,Tina,All,36,Miami
1,Jake,Milner,24,Douglas
2,Amy,Cooze,73,Boston
selection & drop
'''Selection with column names'''
# column 한 개
print(df["age"].head(3))
# column 여러 개
print(df[["first_name", "last_name", "age"]].head(3))
3 42
4 52
0 36
Name: age, dtype: int64
first_name last_name age
3 Jason Miller 42
4 Molly Jacobson 52
0 Tina All 36
'''Selection with index number'''
# 인덱스 이름
print(df.loc[:2])
# 인덱스 순서
print(df.iloc[:2]) # df[:2]
print(df.iloc[[0,2,4]])
# 열과 함께 선택
print(df["first_name"][:2])
# boolean 인덱스
print(df[df.age > 40])
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
1 Jake Milner 24 Douglas
2 Amy Cooze 73 Boston
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
first_name last_name age city
3 Jason Miller 42 San Francisco
0 Tina All 36 Miami
2 Amy Cooze 73 Boston
3 Jason
4 Molly
Name: first_name, dtype: object
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
2 Amy Cooze 73 Boston
'''Selection with column&index'''
# column name & index order
print(df[["first_name", "last_name"]][:3])
# index name & column name
print(df.loc[[3,4],["first_name", "last_name"]])
# index order & column order
print(df.iloc[:2,:3])
first_name last_name
3 Jason Miller
4 Molly Jacobson
0 Tina All
first_name last_name
3 Jason Miller
4 Molly Jacobson
first_name last_name age
3 Jason Miller 42
4 Molly Jacobson 52
'''data drop'''
print(df)
# index name으로 drop
print(df.drop(1, axis=0))
print(df.drop([0,1,2,3],axis=0))
# column name으로 drop
print(df.drop("city", axis=1))
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
1 Jake Milner 24 Douglas
2 Amy Cooze 73 Boston
first_name last_name age city
3 Jason Miller 42 San Francisco
4 Molly Jacobson 52 Baltimore
0 Tina All 36 Miami
2 Amy Cooze 73 Boston
first_name last_name age city
4 Molly Jacobson 52 Baltimore
first_name last_name age
3 Jason Miller 42
4 Molly Jacobson 52
0 Tina All 36
1 Jake Milner 24
2 Amy Cooze 73
dataframe operations
'''Series operation'''
# index 기준으로 연산수행, 겹치는 index가 없을 경우 NaN값으로 반환
s1 = Series(range(1,6), index=list("abcde"))
s2 = Series(range(5,11), index=list("bcedef"))
print(s1, s2, sep='\n')
print(s1.add(s2)) # s1+s2
a 1
b 2
c 3
d 4
e 5
dtype: int64
b 5
c 6
e 7
d 8
e 9
f 10
dtype: int64
a NaN
b 7.0
c 9.0
d 12.0
e 12.0
e 14.0
f NaN
dtype: float64
'''dataframe operation'''
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc"))
df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
print(df1, df2, sep='\n')
print(df1.add(df2, fill_value=0)) # fill_value
a b c
0 0 1 2
1 3 4 5
2 6 7 8
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
a b c d
0 0.0 2.0 4.0 3.0
1 7.0 9.0 11.0 7.0
2 14.0 16.0 18.0 11.0
3 12.0 13.0 14.0 15.0
'''series+dataframe'''
df = DataFrame(np.arange(16).reshape(4,4), columns=list('abcd'))
s2 = Series(np.arange(10,14))
print(df, s2, sep='\n')
print(df.add(s2, axis=0))
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
0 10
1 11
2 12
3 13
dtype: int32
a b c d
0 10 11 12 13
1 15 16 17 18
2 20 21 22 23
3 25 26 27 28
lambda, map, apply
'''lambda, map'''
# 연산 적용
s1 = Series(np.arange(10))
print(s1.head(5))
print(s1.map(lambda x:x**2).head())
# dict type으로 데이터 치환(없는 값은 NaN)
z = {1:'A', 2:'B', 3:'C'}
print(s1.map(z).head())
# 같은 위치의 데이터를 s2로 교체
s2 = Series(np.arange(10, 20))
print(s1.map(s2).head())
0 0
1 1
2 2
3 3
4 4
dtype: int32
0 0
1 1
2 4
3 9
4 16
dtype: int64
0 NaN
1 A
2 B
3 C
4 NaN
dtype: object
0 10
1 11
2 12
3 13
4 14
dtype: int32
'''replace'''
# map 함수의 기능 중 '데이터 변환' 기능만 담당
raw_data = {'earn': [79571, 96396, 48710],
'height': [73.89, 66.23, 63.77],
'sex': ["male", "female", "female"],
'race': ['white', 'white', 'black'],
'age': [49, 62, 33]}
df = pd.DataFrame(data=raw_data)
print(df.sex.replace({"male":0, "female":1}))
0 0
1 1
2 1
Name: sex, dtype: int64
'''apply'''
# map과 달리, series 전체(column)에 해당 함수를 적용
df_info = df[["earn", "height", "age"]]
print(df_info.head())
f = lambda x: x.max() - x.min()
print(df_info.apply(f)) # 각 column 별로 결과값 반환
# sum, mean, std 등의 내장 연산 함수도 사용 가능
print(df_info.sum())
print(df_info.apply(sum))
# Series값으로 반환도 가능
def f(x):
return Series([x.min(), x.max()], index=["min", "max"])
print(df_info.apply(f))
earn height age
0 79571 73.89 49
1 96396 66.23 62
2 48710 63.77 33
earn 47686.00
height 10.12
age 29.00
dtype: float64
earn 224677.00
height 203.89
age 144.00
dtype: float64
earn 224677.00
height 203.89
age 144.00
dtype: float64
earn height age
min 48710 63.77 33
max 96396 73.89 62
Pandas built-in functions
# describe
print(df.describe()) # object type은 제외
earn height age
count 3.000000 3.000000 3.000000
mean 74892.333333 67.963333 48.000000
std 24184.831824 5.277967 14.525839
min 48710.000000 63.770000 33.000000
25% 64140.500000 65.000000 41.000000
50% 79571.000000 66.230000 49.000000
75% 87983.500000 70.060000 55.500000
max 96396.000000 73.890000 62.000000
# unique
print(df.sex.unique())
['male' 'female']
# 기본 연산
print(df.sum(axis=0))
print(df.mean(axis=1))
earn 224677
height 203.89
sex malefemalefemale
race whitewhiteblack
age 144
dtype: object
0 26564.630000
1 32174.743333
2 16268.923333
dtype: float64
# isnull
print(df.isnull())
print(df.isnull().sum())
earn height sex race age
0 False False False False False
1 False False False False False
2 False False False False False
earn 0
height 0
sex 0
race 0
age 0
dtype: int64
# sort_values
print(df.sort_values(["age", "earn"], ascending=True))
earn height sex race age
2 48710 63.77 female black 33
0 79571 73.89 male white 49
1 96396 66.23 female white 62
# correlation & covariance
print(df.age.corr(df.earn))
print(df.age.cov(df.earn))
print(df.corrwith(df.earn))
print(df.corr())
0.9941006402346648
349232.5
earn 1.000000
height 0.392674
age 0.994101
dtype: float64
earn height age
earn 1.000000 0.392674 0.994101
height 0.392674 1.000000 0.290608
age 0.994101 0.290608 1.000000
###
2. Pandas II
groupby I
- split -> apply -> combine
df = {"Points": [876,789,863,673,741],
"Rank": [1,2,2,3,3],
"Team": ["Riders", "Riders", "Devils", "Devils", "Kings"],
"Year": [2014, 2015, 2014, 2015, 2014]}
df = pd.DataFrame(df)
print(df)
Points Rank Team Year
0 876 1 Riders 2014
1 789 2 Riders 2015
2 863 2 Devils 2014
3 673 3 Devils 2015
4 741 3 Kings 2014
print(df.groupby("Team"))
print(pd.DataFrame(df.groupby("Team")))
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EEA6650F40>
0 1
0 Devils Points Rank Team Year
2 863 2 ...
1 Kings Points Rank Team Year
4 741 3 ...
2 Riders Points Rank Team Year
0 876 1 ...
print(df.groupby("Team")["Points"])
print(pd.Series(df.groupby("Team")["Points"]))
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001EEA6647D60>
0 (Devils, [863, 673])
1 (Kings, [741])
2 (Riders, [876, 789])
dtype: object
df.groupby("Team")["Points"].sum() # 묶을 기준, 연산을 적용할 컬럼, 적용할 연산
Team
Devils 1536
Kings 741
Riders 1665
Name: Points, dtype: int64
df.groupby(["Team", "Year"])["Points"].sum()
Team Year
Devils 2014 863
2015 673
Kings 2014 741
Riders 2014 876
2015 789
Name: Points, dtype: int64
df.groupby(["Year", "Team"])["Points"].sum()
Year Team
2014 Devils 863
Kings 741
Riders 876
2015 Devils 673
Riders 789
Name: Points, dtype: int64
grouped_df = df.groupby(["Team", "Year"])["Points"].sum()
grouped_df.index
MultiIndex([('Devils', 2014),
('Devils', 2015),
( 'Kings', 2014),
('Riders', 2014),
('Riders', 2015)],
names=['Team', 'Year'])
# 인덱싱
grouped_df["Devils":"Kings"]
Team Year
Devils 2014 863
2015 673
Kings 2014 741
Name: Points, dtype: int64
# 묶여진 데이터를 matrix 형태로 전환
grouped_df.unstack()
Year | 2014 | 2015 |
---|---|---|
Team | ||
Devils | 863.0 | 673.0 |
Kings | 741.0 | NaN |
Riders | 876.0 | 789.0 |
# 인덱스 변경
grouped_df.swaplevel()
Year Team
2014 Devils 863
2015 Devils 673
2014 Kings 741
Riders 876
2015 Riders 789
Name: Points, dtype: int64
print(grouped_df.sum(level=0))
print(grouped_df.sum(level=1))
Team
Devils 1536
Kings 741
Riders 1665
Name: Points, dtype: int64
Year
2014 2480
2015 1462
Name: Points, dtype: int64
groupby II
grouped_df = df.groupby("Team")
for name, group in grouped_df:
print(name)
print(group)
Devils
Points Rank Team Year
2 863 2 Devils 2014
3 673 3 Devils 2015
Kings
Points Rank Team Year
4 741 3 Kings 2014
Riders
Points Rank Team Year
0 876 1 Riders 2014
1 789 2 Riders 2015
grouped_df.get_group("Devils")
# grouped_df["Devils"] -> 컬럼 추출 용도(에러 발생)
Points | Rank | Team | Year | |
---|---|---|---|---|
2 | 863 | 2 | Devils | 2014 |
3 | 673 | 3 | Devils | 2015 |
- 추출된 group 정보에는 3가지 유형의 apply가 가능함
- Aggregation: 요약된 통계정보를 추출
- Transformation: 해당 정보를 변환
- Filtraion: 특정 정보를 제거하여 보여주는 필터링 기능
# Aggregation
print(grouped_df.agg(sum))
import numpy as np
print(grouped_df.agg(np.mean))
print(grouped_df["Points"].agg([np.sum, np.mean, np.std]))
Points Rank Year
Team
Devils 1536 5 4029
Kings 741 3 2014
Riders 1665 3 4029
Points Rank Year
Team
Devils 768.0 2.5 2014.5
Kings 741.0 3.0 2014.0
Riders 832.5 1.5 2014.5
sum mean std
Team
Devils 1536 768.0 134.350288
Kings 741 741.0 NaN
Riders 1665 832.5 61.518290
- transform 메서드는 key 값 별이 아닌 개별 데이터 별로 변환
# Transformation
score = lambda x: x * 2
grouped_df.transform(score)
Points | Rank | Year | |
---|---|---|---|
0 | 1752 | 2 | 4028 |
1 | 1578 | 4 | 4030 |
2 | 1726 | 4 | 4028 |
3 | 1346 | 6 | 4030 |
4 | 1482 | 6 | 4028 |
- 단, max나 min 처럼 Series 데이터에 적용되는 연산들은 key값을 기준으로 적용
score = lambda x: max(x)
grouped_df.transform(score)
Points | Rank | Year | |
---|---|---|---|
0 | 876 | 2 | 2015 |
1 | 876 | 2 | 2015 |
2 | 863 | 3 | 2015 |
3 | 863 | 3 | 2015 |
4 | 741 | 3 | 2014 |
# Filtration
grouped_df.filter(lambda x: len(x) >= 2) # 데이터가 2개 이상인 그룹
Points | Rank | Year | |
---|---|---|---|
0 | 876 | 1 | 2014 |
1 | 789 | 2 | 2015 |
2 | 863 | 2 | 2014 |
3 | 673 | 3 | 2015 |
grouped_df.filter(lambda x: x["Points"].sum() > 1600)
Points | Rank | Year | |
---|---|---|---|
0 | 876 | 1 | 2014 |
1 | 789 | 2 | 2015 |
Pivot table & Crosstab
- Excel에서 사용
- Index 축은 groupby와 동일
- column에 추가로 labeling 값을 추가하여,
- value에 numeric type 값을 aggregation 하는 형태
import dateutil
# Pivot table
df_phone = pd.read_csv("./phone_data.csv")
df_phone['date'] = df_phone['date'].apply(dateutil.parser.parse, dayfirst=True)
df_phone.head()
index | date | duration | item | month | network | network_type | |
---|---|---|---|---|---|---|---|
0 | 0 | 2014-10-15 06:58:00 | 34.429 | data | 2014-11 | data | data |
1 | 1 | 2014-10-15 06:58:00 | 13.000 | call | 2014-11 | Vodafone | mobile |
2 | 2 | 2014-10-15 14:46:00 | 23.000 | call | 2014-11 | Meteor | mobile |
3 | 3 | 2014-10-15 14:48:00 | 4.000 | call | 2014-11 | Tesco | mobile |
4 | 4 | 2014-10-15 17:27:00 | 4.000 | call | 2014-11 | Tesco | mobile |
df_phone.pivot_table(["duration"],
index=[df_phone.month, df_phone.item],
columns=df_phone.network,
aggfunc="sum",
fill_value=0)
duration | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
network | Meteor | Tesco | Three | Vodafone | data | landline | special | voicemail | world | |
month | item | |||||||||
2014-11 | call | 1521 | 4045 | 12458 | 4316 | 0.000 | 2906 | 0 | 301 | 0 |
data | 0 | 0 | 0 | 0 | 998.441 | 0 | 0 | 0 | 0 | |
sms | 10 | 3 | 25 | 55 | 0.000 | 0 | 1 | 0 | 0 | |
2014-12 | call | 2010 | 1819 | 6316 | 1302 | 0.000 | 1424 | 0 | 690 | 0 |
data | 0 | 0 | 0 | 0 | 1032.870 | 0 | 0 | 0 | 0 | |
sms | 12 | 1 | 13 | 18 | 0.000 | 0 | 0 | 0 | 4 | |
2015-01 | call | 2207 | 2904 | 6445 | 3626 | 0.000 | 1603 | 0 | 285 | 0 |
data | 0 | 0 | 0 | 0 | 1067.299 | 0 | 0 | 0 | 0 | |
sms | 10 | 3 | 33 | 40 | 0.000 | 0 | 0 | 0 | 0 | |
2015-02 | call | 1188 | 4087 | 6279 | 1864 | 0.000 | 730 | 0 | 268 | 0 |
data | 0 | 0 | 0 | 0 | 1067.299 | 0 | 0 | 0 | 0 | |
sms | 1 | 2 | 11 | 23 | 0.000 | 0 | 2 | 0 | 0 | |
2015-03 | call | 274 | 973 | 4966 | 3513 | 0.000 | 11770 | 0 | 231 | 0 |
data | 0 | 0 | 0 | 0 | 998.441 | 0 | 0 | 0 | 0 | |
sms | 0 | 4 | 5 | 13 | 0.000 | 0 | 0 | 0 | 3 |
Crosstab
- 두 컬럼의 교차 빈도, 비율, 덧셈 등을 구할 때 사용
- Pivot table의 특수한 형태
- User-item Rating Matrix 등을 만들 때 사용 가능
df_movie = pd.read_csv("./movie_rating.csv")
df_movie.head()
critic | title | rating | |
---|---|---|---|
0 | Jack Matthews | Lady in the Water | 3.0 |
1 | Jack Matthews | Snakes on a Plane | 4.0 |
2 | Jack Matthews | You Me and Dupree | 3.5 |
3 | Jack Matthews | Superman Returns | 5.0 |
4 | Jack Matthews | The Night Listener | 3.0 |
pd.crosstab(index=df_movie.critic,
columns=df_movie.title,
values=df_movie.rating,
aggfunc="first").fillna(0)
title | Just My Luck | Lady in the Water | Snakes on a Plane | Superman Returns | The Night Listener | You Me and Dupree |
---|---|---|---|---|---|---|
critic | ||||||
Claudia Puig | 3.0 | 0.0 | 3.5 | 4.0 | 4.5 | 2.5 |
Gene Seymour | 1.5 | 3.0 | 3.5 | 5.0 | 3.0 | 3.5 |
Jack Matthews | 0.0 | 3.0 | 4.0 | 5.0 | 3.0 | 3.5 |
Lisa Rose | 3.0 | 2.5 | 3.5 | 3.5 | 3.0 | 2.5 |
Mick LaSalle | 2.0 | 3.0 | 4.0 | 3.0 | 3.0 | 2.0 |
Toby | 0.0 | 0.0 | 4.5 | 4.0 | 0.0 | 1.0 |
df_movie.pivot_table(["rating"],
index=df_movie.critic,
columns=df_movie.title,
aggfunc="first", fill_value=0)
rating | ||||||
---|---|---|---|---|---|---|
title | Just My Luck | Lady in the Water | Snakes on a Plane | Superman Returns | The Night Listener | You Me and Dupree |
critic | ||||||
Claudia Puig | 3.0 | 0.0 | 3.5 | 4.0 | 4.5 | 2.5 |
Gene Seymour | 1.5 | 3.0 | 3.5 | 5.0 | 3.0 | 3.5 |
Jack Matthews | 0.0 | 3.0 | 4.0 | 5.0 | 3.0 | 3.5 |
Lisa Rose | 3.0 | 2.5 | 3.5 | 3.5 | 3.0 | 2.5 |
Mick LaSalle | 2.0 | 3.0 | 4.0 | 3.0 | 3.0 | 2.0 |
Toby | 0.0 | 0.0 | 4.5 | 4.0 | 0.0 | 1.0 |
Merge & Concat
merge
df_a = pd.DataFrame({"subject_id": [1,2,3,4,5,6],
"test_score": [51,15,15,61,16,14]})
df_b = pd.DataFrame({"subject_id":[4,5,6,7,8],
"first_name":["Billy", "Brian", "Bran", "Bryce", "Betty"],
"last_name":["Bonder", "Black", "Balwner", "Brice", "Btisan"]})
pd.merge(df_a, df_b, on="subject_id")
# 두 데이터프레임의 컬럼명이 다를 경우
pd.merge(df_a, df_b, left_on="subject_id", right_on="subject_id")
subject_id | test_score | first_name | last_name | |
---|---|---|---|---|
0 | 4 | 61 | Billy | Bonder |
1 | 5 | 16 | Brian | Black |
2 | 6 | 14 | Bran | Balwner |
print(pd.merge(df_a, df_b, on="subject_id", how="left")) # left join
print(pd.merge(df_a, df_b, on="subject_id", how="right")) # right join
print(pd.merge(df_a, df_b, on="subject_id", how="outer")) # full join
print(pd.merge(df_a, df_b, on="subject_id", how="inner")) # inner join
subject_id test_score first_name last_name
0 1 51 NaN NaN
1 2 15 NaN NaN
2 3 15 NaN NaN
3 4 61 Billy Bonder
4 5 16 Brian Black
5 6 14 Bran Balwner
subject_id test_score first_name last_name
0 4 61.0 Billy Bonder
1 5 16.0 Brian Black
2 6 14.0 Bran Balwner
3 7 NaN Bryce Brice
4 8 NaN Betty Btisan
subject_id test_score first_name last_name
0 1 51.0 NaN NaN
1 2 15.0 NaN NaN
2 3 15.0 NaN NaN
3 4 61.0 Billy Bonder
4 5 16.0 Brian Black
5 6 14.0 Bran Balwner
6 7 NaN Bryce Brice
7 8 NaN Betty Btisan
subject_id test_score first_name last_name
0 4 61 Billy Bonder
1 5 16 Brian Black
2 6 14 Bran Balwner
# index based join
pd.merge(df_a, df_b, right_index=True, left_index=True)
subject_id_x | test_score | subject_id_y | first_name | last_name | |
---|---|---|---|---|---|
0 | 1 | 51 | 4 | Billy | Bonder |
1 | 2 | 15 | 5 | Brian | Black |
2 | 3 | 15 | 6 | Bran | Balwner |
3 | 4 | 61 | 7 | Bryce | Brice |
4 | 5 | 16 | 8 | Betty | Btisan |
concat
print(pd.concat([df_a, df_b]))
print(pd.concat([df_a, df_b], axis=1))
subject_id test_score first_name last_name
0 1 51.0 NaN NaN
1 2 15.0 NaN NaN
2 3 15.0 NaN NaN
3 4 61.0 NaN NaN
4 5 16.0 NaN NaN
5 6 14.0 NaN NaN
0 4 NaN Billy Bonder
1 5 NaN Brian Black
2 6 NaN Bran Balwner
3 7 NaN Bryce Brice
4 8 NaN Betty Btisan
subject_id test_score subject_id first_name last_name
0 1 51 4.0 Billy Bonder
1 2 15 5.0 Brian Black
2 3 15 6.0 Bran Balwner
3 4 61 7.0 Bryce Brice
4 5 16 8.0 Betty Btisan
5 6 14 NaN NaN NaN
Persistence
- pickle persistence
- 가장 일반적인 python 파일 persistence
- to_pickle, read_pickle 함수 사용
df = pd.DataFrame({"Points": [876,789,863,673,741],
"Rank": [1,2,2,3,3],
"Team": ["Riders", "Riders", "Devils", "Devils", "Kings"],
"Year": [2014, 2015, 2014, 2015, 2014]})
print(df)
Points Rank Team Year
0 876 1 Riders 2014
1 789 2 Riders 2015
2 863 2 Devils 2014
3 673 3 Devils 2015
4 741 3 Kings 2014
df.to_pickle("./df_example.pickle")
df_pickle = pd.read_pickle("./df_example.pickle")
df_pickle
Points | Rank | Team | Year | |
---|---|---|---|---|
0 | 876 | 1 | Riders | 2014 |
1 | 789 | 2 | Riders | 2015 |
2 | 863 | 2 | Devils | 2014 |
3 | 673 | 3 | Devils | 2015 |
4 | 741 | 3 | Kings | 2014 |
Leave a comment