320x100
# Reshaping DataFrames
## Transposing DataFrames
import pandas as pd
titanic = pd.read_csv('titanic.csv')
titanic.head()
'''
survived pclass sex age sibsp parch fare embarked deck
0 0 3 male 22.0 1 0 7.2500 S NaN
1 1 1 female 38.0 1 0 71.2833 C C
2 1 3 female 26.0 0 0 7.9250 S NaN
3 1 1 female 35.0 1 0 53.1000 S C
4 0 3 male 35.0 0 0 8.0500 S NaN
'''
# 처음보는 기능이다.
titanic.T
# 보아하니, 행과 열의 위치를 뒤바꿔 주는 역할을 하는 것 같다.
# 이거랑 T라고 하는거랑 똑같다.
# T는 transpose의 약자인것같다.
titanic.transpose()
titanic.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 deck 203 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB
'''
titanic = titanic.T
titanic.head()
'''
0 1 2 3 4 5 6 7 8 9 ... 881 882 883 884 885 886 887 888 889 890
survived 0 1 1 1 0 0 0 0 1 1 ... 0 0 0 0 0 0 1 0 1 0
pclass 3 1 3 1 3 3 1 3 3 2 ... 3 3 2 3 3 2 1 3 1 3
sex male female female female male male male male female female ... male female male male female male female female male male
age 22.0 38.0 26.0 35.0 35.0 NaN 54.0 2.0 27.0 14.0 ... 33.0 22.0 28.0 25.0 39.0 27.0 19.0 NaN 26.0 32.0
sibsp 1 1 0 1 0 0 0 3 0 1 ... 0 0 0 0 0 0 0 1 0 0
5 rows × 891 columns
'''
titanic.info()
'''
<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, survived to deck
Columns: 891 entries, 0 to 890
dtypes: object(891)
memory usage: 63.0+ KB
'''
titanic = titanic.T
titanic.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null object
1 pclass 891 non-null object
2 sex 891 non-null object
3 age 714 non-null object
4 sibsp 891 non-null object
5 parch 891 non-null object
6 fare 891 non-null object
7 embarked 889 non-null object
8 deck 203 non-null object
dtypes: object(9)
memory usage: 62.8+ KB
'''
# transpose를 이용해서 뒤집어주고, 다시 뒤집어준것 까지는 좋았는데
# 그 이후에 모든 데이터 타입이 object로 변환되었다.
titanic.survived.mean()
# 0.3838383838383838
# object인데도 작동이 되는게 신기하다.
titanic.groupby(['sex', 'pclass']).survived.mean()
'''
sex pclass
female 1 0.968085
2 0.921053
3 0.500000
male 1 0.368852
2 0.157407
3 0.135447
Name: survived, dtype: float64
'''
# 음... 이게 되네
# 원래는 데이터 타입의 문제때문에 이렇게 부터는 안되는 걸로 알고 있다.
# 데이터 타입때문에 다시 재설정
titanic = pd.read_csv('titanic.csv')
titanic.info()
titanic.groupby(['sex', 'pclass']).survived.mean()
'''
sex pclass
female 1 0.968085
2 0.921053
3 0.500000
male 1 0.368852
2 0.157407
3 0.135447
Name: survived, dtype: float64
'''
# 그리고 이 상태는 '무언가가 groupby에 의해서 쌓인 상태'이다.
# 그 '쌓여 있는 것'을 unstack으로 풀어줄 것이다.
titanic.groupby(['sex', 'pclass']).survived.mean().unstack()
'''
pclass 1 2 3
sex
female 0.968085 0.921053 0.500000
male 0.368852 0.157407 0.135447
'''
# 무언가로인해 설정도 안되어있고, 설정이 쌓여있는 것이 아니기 때문에
# 이 데이터 프레임은 '재구조화'를 진행하기에 좋다.
titanic.T.T.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null object
1 pclass 891 non-null object
2 sex 891 non-null object
3 age 714 non-null object
4 sibsp 891 non-null object
5 parch 891 non-null object
6 fare 891 non-null object
7 embarked 889 non-null object
8 deck 203 non-null object
dtypes: object(9)
memory usage: 62.8+ KB
'''
## Pivoting DataFrames with pivot()
import pandas as pd
table1 = pd.read_csv('table1.csv')
table1.head()
'''
Country Medal Count
0 AFG Bronze 2
1 AHO Silver 1
2 ALG Bronze 8
3 ALG Gold 5
4 ALG Silver 2
'''
table1.tail()
'''
Country Medal Count
342 ZIM Gold 18
343 ZIM Silver 4
344 ZZX Bronze 10
345 ZZX Gold 23
346 ZZX Silver 15
'''
table1.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 347 non-null object
1 Medal 347 non-null object
2 Count 347 non-null int64
dtypes: int64(1), object(2)
memory usage: 8.3+ KB
'''
table1.shape
# (347, 3)
table1.pivot(index = 'Country', columns = 'Medal', values = 'Count')
'''
Medal Bronze Gold Silver
Country
AFG 2.0 NaN NaN
AHO NaN NaN 1.0
ALG 8.0 5.0 2.0
ANZ 5.0 20.0 4.0
ARG 91.0 69.0 99.0
... ... ... ...
VIE NaN NaN 2.0
YUG 118.0 143.0 174.0
ZAM 1.0 NaN 1.0
ZIM 1.0 18.0 4.0
ZZX 10.0 23.0 15.0
147 rows × 3 columns
'''
table1_piv = table1.pivot(index = 'Country', columns = 'Medal', values = 'Count').fillna(0)
table1_piv.shape
# (147, 3)
# 기존의 300개가 넘던 행을 1/3수준으로 줄였다.
# 그게 가능한 이유는 pivot을 통해서 medal을 컬럼으로 잡아주었기 때문이다.
table1_piv.info()
'''
<class 'pandas.core.frame.DataFrame'>
Index: 147 entries, AFG to ZZX
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Bronze 147 non-null float64
1 Gold 147 non-null float64
2 Silver 147 non-null float64
dtypes: float64(3)
memory usage: 4.6+ KB
'''
table1.set_index(['Country', 'Medal']).unstack(fill_value = 0)
'''
Count
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
table1.set_index(['Country', 'Medal']).unstack(fill_value = 0)
'''
Count
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
table1_piv
'''
Medal Bronze Gold Silver
Country
AFG 2.0 0.0 0.0
AHO 0.0 0.0 1.0
ALG 8.0 5.0 2.0
ANZ 5.0 20.0 4.0
ARG 91.0 69.0 99.0
... ... ... ...
VIE 0.0 0.0 2.0
YUG 118.0 143.0 174.0
ZAM 1.0 0.0 1.0
ZIM 1.0 18.0 4.0
ZZX 10.0 23.0 15.0
147 rows × 3 columns
'''
# 결과적으로 위의 set_index 처리와 pivot처리 된 것은 '실질적 값'에 있어서는 완전히 같다.
# 다만, count의 표현, int가 float으로 된다든지 표현에 있어서는 다르다.
## Limits of pivot()
import pandas as pd
table2 = pd.read_csv('table2.csv')
table2.head(20)
'''
Country Year Medal Count
0 AFG 2008 Bronze 1
1 AFG 2012 Bronze 1
2 AHO 1988 Silver 1
3 ALG 1984 Bronze 2
4 ALG 1992 Bronze 1
5 ALG 1992 Gold 1
6 ALG 1996 Bronze 1
7 ALG 1996 Gold 2
8 ALG 2000 Bronze 3
9 ALG 2000 Gold 1
10 ALG 2000 Silver 1
11 ALG 2008 Bronze 1
12 ALG 2008 Silver 1
13 ALG 2012 Gold 1
14 ANZ 1908 Bronze 2
15 ANZ 1908 Gold 15
16 ANZ 1908 Silver 2
17 ANZ 1912 Bronze 3
18 ANZ 1912 Gold 5
19 ANZ 1912 Silver 2
'''
table2.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 2547 non-null object
1 Year 2547 non-null int64
2 Medal 2547 non-null object
3 Count 2547 non-null int64
dtypes: int64(2), object(2)
memory usage: 79.7+ KB
'''
table2.shape
# (2547, 4)
table2.pivot(index = 'Country', columns = 'Medal', values = 'Count')
# ValueError: Index contains duplicate entries, cannot reshape
# 복제하고자 하는데 전체적으로 모양을 다시 잡을 수 없다. 는 메세지가 뜬다.
# 중복되는 값들이 많아서인데, 이번 케이스에 한에서 연도 열 때문에 문제가 생기는 것이다.
# pivot은 멀티인덱스에 적합하지 않다.
table2.groupby(['Country', 'Year', 'Medal']).Count.sum()
'''
Country Year Medal
AFG 2008 Bronze 1
2012 Bronze 1
AHO 1988 Silver 1
ALG 1984 Bronze 2
1992 Bronze 1
..
ZZX 1900 Bronze 8
Gold 18
Silver 8
1904 Gold 3
Silver 5
Name: Count, Length: 2547, dtype: int64
'''
table2.groupby(['Country', 'Year', 'Medal']).Count.sum().unstack().fillna(0)
'''
Medal Bronze Gold Silver
Country Year
AFG 2008 1.0 0.0 0.0
2012 1.0 0.0 0.0
AHO 1988 0.0 0.0 1.0
ALG 1984 2.0 0.0 0.0
1992 1.0 1.0 0.0
... ... ... ... ...
ZIM 2004 1.0 1.0 1.0
2008 0.0 1.0 3.0
ZZX 1896 2.0 2.0 2.0
1900 8.0 18.0 8.0
1904 0.0 3.0 5.0
1157 rows × 3 columns
'''
## pivot_table()(Part1)
import pandas as pd
table2 = pd.read_csv('table2.csv')
table2.head(10)
'''
Country Year Medal Count
0 AFG 2008 Bronze 1
1 AFG 2012 Bronze 1
2 AHO 1988 Silver 1
3 ALG 1984 Bronze 2
4 ALG 1992 Bronze 1
5 ALG 1992 Gold 1
6 ALG 1996 Bronze 1
7 ALG 1996 Gold 2
8 ALG 2000 Bronze 3
9 ALG 2000 Gold 1
'''
table2.pivot_table(index = 'Country', columns = 'Medal', values = 'Count')
'''
Medal Bronze Gold Silver
Country
AFG 1.000000 NaN NaN
AHO NaN NaN 1.000000
ALG 1.600000 1.250000 1.000000
ANZ 2.500000 10.000000 2.000000
ARG 6.066667 7.666667 7.071429
... ... ... ...
VIE NaN NaN 1.000000
YUG 11.800000 11.000000 11.600000
ZAM 1.000000 NaN 1.000000
ZIM 1.000000 6.000000 2.000000
ZZX 5.000000 7.666667 5.000000
147 rows × 3 columns
'''
table2.groupby(['Country', 'Medal']).Count.sum().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
agg = table2.groupby(['Country', 'Medal']).Count.sum().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
agg['All'] = agg.sum(axis = 1)
# transform 처럼 각 행에 대해서 연산을 해준다.
agg.loc['All'] = agg.sum(axis = 0)
'''
Medal
Bronze 10369
Gold 10484
Silver 10308
All 31161
dtype: int64
'''
table2.pivot_table(index = 'Country', columns = 'Medal', values = 'Count', aggfunc = 'mean')
'''
Medal Bronze Gold Silver
Country
AFG 1.000000 NaN NaN
AHO NaN NaN 1.000000
ALG 1.600000 1.250000 1.000000
ANZ 2.500000 10.000000 2.000000
ARG 6.066667 7.666667 7.071429
... ... ... ...
VIE NaN NaN 1.000000
YUG 11.800000 11.000000 11.600000
ZAM 1.000000 NaN 1.000000
ZIM 1.000000 6.000000 2.000000
ZZX 5.000000 7.666667 5.000000
147 rows × 3 columns
'''
table2.pivot_table(index = 'Country', columns = 'Medal', values = 'Count', aggfunc = 'sum', fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
table2.groupby(['Country', 'Medal']).Count.sum()
'''
Country Medal
AFG Bronze 2
AHO Silver 1
ALG Bronze 8
Gold 5
Silver 2
..
ZIM Gold 18
Silver 4
ZZX Bronze 10
Gold 23
Silver 15
Name: Count, Length: 347, dtype: int64
'''
table2.groupby(['Country','Medal']).Count.sum().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
# 결측값에 0을 채워 넣어주었다.
table2.pivot_table(index = 'Country', columns = 'Medal', values = 'Count',
aggfunc = 'sum', fill_value = 0, margins = True)
'''
Medal Bronze Gold Silver All
Country
AFG 2 0 0 2
AHO 0 0 1 1
ALG 8 5 2 15
ANZ 5 20 4 29
ARG 91 69 99 259
... ... ... ... ...
YUG 118 143 174 435
ZAM 1 0 1 2
ZIM 1 18 4 23
ZZX 10 23 15 48
All 10369 10484 10308 31161
148 rows × 4 columns
'''
table2.groupby(['Country', 'Medal']).Count.sum().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
agg = table2.groupby(['Country', 'Medal']).Count.sum().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Country
AFG 2 0 0
AHO 0 0 1
ALG 8 5 2
ANZ 5 20 4
ARG 91 69 99
... ... ... ...
VIE 0 0 2
YUG 118 143 174
ZAM 1 0 1
ZIM 1 18 4
ZZX 10 23 15
147 rows × 3 columns
'''
agg['All'] = agg.sum(axis = 1)
agg.head()
'''
Medal Bronze Gold Silver All
Country
AFG 2 0 0 4
AHO 0 0 1 2
ALG 8 5 2 30
ANZ 5 20 4 58
ARG 91 69 99 518
'''
agg.loc['All'] = agg.sum(axis = 0)
agg
'''
Medal Bronze Gold Silver All
Country
AFG 2 0 0 4
AHO 0 0 1 2
ALG 8 5 2 30
ANZ 5 20 4 58
ARG 91 69 99 518
... ... ... ... ...
YUG 118 143 174 870
ZAM 1 0 1 4
ZIM 1 18 4 46
ZZX 10 23 15 96
All 20738 20968 20616 124644
148 rows × 4 columns
'''
# margins_name 으로 인해서 마지막 pivot으로 설정된 컬럼의 이름을 바꾸어 주었다.
table2.pivot_table(index = 'Country', columns = 'Medal', values = 'Count', aggfunc = 'sum', fill_value = 0, margins = True, margins_name = 'Total')
'''
Medal Bronze Gold Silver Total
Country
AFG 2 0 0 2
AHO 0 0 1 1
ALG 8 5 2 15
ANZ 5 20 4 29
ARG 91 69 99 259
... ... ... ... ...
YUG 118 143 174 435
ZAM 1 0 1 2
ZIM 1 18 4 23
ZZX 10 23 15 48
Total 10369 10484 10308 31161
148 rows × 4 columns
'''
## pd.crosstab()
import pandas as pd
titanic = pd.read_csv('titanic.csv')
titanic.head()
'''
survived pclass sex age sibsp parch fare embarked deck
0 0 3 male 22.0 1 0 7.2500 S NaN
1 1 1 female 38.0 1 0 71.2833 C C
2 1 3 female 26.0 0 0 7.9250 S NaN
3 1 1 female 35.0 1 0 53.1000 S C
4 0 3 male 35.0 0 0 8.0500 S NaN
'''
titanic.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 deck 203 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB
'''
# 두 인자의 교차표와 빈도표
pd.crosstab(titanic.sex, titanic.pclass)
'''
pclass 1 2 3
sex
female 94 76 144
male 122 108 347
'''
pd.crosstab(titanic.sex, titanic.pclass, margins = True, normalize = True)
'''
pclass 1 2 3 All
sex
female 0.105499 0.085297 0.161616 0.352413
male 0.136925 0.121212 0.389450 0.647587
All 0.242424 0.206510 0.551066 1.000000
'''
titanic.groupby(['sex', 'pclass']).pclass.count()
'''
sex pclass
female 1 94
2 76
3 144
male 1 122
2 108
3 347
Name: pclass, dtype: int64
'''
titanic.groupby(['sex', 'pclass']).pclass.count().unstack()
'''
pclass 1 2 3
sex
female 94 76 144
male 122 108 347
'''
# crosstab 과 pivot_table의 기능은 유사하다고 한다.
# 다만 crosstab이 더 좋다고 한다.
pd.crosstab(index= titanic.sex, columns = titanic.pclass, values = titanic.age, aggfunc = 'mean')
'''
pclass 1 2 3
sex
female 34.611765 28.722973 21.750000
male 41.281386 30.740707 26.507589
'''
pd.crosstab(index = titanic.sex, columns = titanic.pclass, values = titanic.fare, aggfunc = 'sum')
'''
pclass 1 2 3
sex
female 9975.8250 1669.7292 2321.1086
male 8201.5875 2132.1125 4393.5865
'''
# crosstab의 합계 컬럼 생성
# 절대 수치 얻기
pd.crosstab(index = titanic.sex, columns = titanic.pclass, values = titanic.fare, aggfunc = 'sum', margins = True)
'''
pclass 1 2 3 All
sex
female 9975.8250 1669.7292 2321.1086 13966.6628
male 8201.5875 2132.1125 4393.5865 14727.2865
All 18177.4125 3801.8417 6714.6951 28693.9493
'''
# 상대 수치 얻기
pd.crosstab(index = titanic.sex, columns = titanic.pclass,
values = titanic.fare, aggfunc = 'sum', margins = True, normalize = True)
'''
pclass 1 2 3 All
sex
female 0.347663 0.058191 0.080892 0.486746
male 0.285830 0.074305 0.153119 0.513254
All 0.633493 0.132496 0.234011 1.000000
'''
summer = pd.read_csv('summer.csv')
summer.head()
'''
Year City Sport Discipline Athlete Country Gender Event Medal
0 1896 Athens Aquatics Swimming HAJOS, Alfred HUN Men 100M Freestyle Gold
1 1896 Athens Aquatics Swimming HERSCHMANN, Otto AUT Men 100M Freestyle Silver
2 1896 Athens Aquatics Swimming DRIVAS, Dimitrios GRE Men 100M Freestyle For Sailors Bronze
3 1896 Athens Aquatics Swimming MALOKINIS, Ioannis GRE Men 100M Freestyle For Sailors Gold
4 1896 Athens Aquatics Swimming CHASAPIS, Spiridon GRE Men 100M Freestyle For Sailors Silver
'''
summer.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31165 entries, 0 to 31164
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Year 31165 non-null int64
1 City 31165 non-null object
2 Sport 31165 non-null object
3 Discipline 31165 non-null object
4 Athlete 31165 non-null object
5 Country 31161 non-null object
6 Gender 31165 non-null object
7 Event 31165 non-null object
8 Medal 31165 non-null object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB
'''
# 표만들기 방법 1 - by crosstab
pd.crosstab(index = [summer.Year, summer.Country], columns = summer.Medal, aggfunc = 'count',
values = 'Athlete', margins = True).fillna(0)
'''
Medal Bronze Gold Silver All
Year Country
1896 AUS 0.0 2.0 0.0 2
AUT 2.0 2.0 1.0 5
DEN 3.0 1.0 2.0 6
FRA 2.0 5.0 4.0 11
GBR 2.0 2.0 3.0 7
... ... ... ... ... ...
2012 UKR 12.0 9.0 4.0 25
USA 46.0 147.0 57.0 250
UZB 2.0 1.0 0.0 3
VEN 0.0 1.0 0.0 1
All 10369.0 10484.0 10308.0 31161
1158 rows × 4 columns
'''
# 표만들기 방법 2 - by pivot_table
summer.pivot_table(index = ['Year', 'Country'], columns = 'Medal', aggfunc = 'count',
values = 'Athlete', fill_value = 0, margins = True)
'''
Medal Bronze Gold Silver All
Year Country
1896 AUS 0 2 0 2
AUT 2 2 1 5
DEN 3 1 2 6
FRA 2 5 4 11
GBR 2 2 3 7
... ... ... ... ... ...
2012 UKR 12 9 4 25
USA 46 147 57 250
UZB 2 1 0 3
VEN 0 1 0 1
All 10369 10484 10308 31161
1158 rows × 4 columns
'''
# All은 margins로 부터 비롯된다.
# 표만들기 방법3 - groupby
# 근데 All은 없다.
summer.groupby(['Year', 'Country', 'Medal']).Medal.count().unstack(fill_value = 0)
'''
Medal Bronze Gold Silver
Year Country
1896 AUS 0 2 0
AUT 2 2 1
DEN 3 1 2
FRA 2 5 4
GBR 2 2 3
... ... ... ... ...
2012 UGA 0 1 0
UKR 12 9 4
USA 46 147 57
UZB 2 1 0
VEN 0 1 0
1157 rows × 3 columns
'''
## Melting DataFrames with melt()
- melt는 pivot의 반대되는 것이라고 보면 된다고 한다.
import pandas as pd
table_2012 = pd.read_csv('table_2012.csv')
table_2012
'''
Country Gold Silver Bronze
0 USA 147 57 46
1 CHN 56 44 28
2 GBR 48 30 48
3 RUS 47 32 51
4 GER 45 28 21
5 FRA 30 30 22
6 NED 21 29 19
7 AUS 19 36 59
8 KOR 18 13 32
9 MEX 18 5 3
10 ITA 16 22 30
11 NOR 15 1 1
12 CRO 15 4 16
13 BRA 14 34 11
14 HUN 12 8 6
15 NZL 9 3 14
16 UKR 9 4 12
17 JAM 8 9 8
18 JPN 7 44 33
19 RSA 6 2 1
'''
table_2012.melt(id_vars = 'Country', value_vars = ['Gold', 'Silver', 'Bronze'])
'''
Country variable value
0 USA Gold 147
1 CHN Gold 56
2 GBR Gold 48
3 RUS Gold 47
4 GER Gold 45
5 FRA Gold 30
6 NED Gold 21
7 AUS Gold 19
8 KOR Gold 18
9 MEX Gold 18
10 ITA Gold 16
11 NOR Gold 15
12 CRO Gold 15
13 BRA Gold 14
14 HUN Gold 12
15 NZL Gold 9
16 UKR Gold 9
17 JAM Gold 8
18 JPN Gold 7
19 RSA Gold 6
20 USA Silver 57
21 CHN Silver 44
22 GBR Silver 30
23 RUS Silver 32
24 GER Silver 28
25 FRA Silver 30
26 NED Silver 29
27 AUS Silver 36
28 KOR Silver 13
29 MEX Silver 5
30 ITA Silver 22
31 NOR Silver 1
32 CRO Silver 4
33 BRA Silver 34
34 HUN Silver 8
35 NZL Silver 3
36 UKR Silver 4
37 JAM Silver 9
38 JPN Silver 44
39 RSA Silver 2
40 USA Bronze 46
41 CHN Bronze 28
42 GBR Bronze 48
43 RUS Bronze 51
44 GER Bronze 21
45 FRA Bronze 22
46 NED Bronze 19
47 AUS Bronze 59
48 KOR Bronze 32
49 MEX Bronze 3
50 ITA Bronze 30
51 NOR Bronze 1
52 CRO Bronze 16
53 BRA Bronze 11
54 HUN Bronze 6
55 NZL Bronze 14
56 UKR Bronze 12
57 JAM Bronze 8
58 JPN Bronze 33
59 RSA Bronze 1
'''
melt_2012 = table_2012.melt(id_vars = 'Country', value_vars = ['Gold', 'Silver', 'Bronze'],
var_name = 'Medal', value_name = 'Count')
melt_2012.pivot(index = 'Country', columns = 'Medal', values = 'Count').sort_values('Gold', ascending=False)
'''
Medal Bronze Gold Silver
Country
USA 46 147 57
CHN 28 56 44
GBR 48 48 30
RUS 51 47 32
GER 21 45 28
FRA 22 30 30
NED 19 21 29
AUS 59 19 36
KOR 32 18 13
MEX 3 18 5
ITA 30 16 22
NOR 1 15 1
CRO 16 15 4
BRA 11 14 34
HUN 6 12 8
NZL 14 9 3
UKR 12 9 4
JAM 8 8 9
JPN 33 7 44
RSA 1 6 2
'''
table_2012.shape
# (20, 4)
table_2012.melt(id_vars = 'Country', value_vars = ['Gold', 'Silver', 'Bronze'])
'''
Country variable value
0 USA Gold 147
1 CHN Gold 56
2 GBR Gold 48
3 RUS Gold 47
4 GER Gold 45
5 FRA Gold 30
6 NED Gold 21
7 AUS Gold 19
8 KOR Gold 18
9 MEX Gold 18
10 ITA Gold 16
11 NOR Gold 15
12 CRO Gold 15
13 BRA Gold 14
14 HUN Gold 12
15 NZL Gold 9
16 UKR Gold 9
17 JAM Gold 8
18 JPN Gold 7
19 RSA Gold 6
20 USA Silver 57
21 CHN Silver 44
22 GBR Silver 30
23 RUS Silver 32
24 GER Silver 28
25 FRA Silver 30
26 NED Silver 29
27 AUS Silver 36
28 KOR Silver 13
29 MEX Silver 5
30 ITA Silver 22
31 NOR Silver 1
32 CRO Silver 4
33 BRA Silver 34
34 HUN Silver 8
35 NZL Silver 3
36 UKR Silver 4
37 JAM Silver 9
38 JPN Silver 44
39 RSA Silver 2
40 USA Bronze 46
41 CHN Bronze 28
42 GBR Bronze 48
43 RUS Bronze 51
44 GER Bronze 21
45 FRA Bronze 22
46 NED Bronze 19
47 AUS Bronze 59
48 KOR Bronze 32
49 MEX Bronze 3
50 ITA Bronze 30
51 NOR Bronze 1
52 CRO Bronze 16
53 BRA Bronze 11
54 HUN Bronze 6
55 NZL Bronze 14
56 UKR Bronze 12
57 JAM Bronze 8
58 JPN Bronze 33
59 RSA Bronze 1
'''
table_2012.melt(id_vars = 'Country', value_vars = ['Gold', 'Silver', 'Bronze'],
var_name = 'Medal', value_name = 'Count')
melt_2012.shape
# (60, 3)
melt_2012.pivot(index = 'Country', columns = 'Medal', values = 'Count').sort_values('Gold', ascending= False)
'''
Medal Bronze Gold Silver
Country
USA 46 147 57
CHN 28 56 44
GBR 48 48 30
RUS 51 47 32
GER 21 45 28
FRA 22 30 30
NED 19 21 29
AUS 59 19 36
KOR 32 18 13
MEX 3 18 5
ITA 30 16 22
NOR 1 15 1
CRO 16 15 4
BRA 11 14 34
HUN 6 12 8
NZL 14 9 3
UKR 12 9 4
JAM 8 8 9
JPN 33 7 44
RSA 1 6 2
'''
전체적으로 melt가 뭔지, pivot은 어떻게 쓰는건지, crosstab의 깊은 용도는 무엇인지
무엇하나 아는게 없다.
점점 모르는게 많이 쌓여가는 것같다.
공부를 어떻게 해야할까?
300x250
'개발일지 > 임시카테고리' 카테고리의 다른 글
SQL 문자형 함수, 숫자형 함수, 날짜형 함수, 변환형 함수 (0) | 2022.08.16 |
---|---|
pandas 판다스 틀린문제 10 crosstab, pivot_table (0) | 2022.08.15 |
프로젝트 두번째. 효율적인 여행을 위한 핀찍기 (0) | 2022.08.15 |
pandas 틀린문제 9, groupby, unstack, nlargest (0) | 2022.08.12 |
pandas 판다스 기초 19 groupby, relabeling, aggregation, transform, apply, stack (0) | 2022.08.12 |