실습에 사용할 데이터 확인¶

# Widen the notebook container so wide DataFrames are easier to read.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))

import pandas as pd

# Sample data for the exercises: three columns, five rows.
# dtype=object keeps every column (including the integer ones) as `object`.
sample_columns = {
    'c1': ['a', 'a', 'b', 'a', 'b'],
    'c2': [1, 1, 1, 2, 2],
    'c3': [1, 1, 2, 2, 2],
}
df = pd.DataFrame(sample_columns, dtype=object)
df

데이터 정보 확인¶

# Dimensions as (rows, columns), followed by a blank separator line
print(df.shape, end='\n\n')

# Summary: index range, per-column non-null counts, dtypes and memory usage
df.info()

(5, 3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   c1      5 non-null      object
 1   c2      5 non-null      object
 2   c3      5 non-null      object
dtypes: object(3)
memory usage: 248.0+ bytes

특정 column을 index로 설정¶

# Use column 'c3' as the index. The result is not assigned, so `df` itself is left unchanged.
df.set_index('c3') # == df.set_index(['c3'])

기존의 DataFrame에서 index(행) 새롭게 넣기¶

# Reindex onto labels 0..6; labels 5 and 6 do not exist in df,
# so the rows created for them are filled with NaN
new_index = list(range(7))
df.reindex(new_index)

# Same reindex, but fill the newly created rows with 'hello' instead of NaN
df.reindex(new_index, fill_value='hello')

# Move the current index into an 'index' column and create a fresh default index
df.reset_index()

결측치¶

# Build a frame with missing values: labels 5 and 6 are not present in df,
# so reindexing adds rows for them that contain only NaN.
new_index = [0,1,2,3,4,5,6]
df2 = df.reindex(new_index)
df2

# 'c1' holds strings, which cannot be summed together with NaN. Older pandas
# silently dropped such columns from the result; pandas 2.0+ raises TypeError.
# Restrict the reduction to the numeric columns explicitly.
numeric_cols = ['c2', 'c3']

# Sum while skipping missing values (the default behaviour)
print(df2[numeric_cols].sum(axis=0, skipna=True))

# Sum without skipping missing values: any NaN makes the column total NaN
df2[numeric_cols].sum(axis=0, skipna=False)

c2    7
c3    8
dtype: int64

c2   NaN
c3   NaN
dtype: float64

# Detect missing values element-wise: True where the value is NaN, False otherwise.
# isnull() is an alias of isna(), so both calls return the same result.
df2.isna()
df2.isnull()

# Replace every missing value (NaN) with 0
df2.fillna(0)

# Forward fill: replace each NaN with the last valid value above it.
# ffill() is used because fillna(method='ffill') is deprecated (pandas 2.1+).
df2.ffill()

# Work on an independent copy: a plain `df3 = df2` only creates a second name
# for the same object, so the row added below would also be added to df2.
df3 = df2.copy()

# Append a row labelled 7 so the trailing NaN rows have a following value to copy
df3.loc[7] = [3,5,1]

# Backward fill: replace each NaN with the next valid value below it.
# bfill() is used because fillna(method='bfill') is deprecated (pandas 2.1+).
df3.bfill()

# Drop every row that contains at least one missing value (NaN)
df2.dropna()

# Overwrite specific cells in place: by position with iloc, by label with loc
df2.iloc[[5],[0]] = 1
df2.loc[[6],['c1']] = 2

# Drop every column that contains at least one NaN
df2.dropna(axis=1)

# Keep only the columns (axis=1) that have at least 3 non-NaN values (thresh=3);
# columns with fewer than 3 non-missing values are dropped.
df2.dropna(axis=1,thresh=3)

중복 데이터¶

# Flag rows that repeat an earlier row: True for a duplicate, False otherwise
dup_mask = df.duplicated()
print(dup_mask, end='\n\n')

# Number of duplicated rows
print(dup_mask.sum(), end='\n\n')

# Remove the duplicated rows
df4 = df.drop_duplicates()
df4

0    False
1     True
2    False
3    False
4    False
dtype: bool

1

데이터 개수 확인¶

# Number of non-missing entries in each column
df.count()

c1    5
c2    5
c3    5
dtype: int64

# Frequency of each distinct value in column c1
df.c1.value_counts()

# NOTE: DataFrame.value_counts() exists since pandas 1.1.0; df.value_counts() only raises an error on older versions

a    3
b    2
Name: c1, dtype: int64

통계 함수 적용¶

import seaborn as sns

# Load seaborn's 'titanic' example dataset
tat = sns.load_dataset('titanic')
tat

# Sum of the 'age' column; attribute access and bracket access select the same column
tat.age.sum()
tat['age'].sum()

21205.17

# Mean of the 'age' column (both access styles are equivalent)
tat.age.mean()
tat['age'].mean()

29.69911764705882

# Median of the 'age' column
tat.age.median()
tat['age'].median()

28.0

# Most frequent value(s) of the 'age' column; the result is a Series, not a scalar
tat.age.mode()
tat['age'].mode()

0    24.0
dtype: float64

# Minimum of the 'age' column
tat.age.min()
tat['age'].min()

0.42

# Maximum of the 'age' column
tat.age.max()
tat['age'].max()

80.0

# Standard deviation of the 'age' column
tat.age.std()
tat['age'].std()

14.526497332334044

# Pairwise correlation coefficients between columns. The dataset also contains
# text/categorical columns, and since pandas 2.0 corr() raises on them by
# default, so restrict the computation to numeric and boolean columns.
tat.corr(numeric_only=True)

# Mean of each numeric/boolean column (non-numeric columns are excluded for the same reason)
tat.mean(numeric_only=True)

survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

데이터 종류 확인¶

# Distinct values of the 'age' column, returned as an array (NaN is included)
tat.age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

# Number of distinct values in the 'age' column (NaN is not counted)
tat.age.nunique()

88

# Number of occurrences of each distinct 'age' value, most frequent first
tat.age.value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

pivot¶

# Count the rows for every (sex, class) combination: one row per sex, one column per class
tat.pivot_table(index='sex',columns='class',aggfunc='size')

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	0	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	0	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	0	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
886	0	2	male	27.0	0	0	13.0000	S	Second	man	True	NaN	Southampton	no	True
887	1	1	female	19.0	0	0	30.0000	S	First	woman	False	B	Southampton	yes	True
888	0	3	female	NaN	1	2	23.4500	S	Third	woman	False	NaN	Southampton	no	False
889	1	1	male	26.0	0	0	30.0000	C	First	man	True	C	Cherbourg	yes	True
890	0	3	male	32.0	0	0	7.7500	Q	Third	man	True	NaN	Queenstown	no	True

	survived	pclass	age	sibsp	parch	fare	adult_male	alone
survived	1.000000	-0.338481	-0.077221	-0.035322	0.081629	0.257307	-0.557080	-0.203367
pclass	-0.338481	1.000000	-0.369226	0.083081	0.018443	-0.549500	0.094035	0.135207
age	-0.077221	-0.369226	1.000000	-0.308247	-0.189119	0.096067	0.280328	0.198270
sibsp	-0.035322	0.083081	-0.308247	1.000000	0.414838	0.159651	-0.253586	-0.584471
parch	0.081629	0.018443	-0.189119	0.414838	1.000000	0.216225	-0.349943	-0.583398
fare	0.257307	-0.549500	0.096067	0.159651	0.216225	1.000000	-0.182024	-0.271832
adult_male	-0.557080	0.094035	0.280328	-0.253586	-0.349943	-0.182024	1.000000	0.404744
alone	-0.203367	0.135207	0.198270	-0.584471	-0.583398	-0.271832	0.404744	1.000000

class	First	Second	Third
sex
female	94	76	144
male	122	108	347

판다스 이해하기 - 분할, 더미변수, 문자형 날짜형 변환 (0)	2020.07.16
판다스 이해하기 - 조건문, concat, append, 그룹화, 함수적용, join (0)	2020.07.16
판다스 이해하기 - 시리즈, 데이터프레임 이해, 생성, loc, iloc (0)	2020.07.15
판다스 이해하기 - 참고 사이트 모음 (1)	2020.07.15
파일 읽고 쓰기 (0)	2020.06.15

공부하려고 만든 블로그

판다스 이해하기 - 데이터 정보 확인, 결측치, 중복 데이터, pivot

실습에 사용할 데이터 확인¶

데이터 정보 확인¶

특정 column을 index로 설정¶

기존의 DataFrame에서 index(행) 새롭게 넣기¶

결측치¶

중복 데이터¶

데이터 개수 확인¶

통계 함수 적용¶

데이터 종류 확인¶

pivot¶

'코딩 > Python' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	c1	c2	c3
0	a	1	1
1	a	1	1
2	b	1	2
3	a	2	2
4	b	2	2
5	hello	hello	hello
6	hello	hello	hello

	index	c1	c2	c3
0	0	a	1	1
1	1	a	1	1
2	2	b	1	2
3	3	a	2	2
4	4	b	2	2