Pandas Summary 3
In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))

Conditions

Using query() and DataFrame[[column]][condition]

In [12]:
import pandas as pd

emp = pd.read_csv("c:/data/emp3.csv")
emp
Out[12]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [14]:
emp.query('sal<=3000')
emp[emp.sal<=3000]
Out[14]:
index empno ename job mgr hiredate sal comm deptno
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [18]:
emp.query('job in ["SALESMAN"]')
emp[emp.job.isin(["SALESMAN"])]
Out[18]:
index empno ename job mgr hiredate sal comm deptno
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
In [20]:
emp.query('job not in ["SALESMAN"]')
emp[~emp.job.isin(['SALESMAN'])] # not in == ~
Out[20]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
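
Multiple conditions can be combined as well (a sketch using the same emp DataFrame): query() takes and/or keywords, while boolean indexing uses & / | with parentheses.

emp.query('deptno == 30 and sal >= 1500')
emp[(emp.deptno == 30) & (emp.sal >= 1500)]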

Converting to upper/lower case

In [34]:
emp[['ename']] = emp['ename'].str.lower()
emp
Out[34]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 king PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 blake MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 clark MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 jones MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 martin SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 allen SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 turner SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 james CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 ward SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 ford ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 smith CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 scott ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 adams CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 miller CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [37]:
emp[['ename']] = emp.ename.str.upper()
emp
Out[37]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [38]:
# Output as a Series
emp['ename']
Out[38]:
0       KING
1      BLAKE
2      CLARK
3      JONES
4     MARTIN
5      ALLEN
6     TURNER
7      JAMES
8       WARD
9       FORD
10     SMITH
11     SCOTT
12     ADAMS
13    MILLER
Name: ename, dtype: object
In [39]:
# Output as a DataFrame
emp[['ename']]
Out[39]:
ename
0 KING
1 BLAKE
2 CLARK
3 JONES
4 MARTIN
5 ALLEN
6 TURNER
7 JAMES
8 WARD
9 FORD
10 SMITH
11 SCOTT
12 ADAMS
13 MILLER
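
Other Series string methods follow the same pattern (a sketch with the same emp DataFrame):

emp['ename'].str.title()                # first letter upper, the rest lower
emp[emp['ename'].str.contains('AR')]    # rows whose name contains 'AR'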

Combining (concat, append)

In [103]:
import numpy as np

new = [15, 7321, 'DEWY', 'MANAGER', np.nan, '2020-07-16', 3000, np.nan, 50]

# index and columns must be passed as list-like objects
# Reference: https://freedata.tistory.com/53
new_df = pd.DataFrame(new,index=emp.columns,columns=['A']).T
new_df
Out[103]:
index empno ename job mgr hiredate sal comm deptno
A 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [104]:
# When a DataFrame is appended to a DataFrame, append and concat give the same result.
emp.append(new_df,ignore_index=True)
Out[104]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [107]:
pd.concat([emp,new_df],axis=0,ignore_index=False)
Out[107]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
A 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50

Appending a plain list produces a strange result, and so does a Series whose index does not match the columns...

In [91]:
new2 = pd.Series([15, 7321, 'DEWY', 'MANAGER', np.NaN, '2020-07-16', 3000, np.nan, 50])

emp.append(new2,ignore_index=True)
pd.concat([emp,new_df],axis=1)
Out[91]:
index empno ename job mgr hiredate sal comm deptno index empno ename job mgr hiredate sal comm deptno
0 1.0 7839.0 KING PRESIDENT NaN 1981-11-17 0:00 5000.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2.0 7698.0 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850.0 NaN 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3.0 7782.0 CLARK MANAGER 7839.0 1981-05-09 0:00 2450.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 4.0 7566.0 JONES MANAGER 7839.0 1981-04-01 0:00 2975.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5.0 7654.0 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250.0 1400.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 6.0 7499.0 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600.0 300.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 7.0 7844.0 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500.0 0.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 8.0 7900.0 JAMES CLERK 7698.0 1981-12-11 0:00 950.0 NaN 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 9.0 7521.0 WARD SALESMAN 7698.0 1981-02-23 0:00 1250.0 500.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 10.0 7902.0 FORD ANALYST 7566.0 1981-12-11 0:00 3000.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
10 11.0 7369.0 SMITH CLERK 7902.0 1980-12-09 0:00 800.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
11 12.0 7788.0 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 13.0 7876.0 ADAMS CLERK 7788.0 1983-01-15 0:00 1100.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 14.0 7934.0 MILLER CLERK 7782.0 1982-01-11 0:00 1300.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
15 NaN NaN NaN NaN NaN NaN NaN NaN NaN 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
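
To append a Series cleanly, give it the DataFrame's columns as its index so the values align by label rather than by position (a sketch using the same append API as above; new3 is just an illustrative name):

new3 = pd.Series([15, 7321, 'DEWY', 'MANAGER', np.nan, '2020-07-16', 3000, np.nan, 50],
                 index=emp.columns)
emp.append(new3, ignore_index=True)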

Grouping (groupby)

In [116]:
emp.groupby('job').sal.sum().reset_index()
Out[116]:
job sal
0 ANALYST 6000
1 CLERK 4150
2 MANAGER 8275
3 PRESIDENT 5000
4 SALESMAN 5600
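
groupby can also compute several statistics in one pass with agg (a sketch, assuming the same emp DataFrame):

emp.groupby('deptno').sal.agg(['sum', 'mean', 'max']).reset_index()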

Applying functions (apply)

In [165]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[165]:
A B C D
2013-01-01 0.181765 0.421731 -0.391242 -0.949458
2013-01-02 -1.293275 -0.690032 1.194403 -0.189533
2013-01-03 0.471382 1.351683 0.648124 -0.128018
2013-01-04 -0.748874 -0.648833 -0.393539 0.435896
2013-01-05 0.098486 -2.092289 1.048243 -1.068479
2013-01-06 -1.919552 1.020284 0.426926 -0.297642
In [166]:
df.apply(lambda x: (x-x.min()) / (x.max() - x.min()))
Out[166]:
A B C D
2013-01-01 0.878869 0.729977 0.001447 0.079117
2013-01-02 0.261938 0.407163 1.000000 0.584260
2013-01-03 1.000000 1.000000 0.655983 0.625151
2013-01-04 0.489632 0.419125 0.000000 1.000000
2013-01-05 0.844038 0.000000 0.907956 0.000000
2013-01-06 0.000000 0.903774 0.516685 0.512397
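
The lambda above is applied to each column, giving a per-column min-max normalization. With axis=1 the function is applied to each row instead (a sketch):

df.apply(lambda x: x.max() - x.min(), axis=1)   # range of each row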

Join

In [132]:
emp2 = emp.append(new_df,ignore_index=True)
emp2
Out[132]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [126]:
dept = pd.read_csv("c:/data/dept.csv")
dept
Out[126]:
deptno dname loc
0 10 ACCOUNTING NEW YORK
1 20 RESEARCH DALLAS
2 30 SALES CHICAGO
3 40 OPERATIONS BOSTON
In [139]:
pd.merge(emp2,dept,on='deptno') # default : how = 'inner'
Out[139]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10 ACCOUNTING NEW YORK
1 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10 ACCOUNTING NEW YORK
2 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10 ACCOUNTING NEW YORK
3 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30 SALES CHICAGO
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30 SALES CHICAGO
9 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20 RESEARCH DALLAS
10 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20 RESEARCH DALLAS
11 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20 RESEARCH DALLAS
12 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20 RESEARCH DALLAS
13 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20 RESEARCH DALLAS
In [141]:
pd.merge(emp2,dept,on='deptno',how='left')
Out[141]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10 ACCOUNTING NEW YORK
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30 SALES CHICAGO
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10 ACCOUNTING NEW YORK
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20 RESEARCH DALLAS
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30 SALES CHICAGO
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20 RESEARCH DALLAS
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20 RESEARCH DALLAS
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20 RESEARCH DALLAS
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20 RESEARCH DALLAS
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10 ACCOUNTING NEW YORK
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50 NaN NaN
In [142]:
pd.merge(emp2,dept,on='deptno',how='right')
Out[142]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10.0 ACCOUNTING NEW YORK
1 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10.0 ACCOUNTING NEW YORK
2 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10.0 ACCOUNTING NEW YORK
3 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30.0 SALES CHICAGO
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30.0 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30.0 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30.0 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30.0 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30.0 SALES CHICAGO
9 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20.0 RESEARCH DALLAS
10 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20.0 RESEARCH DALLAS
11 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20.0 RESEARCH DALLAS
12 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20.0 RESEARCH DALLAS
13 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20.0 RESEARCH DALLAS
14 NaN NaN NaN NaN NaN NaN NaN NaN 40.0 OPERATIONS BOSTON
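
how='outer' keeps the unmatched rows from both sides at once, so both DEWY (deptno 50) and OPERATIONS (deptno 40) would appear (a sketch):

pd.merge(emp2, dept, on='deptno', how='outer')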

Stacking (stack)

In [152]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
Out[152]:
A B
first second
bar one -0.298864 0.247955
two -0.045396 0.610319
baz one -0.681461 -0.217126
two 0.706999 0.656189
In [155]:
stacked = df2.stack()
stacked
Out[155]:
first  second   
bar    one     A   -0.298864
               B    0.247955
       two     A   -0.045396
               B    0.610319
baz    one     A   -0.681461
               B   -0.217126
       two     A    0.706999
               B    0.656189
dtype: float64
In [156]:
stacked.unstack()
Out[156]:
A B
first second
bar one -0.298864 0.247955
two -0.045396 0.610319
baz one -0.681461 -0.217126
two 0.706999 0.656189
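
unstack() pivots the innermost index level back to columns by default; a specific level can be chosen as well (a sketch):

stacked.unstack(0)   # move the 'first' level to the columns instead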
Pandas Summary 2

Checking the data used for practice

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))
In [3]:
import pandas as pd

df = pd.DataFrame([['a','a','b','a','b'],[1,1,1,2,2],[1,1,2,2,2]],index=['c1','c2','c3']).T
df
Out[3]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2

Checking DataFrame information

In [46]:
# DataFrame shape
print(df.shape)
print()

# DataFrame info
df.info()
(5, 3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   c1      5 non-null      object
 1   c2      5 non-null      object
 2   c3      5 non-null      object
dtypes: object(3)
memory usage: 248.0+ bytes
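
All three columns show up as object dtype because the frame was built row-wise and then transposed. astype can convert the numeric ones if needed (a sketch that does not modify df):

df[['c2', 'c3']].astype(int).dtypes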

Setting a specific column as the index

In [5]:
df.set_index('c2') # == df.set_index(['c2'])
Out[5]:
c1 c3
c2
1 a 1
1 a 1
1 b 2
2 a 2
2 b 2

Adding new index labels (rows) to an existing DataFrame (reindex)

In [9]:
new_index = [0,1,2,3,4,5,6]
df.reindex(new_index)
Out[9]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 NaN NaN NaN
6 NaN NaN NaN
In [13]:
# Fill in values for the new rows
df.reindex(new_index,fill_value='hello')
Out[13]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 hello hello hello
6 hello hello hello
In [12]:
# Create a fresh integer index
df.reset_index()
Out[12]:
index c1 c2 c3
0 0 a 1 1
1 1 a 1 1
2 2 b 1 2
3 3 a 2 2
4 4 b 2 2
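
By default reset_index keeps the old index as an 'index' column, as shown above; drop=True discards it instead (a sketch):

df.reset_index(drop=True)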

Missing values

In [153]:
new_index = [0,1,2,3,4,5,6]
df2 = df.reindex(new_index)
df2
Out[153]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 NaN NaN NaN
6 NaN NaN NaN
In [92]:
# Apply the function excluding missing values (default)
print(df2.sum(axis=0,skipna=True))

# Apply the function without excluding missing values
df2.sum(axis=0,skipna=False)
c2    7
c3    8
dtype: int64
Out[92]:
c2   NaN
c3   NaN
dtype: float64
In [58]:
# Check for missing values (NaN): True = NaN / False = not missing
df2.isna()
df2.isnull()
Out[58]:
c1 c2 c3
0 False False False
1 False False False
2 False False False
3 False False False
4 False False False
5 True True True
6 True True True
In [17]:
# Fill missing values (NaN)
df2.fillna(0)
Out[17]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 0 0 0
6 0 0 0
In [95]:
# Fill NaN with the value from the previous row (forward fill)
df2.fillna(method='ffill')
Out[95]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 b 2 2
6 b 2 2
In [135]:
df3 = df2 # note: this is a reference to df2, not a copy

# Add a row
df3.loc[7] = [3,5,1]

# Fill NaN with the value from the next row (backward fill)
df3.fillna(method='bfill')
Out[135]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 3 5 1
6 3 5 1
7 3 5 1
In [18]:
# Drop rows containing missing values (NaN)
df2.dropna()
Out[18]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
In [160]:
# Replace data at specific positions
df2.iloc[[5],[0]] = 1
df2.loc[[6],['c1']] = 2

# Drop columns that contain NaN
df2.dropna(axis=1)
Out[160]:
c1
0 a
1 a
2 b
3 a
4 b
5 1
6 2
In [126]:
# thresh=3 keeps only the columns (axis=1) that have at least 3 non-NaN values, so nothing is dropped here
df2.dropna(axis=1,thresh=3)
Out[126]:
c1 c2 c3
0 a 1 1
1 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
5 NaN NaN NaN
6 NaN NaN NaN
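
dropna can also restrict the NaN check to particular columns with subset (a sketch):

df2.dropna(subset=['c1'])   # drop only the rows where c1 is NaN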

Duplicate data

In [105]:
# True if a row duplicates an earlier one, otherwise False
print(df.duplicated())
print()

# Number of duplicate rows
print(df.duplicated().sum())
print()

# Drop duplicate rows
df4 = df.drop_duplicates()
df4
0    False
1     True
2    False
3    False
4    False
dtype: bool

1

Out[105]:
c1 c2 c3
0 a 1 1
2 b 1 2
3 a 2 2
4 b 2 2
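
drop_duplicates can also compare only selected columns and keep the last occurrence instead of the first (a sketch):

df.drop_duplicates(subset=['c2', 'c3'], keep='last')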

Counting data

In [56]:
# Number of non-null values in each column
df.count()
Out[56]:
c1    5
c2    5
c3    5
dtype: int64
In [55]:
# Count of each value in a column
df.c1.value_counts()

# df.value_counts() raises an error in this pandas version (DataFrame.value_counts exists only in newer releases)
Out[55]:
a    3
b    2
Name: c1, dtype: int64

Applying statistical functions

In [60]:
import seaborn as sns

tat = sns.load_dataset('titanic')
tat
Out[60]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

In [62]:
# Sum
tat.age.sum()
tat['age'].sum()
Out[62]:
21205.17
In [63]:
# Mean
tat.age.mean()
tat['age'].mean()
Out[63]:
29.69911764705882
In [64]:
# Median
tat.age.median()
tat['age'].median()
Out[64]:
28.0
In [69]:
# Mode
tat.age.mode()
tat['age'].mode()
Out[69]:
0    24.0
dtype: float64
In [65]:
# Minimum
tat.age.min()
tat['age'].min()
Out[65]:
0.42
In [68]:
# Maximum
tat.age.max()
tat['age'].max()
Out[68]:
80.0
In [72]:
# Standard deviation
tat.age.std()
tat['age'].std()
Out[72]:
14.526497332334044
In [74]:
# Correlation coefficients
tat.corr()
Out[74]:
survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 -0.557080 -0.203367
pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 0.094035 0.135207
age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 0.280328 0.198270
sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 -0.253586 -0.584471
parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 -0.349943 -0.583398
fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 -0.182024 -0.271832
adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 1.000000 0.404744
alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 0.404744 1.000000
In [71]:
# Mean of each numeric column
tat.mean()
Out[71]:
survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

Checking the kinds of values

In [76]:
# Distinct values
tat.age.unique()
Out[76]:
array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])
In [78]:
# Number of distinct values
tat.age.nunique()
Out[78]:
88
In [79]:
# Count per value
tat.age.value_counts()
Out[79]:
24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: age, Length: 88, dtype: int64

pivot

In [84]:
tat.pivot_table(index='sex',columns='class',aggfunc='size')
Out[84]:
class First Second Third
sex
female 94 76 144
male 122 108 347
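
pivot_table can aggregate a value column instead of just counting rows, e.g. the mean age per group (a sketch):

tat.pivot_table(values='age', index='sex', columns='class', aggfunc='mean')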
Pandas Summary

Series

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))
In [3]:
import pandas as pd
import numpy as np

# dictionary -> Series
dict_data = {'a':'hello','b':np.nan,'c':3}
sr = pd.Series(dict_data)
print(type(sr))
print(sr)
<class 'pandas.core.series.Series'>
a    hello
b      NaN
c        3
dtype: object
In [4]:
# list -> Series
sr2 = pd.Series(['2020-07-15',True,5.8])
print(type(sr2))
print(sr2)
<class 'pandas.core.series.Series'>
0    2020-07-15
1          True
2           5.8
dtype: object
In [5]:
# Check the index and the values
print(sr.index,sr2.index)
print(sr.values,sr2.values)
Index(['a', 'b', 'c'], dtype='object') RangeIndex(start=0, stop=3, step=1)
['hello' nan 3] ['2020-07-15' True 5.8]
In [6]:
# tuple -> Series
tuple_data = ('홍길동','남',50,False)
sr3 = pd.Series(tuple_data,index=['이름','성별','나이','결혼여부'])
print(sr3)
print()

print(sr3[0]) # == sr3['이름']
print()

print(sr3[[0,1]]) # == sr3[['이름','성별']]
print()

print(sr3[0:3]) # == sr3['이름':'나이']
print()

# print(sr3[[0:2]]) raises an error (a slice cannot be placed inside a list)
이름        홍길동
성별          남
나이         50
결혼여부    False
dtype: object

홍길동

이름    홍길동
성별      남
dtype: object

이름    홍길동
성별      남
나이     50
dtype: object

Series arithmetic

In [51]:
student1 = pd.Series([np.nan,100,50],index=['국어','수학','영어'])
print(student1)
print()

student2 = pd.Series([20,80],index=['국어','수학'])
print(student2)
print()

# fill_value=0 fills the missing side with 0, so the operation can still be computed
st_add = student1.add(student2,fill_value=0)
st_sub = student1.sub(student2,fill_value=0)
st_mul = student1.mul(student2,fill_value=0)
st_div = student1.div(student2,fill_value=0)

# Because of the missing values (NaN), the plain operators do not compute these entries properly
st_add2 = student1 + student2
st_sub2 = student1 - student2
st_mul2 = student1 * student2
st_div2 = student1 / student2

result = pd.DataFrame([st_add,st_add2,st_sub,st_sub2,st_mul,st_mul2,st_div,st_div2],
                      index=['+','+','-','-','*','*','/','/'])
result
국어      NaN
수학    100.0
영어     50.0
dtype: float64

국어    20
수학    80
dtype: int64

Out[51]:
국어 수학 영어
+ 20.0 180.00 50.0
+ NaN 180.00 NaN
- -20.0 20.00 50.0
- NaN 20.00 NaN
* 0.0 8000.00 0.0
* NaN 8000.00 NaN
/ 0.0 1.25 inf
/ NaN 1.25 NaN

DataFrame

In [7]:
dates = pd.date_range('20200705',periods=3)
print(dates)

dict_data = {'A':0,'B':1,'C':2,'D':3}

df = pd.DataFrame([[1,2,3,4],[2,3,4,5],[3,4,5,6]],index=dates,columns=dict_data.keys())
df
DatetimeIndex(['2020-07-05', '2020-07-06', '2020-07-07'], dtype='datetime64[ns]', freq='D')
Out[7]:
A B C D
2020-07-05 1 2 3 4
2020-07-06 2 3 4 5
2020-07-07 3 4 5 6
In [39]:
df2 = pd.DataFrame(np.arange(12).reshape(3,4),index=dates,columns=list('abcd'))
df2
Out[39]:
a b c d
2020-07-05 0 1 2 3
2020-07-06 4 5 6 7
2020-07-07 8 9 10 11
In [9]:
df3 = pd.DataFrame({'A':1,
                    'B':pd.date_range('20200715',periods=4),
                    'C':pd.Series(3,index=list(range(4)),dtype='float'),
                    'D':pd.Categorical(['hi','my','name','is']),
                    'F':'dewy'})
df3
Out[9]:
A B C D F
0 1 2020-07-15 3.0 hi dewy
1 1 2020-07-16 3.0 my dewy
2 1 2020-07-17 3.0 name dewy
3 1 2020-07-18 3.0 is dewy
In [10]:
# Show the dtype of each column
df3.dtypes
Out[10]:
A             int64
B    datetime64[ns]
C           float64
D          category
F            object
dtype: object
In [11]:
# Show summary statistics for the DataFrame
df.describe()
Out[11]:
A B C D
count 3.0 3.0 3.0 3.0
mean 2.0 3.0 4.0 5.0
std 1.0 1.0 1.0 1.0
min 1.0 2.0 3.0 4.0
25% 1.5 2.5 3.5 4.5
50% 2.0 3.0 4.0 5.0
75% 2.5 3.5 4.5 5.5
max 3.0 4.0 5.0 6.0
In [12]:
# Transpose: .T is an attribute, so no parentheses (calling it raises an error)
df.T
Out[12]:
2020-07-05 2020-07-06 2020-07-07
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
In [13]:
df
Out[13]:
A B C D
2020-07-05 1 2 3 4
2020-07-06 2 3 4 5
2020-07-07 3 4 5 6
In [14]:
# Sorting
df.sort_index(axis=0,ascending=False)
Out[14]:
A B C D
2020-07-07 3 4 5 6
2020-07-06 2 3 4 5
2020-07-05 1 2 3 4
In [15]:
df.sort_index(axis=1,ascending=False)
Out[15]:
D C B A
2020-07-05 4 3 2 1
2020-07-06 5 4 3 2
2020-07-07 6 5 4 3
In [16]:
df.sort_values(by='A',ascending=False)
Out[16]:
A B C D
2020-07-07 3 4 5 6
2020-07-06 2 3 4 5
2020-07-05 1 2 3 4

Renaming columns and index

In [17]:
df.columns = ['가','나','다','라']
df.index = ['a','b','c']

df
Out[17]:
가 나 다 라
a 1 2 3 4
b 2 3 4 5
c 3 4 5 6
In [18]:
# inplace=True is needed for the original DataFrame to change
df.rename(columns={'가':'a','나':'b','다':'c','라':'d'},inplace=True)
df.rename(index={'a':'A','b':'B','c':'C'},inplace=True)

df
Out[18]:
a b c d
A 1 2 3 4
B 2 3 4 5
C 3 4 5 6

Dropping columns and index

In [19]:
# axis=0 : rows / axis=1 : columns
df.drop('A')
Out[19]:
a b c d
B 2 3 4 5
C 3 4 5 6
In [20]:
df.drop('a',axis=1)
Out[20]:
b c d
A 2 3 4
B 3 4 5
C 4 5 6
In [21]:
df.drop(['b','d'],axis=1,inplace=True)
df
Out[21]:
a c
A 1 3
B 2 4
C 3 5
In [22]:
df4 = df
df4
Out[22]:
a c
A 1 3
B 2 4
C 3 5

Selecting columns and index

In [23]:
df3
Out[23]:
A B C D F
0 1 2020-07-15 3.0 hi dewy
1 1 2020-07-16 3.0 my dewy
2 1 2020-07-17 3.0 name dewy
3 1 2020-07-18 3.0 is dewy
In [24]:
# Rows (index) can be selected with a slice
df5 = df3[0:3]
df5
Out[24]:
A B C D F
0 1 2020-07-15 3.0 hi dewy
1 1 2020-07-16 3.0 my dewy
2 1 2020-07-17 3.0 name dewy
In [25]:
# Columns must be selected by column name
df6 = df3[['A','C','F']]
df6
Out[25]:
A C F
0 1 3.0 dewy
1 1 3.0 dewy
2 1 3.0 dewy
3 1 3.0 dewy
In [26]:
# Select a specific column
print(df3.F) # == df3['F']


# Select a specific row (index)
df3[0:1] # even a single row has to be selected as a slice (range)
0    dewy
1    dewy
2    dewy
3    dewy
Name: F, dtype: object
Out[26]:
A B C D F
0 1 2020-07-15 3.0 hi dewy

loc : selecting by label

In [27]:
df2.loc['2020-07-05':'2020-07-07','a':'c'] # == df2.loc[dates[0:3],'a':'c']
Out[27]:
a b c
2020-07-05 0 1 2
2020-07-06 4 5 6
2020-07-07 8 9 10
In [71]:
print(df3.loc[0,:])

df3.loc[[0,1,2],:]
A                      1
B    2020-07-15 00:00:00
C                      3
D                     hi
F                   dewy
Name: 0, dtype: object
Out[71]:
A B C D F
0 1 2020-07-15 3.0 hi dewy
1 1 2020-07-16 3.0 my dewy
2 1 2020-07-17 3.0 name dewy
In [53]:
df2.loc[:,['a','c']]
Out[53]:
a c
2020-07-05 0 2
2020-07-06 4 6
2020-07-07 8 10
In [28]:
# To extract a single value, .at can also be used
df2.loc['2020-07-05','a'] # == df2.at['2020-07-05','a']
Out[28]:
0

iloc : selecting by position

In [29]:
print(df2.iloc[2]) # extract the row at position 2 (2020-07-07)
df2.iloc[0:3,2:4]
a     8
b     9
c    10
d    11
Name: 2020-07-07 00:00:00, dtype: int32
Out[29]:
c d
2020-07-05 2 3
2020-07-06 6 7
2020-07-07 10 11
In [30]:
# Extract specific columns
df2.iloc[:,1:3]
Out[30]:
b c
2020-07-05 1 2
2020-07-06 5 6
2020-07-07 9 10
In [31]:
# Extract specific rows
df2.iloc[0:2,:]
Out[31]:
a b c d
2020-07-05 0 1 2 3
2020-07-06 4 5 6 7
In [32]:
# Extract specific positions only
df2.iloc[[0,2],[0,2]]
Out[32]:
a c
2020-07-05 0 2
2020-07-07 8 10
In [33]:
# To extract a single value, .iat can also be used
df2.iloc[1,1] # == df2.iat[1,1]
Out[33]:
5

Adding columns and index

In [34]:
# Add a column
df['E']=10
df
Out[34]:
a c E
A 1 3 10
B 2 4 10
C 3 5 10
In [35]:
# Add a row (index)
df.loc['K'] = 5
df
Out[35]:
a c E
A 1 3 10
B 2 4 10
C 3 5 10
K 5 5 5
In [36]:
# Change a specific value
df.iloc[0,0] = 100
df
Out[36]:
a c E
A 100 3 10
B 2 4 10
C 3 5 10
K 5 5 5

 

 

Reference sites

- 판다스(pandas) 기본 사용법 익히기 (a Korean translation of "10 Minutes to pandas") : dandyrilla.github.io
- Pandas_01 : daily-life-of-bsh.tistory.com
- 'Python/Pandas' category : kongdols-room.tistory.com

Pandas documentation pages

- pandas documentation (pandas 1.0.5) : pandas.pydata.org
- API reference (pandas 1.0.5) : pandas.pydata.org

Attached files: 윤동주_별헤는밤.txt, lena.png

 

※ When opening a file, no path is needed if it sits in the same directory as the .ipynb,
but if the data is not in the same directory the full path must be written.

 

f = open('윤동주_별헤는밤.txt','r',encoding='utf-8')

 

f = open('d:\윤동주_별헤는밤.txt','r',encoding='utf8')

 

 

 

 

1. Reading and printing a text file (read)

# Reading and printing a text file (read)
# Open the file for reading
file = open('윤동주_별헤는밤.txt','r',encoding='UTF8') # encoding is required for Korean text

# Print the whole file
data = file.read()

print(data)

file.close() # close the file when you are done reading

 

 

2. Reading and printing a text file line by line (readline)

# Read and print a text file one line at a time (readline)
f = open('윤동주_별헤는밤.txt','r',encoding='UTF8')
poet = f.readline()

print(poet) # 별헤는 밤 (the title line)

# Print the full poem
while poet :
    print(poet,end='')
    poet = f.readline()
f.close()

 

 

3. Writing user input to a file (write)

# Write content entered by the user to a file (write)
text = input('Enter the content to save to the file : ')

f = open('data.txt','w')
f.write(text)
f.close()

 

 

Result

 

 

4. Writing lines to a text file (writelines)

# Writing to a text file line by line (write vs. writelines)
listdata = ['a','b','c','d','e','f','g']

f = open('data2.txt','w')
f.write(str(listdata)) # the list is converted to one string and written
f.close()

f = open('data3.txt','w')
f.writelines(listdata)
f.close()

 

 

write stores the whole list as a single string

 

writelines stores the individual elements

 

5. Copying a text file

# Copying a text file
f = open('data3.txt','r')
c = open('data3_copy.txt','w')

data = f.read()
c.write(data)

f.close()
c.close()

 

Result of the copy

 

 

6. Copying a binary file (images, videos)

# Display an image file
import numpy as np
import PIL.Image as pilimg
import matplotlib.pyplot as plt

image = pilimg.open('lena.png')
img = np.array(image)

plt.imshow(img)


# Copy a binary file (image, video)
bufsize = 1024

f = open('lena.png','rb')
c = open('lena_copy.png','wb')

data = f.read(bufsize)
while data : # read 1 KB (1024 bytes) at a time and keep writing until the end
    c.write(data)
    data = f.read(bufsize)

f.close()
c.close()
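
For a simple file copy, the standard library can do the same thing in one call (a sketch; shutil handles the buffered read/write internally):

import shutil
shutil.copyfile('lena.png', 'lena_copy2.png')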

 

 

 

7. Opening and closing files automatically (with~as)

# Without with~as
f = open('윤동주_별헤는밤.txt','r',encoding='utf-8')
data = f.read()
print(data)
f.close()


# With with~as (the file is closed automatically)
with open('윤동주_별헤는밤.txt','r',encoding='utf-8') as f :
    for i in f.readlines() :
        print(i,end='')
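
with~as works with the other file modes too, e.g. appending to the data.txt file written earlier (a sketch):

with open('data.txt', 'a') as f :
    f.write('\nanother line')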

 

 

 
