판다스 정리3
In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))

조건

Query 사용 and DataFrame[[컬럼명]][조건]

In [12]:
import pandas as pd

emp = pd.read_csv("c:/data/emp3.csv")
emp
Out[12]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [14]:
emp.query('sal<=3000')
emp[emp.sal<=3000]
Out[14]:
index empno ename job mgr hiredate sal comm deptno
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [18]:
emp.query('job in ["SALESMAN"]')
emp[emp.job.isin(["SALESMAN"])]
Out[18]:
index empno ename job mgr hiredate sal comm deptno
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
In [20]:
emp.query('job not in ["SALESMAN"]')
emp[~emp.job.isin(['SALESMAN'])] # not in == ~
Out[20]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10

대문자 소문자 바꾸기

In [34]:
emp[['ename']] = emp['ename'].str.lower()
emp
Out[34]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 king PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 blake MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 clark MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 jones MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 martin SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 allen SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 turner SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 james CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 ward SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 ford ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 smith CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 scott ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 adams CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 miller CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [37]:
emp[['ename']] = emp.ename.str.upper()
emp
Out[37]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
In [38]:
# Series로 출력
emp['ename']
Out[38]:
0       KING
1      BLAKE
2      CLARK
3      JONES
4     MARTIN
5      ALLEN
6     TURNER
7      JAMES
8       WARD
9       FORD
10     SMITH
11     SCOTT
12     ADAMS
13    MILLER
Name: ename, dtype: object
In [39]:
# DataFrame으로 출력
emp[['ename']]
Out[39]:
ename
0 KING
1 BLAKE
2 CLARK
3 JONES
4 MARTIN
5 ALLEN
6 TURNER
7 JAMES
8 WARD
9 FORD
10 SMITH
11 SCOTT
12 ADAMS
13 MILLER

합치기(concat,append)

In [103]:
new = [15, 7321, 'DEWY', 'MANAGER', np.NaN, '2020-07-16', 3000, np.nan, 50]

# index, columns는 반드시 리스트 형으로 넣어줘야함
# 참고 : https://freedata.tistory.com/53
new_df = pd.DataFrame(new,index=emp.columns,columns=['A']).T
new_df
Out[103]:
index empno ename job mgr hiredate sal comm deptno
A 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [104]:
# 데이터프레임을 데이터프레임에 넣으면 append나 concat이나 같은결과가 나온다.
emp.append(new_df,ignore_index=True)
Out[104]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [107]:
pd.concat([emp,new_df],axis=0,ignore_index=False)
Out[107]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
A 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50

일반 리스트형을 append하면 결과값 이상하게 나옴 시리즈도 마찬가지...

In [91]:
new2 = pd.Series([15, 7321, 'DEWY', 'MANAGER', np.NaN, '2020-07-16', 3000, np.nan, 50])

emp.append(new2,ignore_index=True)
pd.concat([emp,new_df],axis=1)
Out[91]:
index empno ename job mgr hiredate sal comm deptno index empno ename job mgr hiredate sal comm deptno
0 1.0 7839.0 KING PRESIDENT NaN 1981-11-17 0:00 5000.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2.0 7698.0 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850.0 NaN 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3.0 7782.0 CLARK MANAGER 7839.0 1981-05-09 0:00 2450.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 4.0 7566.0 JONES MANAGER 7839.0 1981-04-01 0:00 2975.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5.0 7654.0 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250.0 1400.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 6.0 7499.0 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600.0 300.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 7.0 7844.0 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500.0 0.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 8.0 7900.0 JAMES CLERK 7698.0 1981-12-11 0:00 950.0 NaN 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 9.0 7521.0 WARD SALESMAN 7698.0 1981-02-23 0:00 1250.0 500.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 10.0 7902.0 FORD ANALYST 7566.0 1981-12-11 0:00 3000.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
10 11.0 7369.0 SMITH CLERK 7902.0 1980-12-09 0:00 800.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
11 12.0 7788.0 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 13.0 7876.0 ADAMS CLERK 7788.0 1983-01-15 0:00 1100.0 NaN 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 14.0 7934.0 MILLER CLERK 7782.0 1982-01-11 0:00 1300.0 NaN 10.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
15 NaN NaN NaN NaN NaN NaN NaN NaN NaN 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50

그룹화(groupby)

In [116]:
emp.groupby('job').sal.sum().reset_index()
Out[116]:
job sal
0 ANALYST 6000
1 CLERK 4150
2 MANAGER 8275
3 PRESIDENT 5000
4 SALESMAN 5600

함수 적용(apply)

In [165]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[165]:
A B C D
2013-01-01 0.181765 0.421731 -0.391242 -0.949458
2013-01-02 -1.293275 -0.690032 1.194403 -0.189533
2013-01-03 0.471382 1.351683 0.648124 -0.128018
2013-01-04 -0.748874 -0.648833 -0.393539 0.435896
2013-01-05 0.098486 -2.092289 1.048243 -1.068479
2013-01-06 -1.919552 1.020284 0.426926 -0.297642
In [166]:
df.apply(lambda x: (x-x.min()) / (x.max() - x.min()))
Out[166]:
A B C D
2013-01-01 0.878869 0.729977 0.001447 0.079117
2013-01-02 0.261938 0.407163 1.000000 0.584260
2013-01-03 1.000000 1.000000 0.655983 0.625151
2013-01-04 0.489632 0.419125 0.000000 1.000000
2013-01-05 0.844038 0.000000 0.907956 0.000000
2013-01-06 0.000000 0.903774 0.516685 0.512397

Join

In [132]:
emp2 = emp.append(new_df,ignore_index=True)
emp2
Out[132]:
index empno ename job mgr hiredate sal comm deptno
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50
In [126]:
dept = pd.read_csv("c:/data/dept.csv")
dept
Out[126]:
deptno dname loc
0 10 ACCOUNTING NEW YORK
1 20 RESEARCH DALLAS
2 30 SALES CHICAGO
3 40 OPERATIONS BOSTON
In [139]:
pd.merge(emp2,dept,on='deptno') # default : how = 'inner'
Out[139]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10 ACCOUNTING NEW YORK
1 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10 ACCOUNTING NEW YORK
2 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10 ACCOUNTING NEW YORK
3 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30 SALES CHICAGO
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30 SALES CHICAGO
9 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20 RESEARCH DALLAS
10 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20 RESEARCH DALLAS
11 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20 RESEARCH DALLAS
12 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20 RESEARCH DALLAS
13 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20 RESEARCH DALLAS
In [141]:
pd.merge(emp2,dept,on='deptno',how='left')
Out[141]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10 ACCOUNTING NEW YORK
1 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30 SALES CHICAGO
2 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10 ACCOUNTING NEW YORK
3 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20 RESEARCH DALLAS
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30 SALES CHICAGO
9 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20 RESEARCH DALLAS
10 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20 RESEARCH DALLAS
11 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20 RESEARCH DALLAS
12 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20 RESEARCH DALLAS
13 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10 ACCOUNTING NEW YORK
14 15 7321 DEWY MANAGER NaN 2020-07-16 3000 NaN 50 NaN NaN
In [142]:
pd.merge(emp2,dept,on='deptno',how='right')
Out[142]:
index empno ename job mgr hiredate sal comm deptno dname loc
0 1 7839 KING PRESIDENT NaN 1981-11-17 0:00 5000 NaN 10.0 ACCOUNTING NEW YORK
1 3 7782 CLARK MANAGER 7839.0 1981-05-09 0:00 2450 NaN 10.0 ACCOUNTING NEW YORK
2 14 7934 MILLER CLERK 7782.0 1982-01-11 0:00 1300 NaN 10.0 ACCOUNTING NEW YORK
3 2 7698 BLAKE MANAGER 7839.0 1981-05-01 0:00 2850 NaN 30.0 SALES CHICAGO
4 5 7654 MARTIN SALESMAN 7698.0 1981-09-10 0:00 1250 1400.0 30.0 SALES CHICAGO
5 6 7499 ALLEN SALESMAN 7698.0 1981-02-11 0:00 1600 300.0 30.0 SALES CHICAGO
6 7 7844 TURNER SALESMAN 7698.0 1981-08-21 0:00 1500 0.0 30.0 SALES CHICAGO
7 8 7900 JAMES CLERK 7698.0 1981-12-11 0:00 950 NaN 30.0 SALES CHICAGO
8 9 7521 WARD SALESMAN 7698.0 1981-02-23 0:00 1250 500.0 30.0 SALES CHICAGO
9 4 7566 JONES MANAGER 7839.0 1981-04-01 0:00 2975 NaN 20.0 RESEARCH DALLAS
10 10 7902 FORD ANALYST 7566.0 1981-12-11 0:00 3000 NaN 20.0 RESEARCH DALLAS
11 11 7369 SMITH CLERK 7902.0 1980-12-09 0:00 800 NaN 20.0 RESEARCH DALLAS
12 12 7788 SCOTT ANALYST 7566.0 1982-12-22 0:00 3000 NaN 20.0 RESEARCH DALLAS
13 13 7876 ADAMS CLERK 7788.0 1983-01-15 0:00 1100 NaN 20.0 RESEARCH DALLAS
14 NaN NaN NaN NaN NaN NaN NaN NaN 40.0 OPERATIONS BOSTON

압축(stack)

In [152]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
Out[152]:
A B
first second
bar one -0.298864 0.247955
two -0.045396 0.610319
baz one -0.681461 -0.217126
two 0.706999 0.656189
In [155]:
stacked = df2.stack()
stacked
Out[155]:
first  second   
bar    one     A   -0.298864
               B    0.247955
       two     A   -0.045396
               B    0.610319
baz    one     A   -0.681461
               B   -0.217126
       two     A    0.706999
               B    0.656189
dtype: float64
In [156]:
stacked.unstack()
Out[156]:
A B
first second
bar one -0.298864 0.247955
two -0.045396 0.610319
baz one -0.681461 -0.217126
two 0.706999 0.656189

+ Recent posts