In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important;}</style>"))
In [12]:
import pandas as pd
emp = pd.read_csv("c:/data/emp3.csv")
emp
Out[12]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
In [14]:
emp.query('sal<=3000')
emp[emp.sal<=3000]
Out[14]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
In [18]:
emp.query('job in ["SALESMAN"]')
emp[emp.job.isin(["SALESMAN"])]
Out[18]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
In [20]:
emp.query('job not in ["SALESMAN"]')
emp[~emp.job.isin(['SALESMAN'])] # not in == ~
Out[20]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
대문자 소문자 바꾸기¶
In [34]:
emp[['ename']] = emp['ename'].str.lower()
emp
Out[34]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | king | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | blake | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | clark | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | jones | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | martin | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | allen | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | turner | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | james | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | ward | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | ford | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | smith | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | scott | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | adams | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | miller | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
In [37]:
emp[['ename']] = emp.ename.str.upper()
emp
Out[37]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
In [38]:
# Series로 출력
emp['ename']
Out[38]:
0 KING 1 BLAKE 2 CLARK 3 JONES 4 MARTIN 5 ALLEN 6 TURNER 7 JAMES 8 WARD 9 FORD 10 SMITH 11 SCOTT 12 ADAMS 13 MILLER Name: ename, dtype: object
In [39]:
# DataFrame으로 출력
emp[['ename']]
Out[39]:
ename | |
---|---|
0 | KING |
1 | BLAKE |
2 | CLARK |
3 | JONES |
4 | MARTIN |
5 | ALLEN |
6 | TURNER |
7 | JAMES |
8 | WARD |
9 | FORD |
10 | SMITH |
11 | SCOTT |
12 | ADAMS |
13 | MILLER |
합치기(concat,append)¶
In [103]:
new = [15, 7321, 'DEWY', 'MANAGER', np.NaN, '2020-07-16', 3000, np.nan, 50]
# index, columns는 반드시 리스트 형으로 넣어줘야함
# 참고 : https://freedata.tistory.com/53
new_df = pd.DataFrame(new,index=emp.columns,columns=['A']).T
new_df
Out[103]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
A | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 |
In [104]:
# 데이터프레임을 데이터프레임에 넣으면 append나 concat이나 같은결과가 나온다.
emp.append(new_df,ignore_index=True)
Out[104]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
14 | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 |
In [107]:
pd.concat([emp,new_df],axis=0,ignore_index=False)
Out[107]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
A | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 |
일반 리스트형을 append하면 결과값 이상하게 나옴 시리즈도 마찬가지...¶
In [91]:
new2 = pd.Series([15, 7321, 'DEWY', 'MANAGER', np.NaN, '2020-07-16', 3000, np.nan, 50])
emp.append(new2,ignore_index=True)
pd.concat([emp,new_df],axis=1)
Out[91]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 7839.0 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000.0 | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2.0 | 7698.0 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850.0 | NaN | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 3.0 | 7782.0 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450.0 | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 4.0 | 7566.0 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975.0 | NaN | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 5.0 | 7654.0 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250.0 | 1400.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | 6.0 | 7499.0 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600.0 | 300.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | 7.0 | 7844.0 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500.0 | 0.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | 8.0 | 7900.0 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950.0 | NaN | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | 9.0 | 7521.0 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250.0 | 500.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | 10.0 | 7902.0 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000.0 | NaN | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 | 11.0 | 7369.0 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800.0 | NaN | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | 12.0 | 7788.0 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000.0 | NaN | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | 13.0 | 7876.0 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100.0 | NaN | 20.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13 | 14.0 | 7934.0 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300.0 | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 |
그룹화(groupby)¶
In [116]:
emp.groupby('job').sal.sum().reset_index()
Out[116]:
job | sal | |
---|---|---|
0 | ANALYST | 6000 |
1 | CLERK | 4150 |
2 | MANAGER | 8275 |
3 | PRESIDENT | 5000 |
4 | SALESMAN | 5600 |
In [165]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[165]:
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.181765 | 0.421731 | -0.391242 | -0.949458 |
2013-01-02 | -1.293275 | -0.690032 | 1.194403 | -0.189533 |
2013-01-03 | 0.471382 | 1.351683 | 0.648124 | -0.128018 |
2013-01-04 | -0.748874 | -0.648833 | -0.393539 | 0.435896 |
2013-01-05 | 0.098486 | -2.092289 | 1.048243 | -1.068479 |
2013-01-06 | -1.919552 | 1.020284 | 0.426926 | -0.297642 |
In [166]:
df.apply(lambda x: (x-x.min()) / (x.max() - x.min()))
Out[166]:
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.878869 | 0.729977 | 0.001447 | 0.079117 |
2013-01-02 | 0.261938 | 0.407163 | 1.000000 | 0.584260 |
2013-01-03 | 1.000000 | 1.000000 | 0.655983 | 0.625151 |
2013-01-04 | 0.489632 | 0.419125 | 0.000000 | 1.000000 |
2013-01-05 | 0.844038 | 0.000000 | 0.907956 | 0.000000 |
2013-01-06 | 0.000000 | 0.903774 | 0.516685 | 0.512397 |
Join¶
In [132]:
emp2 = emp.append(new_df,ignore_index=True)
emp2
Out[132]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 |
14 | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 |
In [126]:
dept = pd.read_csv("c:/data/dept.csv")
dept
Out[126]:
deptno | dname | loc | |
---|---|---|---|
0 | 10 | ACCOUNTING | NEW YORK |
1 | 20 | RESEARCH | DALLAS |
2 | 30 | SALES | CHICAGO |
3 | 40 | OPERATIONS | BOSTON |
In [139]:
pd.merge(emp2,dept,on='deptno') # default : how = 'inner'
Out[139]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | dname | loc | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 | ACCOUNTING | NEW YORK |
1 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 | ACCOUNTING | NEW YORK |
2 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 | ACCOUNTING | NEW YORK |
3 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 | SALES | CHICAGO |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 | SALES | CHICAGO |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 | SALES | CHICAGO |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 | SALES | CHICAGO |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 | SALES | CHICAGO |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 | SALES | CHICAGO |
9 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 | RESEARCH | DALLAS |
10 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 | RESEARCH | DALLAS |
11 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 | RESEARCH | DALLAS |
12 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 | RESEARCH | DALLAS |
13 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 | RESEARCH | DALLAS |
In [141]:
pd.merge(emp2,dept,on='deptno',how='left')
Out[141]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | dname | loc | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10 | ACCOUNTING | NEW YORK |
1 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30 | SALES | CHICAGO |
2 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10 | ACCOUNTING | NEW YORK |
3 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20 | RESEARCH | DALLAS |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30 | SALES | CHICAGO |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30 | SALES | CHICAGO |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30 | SALES | CHICAGO |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30 | SALES | CHICAGO |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30 | SALES | CHICAGO |
9 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20 | RESEARCH | DALLAS |
10 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20 | RESEARCH | DALLAS |
11 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20 | RESEARCH | DALLAS |
12 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20 | RESEARCH | DALLAS |
13 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10 | ACCOUNTING | NEW YORK |
14 | 15 | 7321 | DEWY | MANAGER | NaN | 2020-07-16 | 3000 | NaN | 50 | NaN | NaN |
In [142]:
pd.merge(emp2,dept,on='deptno',how='right')
Out[142]:
index | empno | ename | job | mgr | hiredate | sal | comm | deptno | dname | loc | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 7839 | KING | PRESIDENT | NaN | 1981-11-17 0:00 | 5000 | NaN | 10.0 | ACCOUNTING | NEW YORK |
1 | 3 | 7782 | CLARK | MANAGER | 7839.0 | 1981-05-09 0:00 | 2450 | NaN | 10.0 | ACCOUNTING | NEW YORK |
2 | 14 | 7934 | MILLER | CLERK | 7782.0 | 1982-01-11 0:00 | 1300 | NaN | 10.0 | ACCOUNTING | NEW YORK |
3 | 2 | 7698 | BLAKE | MANAGER | 7839.0 | 1981-05-01 0:00 | 2850 | NaN | 30.0 | SALES | CHICAGO |
4 | 5 | 7654 | MARTIN | SALESMAN | 7698.0 | 1981-09-10 0:00 | 1250 | 1400.0 | 30.0 | SALES | CHICAGO |
5 | 6 | 7499 | ALLEN | SALESMAN | 7698.0 | 1981-02-11 0:00 | 1600 | 300.0 | 30.0 | SALES | CHICAGO |
6 | 7 | 7844 | TURNER | SALESMAN | 7698.0 | 1981-08-21 0:00 | 1500 | 0.0 | 30.0 | SALES | CHICAGO |
7 | 8 | 7900 | JAMES | CLERK | 7698.0 | 1981-12-11 0:00 | 950 | NaN | 30.0 | SALES | CHICAGO |
8 | 9 | 7521 | WARD | SALESMAN | 7698.0 | 1981-02-23 0:00 | 1250 | 500.0 | 30.0 | SALES | CHICAGO |
9 | 4 | 7566 | JONES | MANAGER | 7839.0 | 1981-04-01 0:00 | 2975 | NaN | 20.0 | RESEARCH | DALLAS |
10 | 10 | 7902 | FORD | ANALYST | 7566.0 | 1981-12-11 0:00 | 3000 | NaN | 20.0 | RESEARCH | DALLAS |
11 | 11 | 7369 | SMITH | CLERK | 7902.0 | 1980-12-09 0:00 | 800 | NaN | 20.0 | RESEARCH | DALLAS |
12 | 12 | 7788 | SCOTT | ANALYST | 7566.0 | 1982-12-22 0:00 | 3000 | NaN | 20.0 | RESEARCH | DALLAS |
13 | 13 | 7876 | ADAMS | CLERK | 7788.0 | 1983-01-15 0:00 | 1100 | NaN | 20.0 | RESEARCH | DALLAS |
14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.0 | OPERATIONS | BOSTON |
압축(stack)¶
In [152]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
Out[152]:
A | B | ||
---|---|---|---|
first | second | ||
bar | one | -0.298864 | 0.247955 |
two | -0.045396 | 0.610319 | |
baz | one | -0.681461 | -0.217126 |
two | 0.706999 | 0.656189 |
In [155]:
stacked = df2.stack()
stacked
Out[155]:
first second bar one A -0.298864 B 0.247955 two A -0.045396 B 0.610319 baz one A -0.681461 B -0.217126 two A 0.706999 B 0.656189 dtype: float64
In [156]:
stacked.unstack()
Out[156]:
A | B | ||
---|---|---|---|
first | second | ||
bar | one | -0.298864 | 0.247955 |
two | -0.045396 | 0.610319 | |
baz | one | -0.681461 | -0.217126 |
two | 0.706999 | 0.656189 |
'코딩 > Python' 카테고리의 다른 글
판다스 이해하기 - 시리즈에서 문자열 조작하기 (0) | 2020.07.20 |
---|---|
판다스 이해하기 - 분할, 더미변수, 문자형 날짜형 변환 (0) | 2020.07.16 |
판다스 이해하기 - 데이터 정보 확인, 결측치, 중복 데이터, pivot (0) | 2020.07.16 |
판다스 이해하기 - 시리즈, 데이터프레임 이해, 생성, loc, iloc (0) | 2020.07.15 |
판다스 이해하기 - 참고 사이트 모음 (1) | 2020.07.15 |