内容目录
Pandas数据规整¶
层次化索引¶
In [366]:
import numpy as np
import pandas as pd

# A Series with a two-level (hierarchical) row index:
# outer level = letter group, inner level = integer position.
outer = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer, inner])
data
Out[366]:
a 1 0.452458 2 1.769188 3 0.149187 b 1 -0.576214 3 0.566276 c 1 -1.178043 2 -0.131641 d 2 -0.819872 3 -0.780530 dtype: float64
In [367]:
# Partial indexing: select every row under outer-level label 'a'.
data['a']
Out[367]:
1 0.452458 2 1.769188 3 0.149187 dtype: float64
In [368]:
# Label slicing on the outer level — works because the outer labels are sorted.
data['a':'c']
Out[368]:
a 1 0.452458 2 1.769188 3 0.149187 b 1 -0.576214 3 0.566276 c 1 -1.178043 2 -0.131641 dtype: float64
In [371]:
# List-based .loc indexing: pick several outer-level groups at once.
data.loc[['b','d']]
Out[371]:
b 1 -0.576214 3 0.566276 d 2 -0.819872 3 -0.780530 dtype: float64
数据连接¶
In [374]:
# Single-key frames sharing the column 'key'; used for the basic
# pd.merge demos below.
keys = ['K0', 'K1', 'K2', 'K3']
left = pd.DataFrame({'key': keys,
                     'A': ['A%d' % i for i in range(4)],
                     'B': ['B%d' % i for i in range(4)]})
right = pd.DataFrame({'key': keys,
                      'C': ['C%d' % i for i in range(4)],
                      'D': ['D%d' % i for i in range(4)]})
left
Out[374]:
key | A | B | |
---|---|---|---|
0 | K0 | A0 | B0 |
1 | K1 | A1 | B1 |
2 | K2 | A2 | B2 |
3 | K3 | A3 | B3 |
In [375]:
# Inspect the right frame.
right
Out[375]:
key | C | D | |
---|---|---|---|
0 | K0 | C0 | D0 |
1 | K1 | C1 | D1 |
2 | K2 | C2 | D2 |
3 | K3 | C3 | D3 |
In [376]:
# Merge the two frames; with no on= given, merge joins on the column
# name(s) the frames share — here 'key' (inner join by default).
pd.merge(left, right)
Out[376]:
key | A | B | C | D | |
---|---|---|---|---|---|
0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | A1 | B1 | C1 | D1 |
2 | K2 | A2 | B2 | C2 | D2 |
3 | K3 | A3 | B3 | C3 | D3 |
In [380]:
# Frames with two join keys (key1/key2); some key pairs repeat on the
# left and some exist on only one side, exercising the merge variants.
left = pd.DataFrame(
    [('K0', 'K0', 'A0', 'B0'),
     ('K0', 'K1', 'A1', 'B1'),
     ('K1', 'K0', 'A2', 'B2'),
     ('K2', 'K1', 'A3', 'B3')],
    columns=['key1', 'key2', 'A', 'B'])
right = pd.DataFrame(
    [('K0', 'K0', 'C0', 'D0'),
     ('K1', 'K0', 'C1', 'D1'),
     ('K1', 'K0', 'C2', 'D2'),
     ('K2', 'K0', 'C3', 'D3')],
    columns=['key1', 'key2', 'C', 'D'])
left
Out[380]:
key1 | key2 | A | B | |
---|---|---|---|---|
0 | K0 | K0 | A0 | B0 |
1 | K0 | K1 | A1 | B1 |
2 | K1 | K0 | A2 | B2 |
3 | K2 | K1 | A3 | B3 |
In [381]:
# Inspect the right frame (note key2 is always 'K0').
right
Out[381]:
key1 | key2 | C | D | |
---|---|---|---|---|
0 | K0 | K0 | C0 | D0 |
1 | K1 | K0 | C1 | D1 |
2 | K1 | K0 | C2 | D2 |
3 | K2 | K0 | C3 | D3 |
In [382]:
# Join on several key columns at once: rows match only when every
# listed key is equal.
pd.merge(left, right, on=['key1', 'key2'])
Out[382]:
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
In [383]:
# Right join: keep every row of `right`; unmatched left-side columns
# are filled with NaN.
pd.merge(left, right, how='right', on=['key1','key2'])
Out[383]:
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
3 | K2 | K0 | NaN | NaN | C3 | D3 |
In [384]:
# Left join: keep every row of `left`; unmatched right-side columns
# are filled with NaN.
pd.merge(left, right, how='left', on=['key1','key2'])
Out[384]:
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
In [386]:
# Outer join: union of the key combinations from both frames; gaps are
# filled with NaN.
pd.merge(left, right, how='outer', on=['key1','key2'])
Out[386]:
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
5 | K2 | K0 | NaN | NaN | C3 | D3 |
In [388]:
# Frames keyed on a shared 'key' column: df_obj1 has repeated keys,
# df_obj2 one row per key, so the merge below is many-to-one.
key_labels = ['b', 'b', 'a', 'c', 'a', 'a', 'b']
df_obj1 = pd.DataFrame({'key': key_labels,
                        'data': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data': np.random.randint(0, 10, 3)})
In [389]:
# Inspect df_obj1 (note the repeated 'key' values).
df_obj1
Out[389]:
key | data | |
---|---|---|
0 | b | 8 |
1 | b | 7 |
2 | a | 0 |
3 | c | 8 |
4 | a | 0 |
5 | a | 2 |
6 | b | 0 |
In [390]:
# Inspect df_obj2 (one row per key).
df_obj2
Out[390]:
key | data | |
---|---|---|
0 | a | 2 |
1 | b | 6 |
2 | d | 8 |
In [392]:
# Both frames have a non-key column named 'data'; suffixes renames the
# clashing columns to data_left / data_right in the result.
pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left','_right'))
Out[392]:
key | data_left | data_right | |
---|---|---|---|
0 | b | 8 | 6 |
1 | b | 7 | 6 |
2 | b | 0 | 6 |
3 | a | 0 | 2 |
4 | a | 0 | 2 |
5 | a | 2 | 2 |
In [401]:
# Keys for df_obj2 live in its row index rather than in a column, so a
# later merge must pair left_on='key' with right_index=True.
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'data2': np.random.randint(0, 10, 3)},
                       index=list('abd'))
In [402]:
# Inspect df_obj1.
df_obj1
Out[402]:
key | data1 | |
---|---|---|
0 | b | 8 |
1 | b | 7 |
2 | a | 8 |
3 | c | 4 |
4 | a | 1 |
5 | a | 1 |
6 | b | 8 |
In [403]:
# Inspect df_obj2 — its keys are the index labels a/b/d.
df_obj2
Out[403]:
data2 | |
---|---|
a | 5 |
b | 1 |
d | 8 |
In [404]:
# df_obj2 has no 'key' column, so match df_obj1's 'key' values against
# df_obj2's row index ('a', 'b', 'd') via right_index=True.
# (Original comment said a/b/c — the index is actually a, b, d.)
pd.merge(df_obj1, df_obj2, left_on='key', suffixes=('_left','_right'), right_index=True)
Out[404]:
key | data1 | data2 | |
---|---|---|---|
0 | b | 8 | 1 |
1 | b | 7 | 1 |
6 | b | 8 | 1 |
2 | a | 8 | 5 |
4 | a | 1 | 5 |
5 | a | 1 | 5 |
数据合并¶
In [405]:
# Score tables keyed by student id; the row labels only partially
# overlap (shared: 'c' and 'e'), which drives the index-merge demos.
left2 = pd.DataFrame(np.arange(1, 7).reshape(3, 2),
                     index=list('ace'),
                     columns=['语文', '数学'])
right2 = pd.DataFrame(np.arange(7, 15).reshape(4, 2),
                      index=list('bcde'),
                      columns=['英语', '综合'])
In [406]:
# Inspect left2.
left2
Out[406]:
语文 | 数学 | |
---|---|---|
a | 1 | 2 |
c | 3 | 4 |
e | 5 | 6 |
In [407]:
# Inspect right2.
right2
Out[407]:
英语 | 综合 | |
---|---|---|
b | 7 | 8 |
c | 9 | 10 |
d | 11 | 12 |
e | 13 | 14 |
In [408]:
# Outer-merge the two frames, joining both sides by their row indexes.
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Out[408]:
语文 | 数学 | 英语 | 综合 | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
In [409]:
# join() joins on the index by default; the two frames must not share
# column names (otherwise lsuffix/rsuffix are required).
left2.join(right2, how='outer')
Out[409]:
语文 | 数学 | 英语 | 综合 | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
np.concatenate数据连接¶
In [412]:
import numpy as np
import pandas as pd

# Two random 3x4 integer arrays (values in [0, 10)) used to demo
# np.concatenate below.
shape = (3, 4)
arr1 = np.random.randint(0, 10, shape)
arr2 = np.random.randint(0, 10, shape)
print(arr1)
print(arr2)
[[0 7 5 1] [8 6 5 8] [2 6 7 4]] [[0 5 9 4] [3 6 5 1] [9 5 9 7]]
In [415]:
# Default axis=0: stack the arrays vertically (rows appended).
print(np.concatenate([arr1, arr2]))
[[0 7 5 1] [8 6 5 8] [2 6 7 4] [0 5 9 4] [3 6 5 1] [9 5 9 7]]
In [416]:
# axis=1: join the arrays side by side (columns appended).
print(np.concatenate([arr1, arr2], axis=1))
[[0 7 5 1 0 5 9 4] [8 6 5 8 3 6 5 1] [2 6 7 4 9 5 9 7]]
pd.concat数据连接¶
In [418]:
# Frames with disjoint columns and partially overlapping row labels,
# used to demo pd.concat along each axis.
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=['a', 'b', 'c'],
                   columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(4).reshape(2, 2),
                   index=['a', 'c'],
                   columns=['three', 'four'])
In [419]:
# Inspect df1.
df1
Out[419]:
one | two | |
---|---|---|
a | 0 | 1 |
b | 2 | 3 |
c | 4 | 5 |
In [420]:
# Inspect df2.
df2
Out[420]:
three | four | |
---|---|---|
a | 0 | 1 |
c | 2 | 3 |
In [422]:
# Row-wise concat: original row labels are kept (duplicates allowed);
# columns absent from one frame are filled with NaN.
pd.concat([df1, df2])
Out[422]:
one | two | three | four | |
---|---|---|---|---|
a | 0.0 | 1.0 | NaN | NaN |
b | 2.0 | 3.0 | NaN | NaN |
c | 4.0 | 5.0 | NaN | NaN |
a | NaN | NaN | 0.0 | 1.0 |
c | NaN | NaN | 2.0 | 3.0 |
In [423]:
# Column-wise concat aligns rows by index label; df2 has no row 'b',
# so its columns get NaN there.
pd.concat([df1, df2], axis=1)
Out[423]:
one | two | three | four | |
---|---|---|---|---|
a | 0 | 1 | 0.0 | 1.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 2.0 | 3.0 |
重塑和轴向旋转¶
In [425]:
# 2x3 score table with named axes: rows are people ('姓名'), columns
# are subjects ('科目'); used to demo stack()/unstack().
names = pd.Index(['老王','小刘'], name='姓名')
subjects = pd.Index(['语文','数学','英语'], name='科目')
data = pd.DataFrame(np.arange(6).reshape(2,3), index=names, columns=subjects)
In [426]:
# Inspect the 2x3 score frame.
data
Out[426]:
科目 | 语文 | 数学 | 英语 |
---|---|---|---|
姓名 | |||
老王 | 0 | 1 | 2 |
小刘 | 3 | 4 | 5 |
In [431]:
# Reshape via hierarchical indexing.
# stack(): pivot the columns into the inner level of a row MultiIndex,
# turning the DataFrame into a Series.
r = data.stack()
r
Out[431]:
姓名 科目 老王 语文 0 数学 1 英语 2 小刘 语文 3 数学 4 英语 5 dtype: int64
In [432]:
# unstack(): inverse of stack() — pivot the inner index level back
# into columns, turning the Series back into a DataFrame.
r.unstack()
Out[432]:
科目 | 语文 | 数学 | 英语 |
---|---|---|---|
姓名 | |||
老王 | 0 | 1 | 2 |
小刘 | 3 | 4 | 5 |
In [433]:
# Axis pivoting: long-format records, one row per (date, class) pair.
records = [('2018-11-22', 'a', 5),
           ('2018-11-22', 'b', 3),
           ('2018-11-23', 'b', 2),
           ('2018-11-23', 'c', 6),
           ('2018-11-24', 'c', 1)]
df3 = pd.DataFrame(records, columns=['date', 'class', 'values'])
df3
Out[433]:
date | class | values | |
---|---|---|---|
0 | 2018-11-22 | a | 5 |
1 | 2018-11-22 | b | 3 |
2 | 2018-11-23 | b | 2 |
3 | 2018-11-23 | c | 6 |
4 | 2018-11-24 | c | 1 |
In [441]:
# pivot(): reshape long -> wide. 'date' becomes the row index and
# 'class' the column labels; the values argument is omitted, so the
# remaining column ('values') supplies the cell data.
# (Original comment said the column parameter was omitted — it is
# actually `values` that is omitted; columns='class' IS given.)
pivot_df = df3.pivot(index='date', columns='class')
pivot_df
Out[441]:
values | |||
---|---|---|---|
class | a | b | c |
date | |||
2018-11-22 | 5.0 | 3.0 | NaN |
2018-11-23 | NaN | 2.0 | 6.0 |
2018-11-24 | NaN | NaN | 1.0 |
In [442]:
# set_index on both key columns followed by unstack('class') produces
# the same wide table as pivot() above.
df3.set_index(['date','class']).unstack('class')
Out[442]:
values | |||
---|---|---|---|
class | a | b | c |
date | |||
2018-11-22 | 5.0 | 3.0 | NaN |
2018-11-23 | NaN | 2.0 | 6.0 |
2018-11-24 | NaN | NaN | 1.0 |
数据分组和聚合¶
In [444]:
# Load the sample employee table (latin1-encoded CSV).
# NOTE(review): relative path — assumes the notebook runs from the
# directory containing testdata/; confirm.
df_ = pd.read_csv('testdata/Employee Sample Data.csv',encoding='latin1')
df_.head(5)
Out[444]:
EEID | Full Name | Job Title | Department | Business Unit | Gender | Ethnicity | Age | Hire Date | Annual Salary | Bonus % | Country | City | Exit Date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | E02387 | Emily Davis | Sr. Manger | IT | Research & Development | Female | Black | 55 | 4/8/2016 | $141,604 | 15% | United States | Seattle | 10/16/2021 |
1 | E04105 | Theodore Dinh | Technical Architect | IT | Manufacturing | Male | Asian | 59 | 11/29/1997 | $99,975 | 0% | China | Chongqing | NaN |
2 | E02572 | Luna Sanders | Director | Finance | Speciality Products | Female | Caucasian | 50 | 10/26/2006 | $163,099 | 20% | United States | Chicago | NaN |
3 | E02832 | Penelope Jordan | Computer Systems Manager | IT | Manufacturing | Female | Caucasian | 26 | 9/27/2019 | $84,913 | 7% | United States | Chicago | NaN |
4 | E01639 | Austin Vo | Sr. Analyst | Finance | Manufacturing | Male | Asian | 55 | 11/20/1995 | $95,409 | 0% | United States | Phoenix | NaN |
如果要获取Country列中美国和中国各自雇员的人数,就需要使用分组(groupby)和聚合(aggregation)功能。
In [453]:
# Split the rows into one sub-frame per country.  Iterating a GroupBy
# yields (key, sub-frame) pairs, which a dict comprehension collects
# directly.
group = {country: rows for country, rows in df_.groupby(by='Country')}
group.keys()
Out[453]:
dict_keys(['Brazil', 'China', 'United States'])
In [456]:
# Aggregation: count employees per country, using the unique EEID
# column as the counted field.
df_.groupby(by='Country')['EEID'].count()
Out[456]:
Country Brazil 139 China 218 United States 643 Name: EEID, dtype: int64
In [457]:
# Mean employee age per country.
df_.groupby(by='Country')['Age'].mean()
Out[457]:
Country Brazil 43.654676 China 45.389908 United States 44.197512 Name: Age, dtype: float64
In [459]:
# as_index=False keeps the group key ('Country') as a regular column,
# so the result gets a default integer index instead of a Country
# index.  (Original comment claimed as_index "adds a row index" —
# it actually controls whether the key becomes the index.)
df_.groupby(by='Country', as_index=False)['Age'].mean()
Out[459]:
Country | Age | |
---|---|---|
0 | Brazil | 43.654676 |
1 | China | 45.389908 |
2 | United States | 44.197512 |
In [461]:
# Spread (max - min) of ages within each country group, computed via a
# custom aggregation function passed to .agg().
def diff(arr):
    """Return the spread (max - min) of the values in arr."""
    lowest, highest = arr.min(), arr.max()
    return highest - lowest

df_.groupby(by='Country')['Age'].agg(diff)
Out[461]:
Country Brazil 40 China 40 United States 40 Name: Age, dtype: int64
In [462]:
# Mean age per (country, city) pair; grouping by several columns
# yields a MultiIndex on the result.
df_.groupby(by=['Country','City'])['Age'].mean()
Out[462]:
Country City Brazil Manaus 46.000000 Rio de Janerio 42.454545 Sao Paulo 42.792453 China Beijing 46.309091 Chengdu 42.456522 Chongqing 46.692308 Shanghai 45.384615 United States Austin 44.848485 Chicago 43.652174 Columbus 43.716814 Miami 44.375000 Phoenix 44.302752 Seattle 44.271186 Name: Age, dtype: float64
In [463]:
# Group COLUMNS via a mapping (axis=1): columns mapped to the same
# label are summed together; summing string columns concatenates
# their values (e.g. Country + City -> 'United StatesSeattle').
# NOTE(review): DataFrame.groupby(..., axis=1) is deprecated in recent
# pandas and slated for removal; newer versions need
# df_.T.groupby(dict_m).sum().T instead — confirm the targeted pandas
# version.
dict_m = {'Country': 'C','City':'C','Gender':'G','Department':'D'}
df_.groupby(dict_m, axis=1).sum()
Out[463]:
C | D | G | |
---|---|---|---|
0 | United StatesSeattle | IT | Female |
1 | ChinaChongqing | IT | Male |
2 | United StatesChicago | Finance | Female |
3 | United StatesChicago | IT | Female |
4 | United StatesPhoenix | Finance | Male |
… | … | … | … |
995 | United StatesColumbus | Marketing | Male |
996 | ChinaChengdu | Finance | Female |
997 | United StatesMiami | Marketing | Male |
998 | ChinaChengdu | Finance | Female |
999 | United StatesMiami | Accounting | Female |
1000 rows × 3 columns