Pandas_DataFrameV6

Pandas_DataFrame

In [912]:
import numpy as np
import pandas as pd

Creation

pandas.DataFrame(
          data=None
        , index=None
        , columns=None
        , dtype=None
        , copy=False
        )
data : numpy ndarray (structured or homogeneous), dict, or DataFrame
Dict can contain Series, arrays, constants, or list-like objects
Changed in version 0.23.0: If data is a dict, argument order is maintained for Python 3.6 and later.
index : Index or array-like
Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided
columns : Index or array-like
Column labels to use for resulting frame. Will default to RangeIndex (0, 1, 2, …, n) if no column labels are provided
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer
copy : boolean, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input
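A quick sketch (not one of the original cells) of the dtype and copy arguments described above; the array and column names are made up for illustration:

import numpy as np
import pandas as pd

arr = np.array([[1, 2], [3, 4]])
# dtype forces a single dtype for every column
df = pd.DataFrame(arr, columns=['a', 'b'], dtype=float)
print(df.dtypes)            # a: float64, b: float64

# copy=True detaches the DataFrame from the input ndarray
df_copy = pd.DataFrame(arr, columns=['a', 'b'], copy=True)
arr[0, 0] = 99
print(df_copy.iloc[0, 0])   # still 1, because the data was copied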

From a Series

  • The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided).
In [913]:
s=pd.Series(np.random.randn(5))
pd.DataFrame(s) # each element of the Series becomes one row of the DataFrame
Out[913]:
0
0 0.053168
1 0.644198
2 -1.012411
3 2.371181
4 1.440502
In [914]:
s=pd.Series([1,2,3,4,5],name='Jasper')
pd.DataFrame(s, columns=['Jasper','Casper'])
Out[914]:
Jasper Casper
0 1 NaN
1 2 NaN
2 3 NaN
3 4 NaN
4 5 NaN

From a dict of Series

In [915]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
    }
pd.DataFrame(d)
Out[915]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
In [916]:
pd.DataFrame(d, index=['d', 'b', 'a'])
Out[916]:
one two
d NaN 4.0
b 2.0 2.0
a 1.0 1.0
In [917]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
Out[917]:
two three
d 4.0 NaN
b 2.0 NaN
a 1.0 NaN
In [918]:
df=pd.DataFrame(d)
df.index
Out[918]:
Index(['a', 'b', 'c', 'd'], dtype='object')
In [919]:
pd.DataFrame(d)
df.columns
Out[919]:
Index(['one', 'two'], dtype='object')

From a dict of lists

  • Each element of a list becomes one row of the df
In [920]:
# a DataFrame can also be built directly from a list
l=[1., 2., 3., 4.]
pd.DataFrame(l, columns=['one'])
Out[920]:
one
0 1.0
1 2.0
2 3.0
3 4.0
In [921]:
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame(d)
Out[921]:
one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
In [922]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])
Out[922]:
one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0
In [923]:
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame.from_dict(d)
Out[923]:
one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
In [924]:
# orient='index'
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame.from_dict(d
                       , orient='index'
                       , columns=['one', 'two', 'three', 'four']
)
Out[924]:
one two three four
one 1.0 2.0 3.0 4.0
two 4.0 3.0 2.0 1.0

From a dict of ndarrays

In [925]:
d = {'one': np.array([1.,2.,3.,4.]), 'two': np.array([4.,2.,3.,1.])}
pd.DataFrame(d)
Out[925]:
one two
0 1.0 4.0
1 2.0 2.0
2 3.0 3.0
3 4.0 1.0
In [926]:
d = {'one': np.array([1.,2.,3.,4.]), 'two': np.array([4.,2.,3.,1.])}
pd.DataFrame(d, index=['A','B','C','D'])
Out[926]:
one two
A 1.0 4.0
B 2.0 2.0
C 3.0 3.0
D 4.0 1.0

From a list of dicts

  • Each element of the list becomes a row of the df (as before), but here each row record is a dict, and each (key and) value of that dict becomes a column of the df.
  • When collecting data with a web crawler, each dict-like record can be appended to a list one by one, and the list is then turned into a DataFrame
In [927]:
l = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(l, index=['first', 'second'], columns=['c', 'a', 'b'])
Out[927]:
c a b
first NaN 1 2
second 20.0 5 10

From a dict of tuples

In [928]:
t={  # a dict nested inside a dict: the keys are tuples, and the values are dicts that are also keyed by tuples
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
    ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
    ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
    ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}
}
for i in t.keys():    # iterate over the dict keys
    print(type(i), i)
print('-'*100)
for v in t.values():    # iterate over the dict values
    print(type(v), v)
print('-'*100)
for j in t.keys():    # access the elements of each tuple key by position
    print(j[0], j[1])
print('-'*100)
for x, y in t.keys():   # a second way: unpack the tuple keys directly
    print(x, y)
print('-'*100)
for v2 in t.values():   # access the keys and values of each nested dict
    for k3, v3 in v2.items():
        print(k3, ' | ', v3)
    print('-'*20)
pd.DataFrame(t)
<class 'tuple'> ('a', 'b')
<class 'tuple'> ('a', 'a')
<class 'tuple'> ('a', 'c')
<class 'tuple'> ('b', 'a')
<class 'tuple'> ('b', 'b')
----------------------------------------------------------------------------------------------------
<class 'dict'> {('A', 'B'): 1, ('A', 'C'): 2}
<class 'dict'> {('A', 'C'): 3, ('A', 'B'): 4}
<class 'dict'> {('A', 'B'): 5, ('A', 'C'): 6}
<class 'dict'> {('A', 'C'): 7, ('A', 'B'): 8}
<class 'dict'> {('A', 'D'): 9, ('A', 'B'): 10}
----------------------------------------------------------------------------------------------------
a b
a a
a c
b a
b b
----------------------------------------------------------------------------------------------------
a b
a a
a c
b a
b b
----------------------------------------------------------------------------------------------------
('A', 'B')  |  1
('A', 'C')  |  2
--------------------
('A', 'C')  |  3
('A', 'B')  |  4
--------------------
('A', 'B')  |  5
('A', 'C')  |  6
--------------------
('A', 'C')  |  7
('A', 'B')  |  8
--------------------
('A', 'D')  |  9
('A', 'B')  |  10
--------------------
Out[928]:
       a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  8.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0
In [929]:
pd.DataFrame({  # the outer tuple keys become the column MultiIndex,
                # the tuple keys of the nested dicts become the row MultiIndex,
                # and the values of the nested dicts become the elements of the DataFrame
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
})
Out[929]:
     a
     b
A B  1
  C  2
In [930]:
# another example
pd.DataFrame({
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
    ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
})
Out[930]:
     a   
     b  a
A B  1  4
  C  2  3

From an ndarray

In [931]:
# note the difference from a dict of ndarrays: this DataFrame is built directly from an ndarray (not a dict)
ar=np.array([[1.,2.,3.,4.],[3,4,5,6]])
print(pd.DataFrame(ar, dtype=int, columns=list('ABCD'))) # each inner list of the array becomes one row, its elements filling the columns
print(pd.DataFrame(ar).index) 
pd.DataFrame(ar)
   A  B  C  D
0  1  2  3  4
1  3  4  5  6
RangeIndex(start=0, stop=2, step=1)
Out[931]:
0 1 2 3
0 1.0 2.0 3.0 4.0
1 3.0 4.0 5.0 6.0

From records

In [932]:
data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])
data[:] = [(1,2.,'Hello'), (2,3.,"World")]
print(type(data))
print(data)
pd.DataFrame(data, index=['first', 'second'], columns=['C', 'A', 'B'])
<class 'numpy.ndarray'>
[(1, 2., b'Hello') (2, 3., b'World')]
Out[932]:
C A B
first b'Hello' 1 2.0
second b'World' 2 3.0
In [933]:
pd.DataFrame.from_records(data, index='C')
Out[933]:
A B
C
b'Hello' 1 2.0
b'World' 2 3.0

Creating a DataFrame with a MultiIndex

MultiIndex.from_arrays

In [934]:
arrays = [[1, 1, 2, 2], ['Oct', 'Nov', 'Oct', 'Nov']]
index = pd.MultiIndex.from_arrays(arrays, names=('Quarter', 'periods'))
print(index)
columns = pd.MultiIndex.from_tuples([('company1', 'area1'),
                                     ('company2', 'area2')])
print(columns)
df = pd.DataFrame([(389.0, 18),
                   ( 24.0, 59),
                   ( 80.5, None),
                   (np.nan, 550)],
                  index=index,
                  columns=columns)
df
MultiIndex(levels=[[1, 2], ['Nov', 'Oct']],
           labels=[[0, 0, 1, 1], [1, 0, 1, 0]],
           names=['Quarter', 'periods'])
MultiIndex(levels=[['company1', 'company2'], ['area1', 'area2']],
           labels=[[0, 1], [0, 1]])
Out[934]:
                company1 company2
                   area1    area2
Quarter periods                  
1       Oct        389.0     18.0
        Nov         24.0     59.0
2       Oct         80.5      NaN
        Nov          NaN    550.0

MultiIndex.from_tuples

In [935]:
index = pd.MultiIndex.from_tuples([('bird', 'falcon'),  # works nicely together with zip()
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('species', 'type')])
print(index)
df = pd.DataFrame([(389.0, 'fly'),
                   ( 24.0, 'fly'),
                   ( 80.5, 'run'),
                   (np.nan, 'jump')],
                  index=index,
                  columns=columns)
df
MultiIndex(levels=[['bird', 'mammal'], ['falcon', 'lion', 'monkey', 'parrot']],
           labels=[[0, 0, 1, 1], [0, 3, 1, 2]],
           names=['class', 'name'])
Out[935]:
               speed species
                 max    type
class  name                 
bird   falcon  389.0     fly
       parrot   24.0     fly
mammal lion     80.5     run
       monkey    NaN    jump
In [936]:
m_index=pd.Index([("A","x1"),("A","x2"),("B","y1"),("B","y2"),("B","y3"),("B","y4"),("B","y5"),("B","y6"),("B","y7")],name=["class1","class2"])
print(m_index)
df = pd.DataFrame(
      data=[[45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8], [320, 250], [1, 0.8], [0.3,0.2]]
    , index=m_index
    , columns=['big', 'small']
)
df
MultiIndex(levels=[['A', 'B'], ['x1', 'x2', 'y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7']],
           labels=[[0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 5, 6, 7, 8]],
           names=['class1', 'class2'])
Out[936]:
                 big  small
class1 class2              
A      x1       45.0   30.0
       x2      200.0  100.0
B      y1        1.5    1.0
       y2       30.0   20.0
       y3      250.0  150.0
       y4        1.5    0.8
       y5      320.0  250.0
       y6        1.0    0.8
       y7        0.3    0.2

MultiIndex.from_product

In [937]:
m_index=pd.MultiIndex.from_product([["Y1","Y2"],['Q1','Q2','Q3','Q4']],names=["Year","Quarter"])
print(m_index)
df=pd.DataFrame(
                  np.random.randint(5,15,(3,8))
                , index=['one','two','three']
                , columns=m_index
)
df
MultiIndex(levels=[['Y1', 'Y2'], ['Q1', 'Q2', 'Q3', 'Q4']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]],
           names=['Year', 'Quarter'])
Out[937]:
Year    Y1              Y2            
Quarter Q1  Q2  Q3  Q4  Q1  Q2  Q3  Q4
one      9  11  13   8  11  10   7  13
two     14  14   9  12  12  12  13  14
three   13  12  11  11   6   6  14   9

API Reference

General functions

Data manipulations

pd.melt()

  • pandas.melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None)
  • The inverse operation of a pivot table
  • Converts a wide-format table into a long-format table
  • Turns column names into values of a variable
  • Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set.

    frame : DataFrame to melt
    id_vars : tuple, list, or ndarray, optional
    Column(s) to use as identifier variables (these columns are not unpivoted).
    value_vars : tuple, list, or ndarray, optional
    Column(s) to unpivot. If not specified, all columns that are not set as id_vars are used.
    var_name : scalar
    Name to use for the ‘variable’ column. If None it uses frame.columns.name or ‘variable’.
    value_name : scalar, default ‘value’
    Name to use for the ‘value’ column.
    col_level : int or string, optional
    If columns are a MultiIndex then use this level to melt.
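A small sketch (not one of the original cells) of the var_name and value_name arguments, which the examples below do not use; the column names are made up:

import pandas as pd

df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 3, 5], 'C': [2, 4, 6]})
# var_name / value_name rename the generated 'variable' and 'value' columns
long_df = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'],
                  var_name='measure', value_name='reading')
print(long_df.columns.tolist())   # ['A', 'measure', 'reading']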

In [938]:
df = pd.DataFrame({
                    'A': {0: 'a', 1: 'b', 2: 'c'}
                  , 'B': {0: 1, 1: 3, 2: 5}
                  , 'C': {0: 2, 1: 4, 2: 6}
                  })
print(df)
pd.melt(df, id_vars=['A'], value_vars=['B'])
   A  B  C
0  a  1  2
1  b  3  4
2  c  5  6
Out[938]:
A variable value
0 a B 1
1 b B 3
2 c B 5
In [939]:
df.columns = [list('ABC'), list('DEF')]
print(df)
pd.melt(
      df
    , id_vars=[('A','D')]
    , value_vars=[('B','E'),('C','F')]
)
   A  B  C
   D  E  F
0  a  1  2
1  b  3  4
2  c  5  6
Out[939]:
(A, D) variable_0 variable_1 value
0 a B E 1
1 b B E 3
2 c B E 5
3 a C F 2
4 b C F 4
5 c C F 6

pd.pivot_table()

  • df.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')
  • Creates a pivot table from a long-format table
  • Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame.

    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
    If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is used in the same manner as column values.
    columns : column, Grouper, array, or list of the previous
    If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is used in the same manner as column values.
    aggfunc : function, list of functions, dict, default numpy.mean
    If a list of functions is passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves). If a dict is passed, the key is the column to aggregate and the value is a function or list of functions.
    fill_value : scalar, default None
    Value to replace missing values with
    margins : boolean, default False
    Add all row / columns (e.g. for subtotal / grand totals)
    dropna : boolean, default True
    Do not include columns whose entries are all NaN
    margins_name : string, default ‘All’
    Name of the row / column that will contain the totals when margins is True.
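A short sketch (not an original cell) of the margins-related options, which the example below does not exercise; the sales frame is invented for illustration:

import numpy as np
import pandas as pd

sales = pd.DataFrame({'region': ['N', 'N', 'S', 'S'],
                      'product': ['x', 'y', 'x', 'y'],
                      'amount': [10, 20, 30, 40]})
# margins=True adds a totals row/column named via margins_name
table = pd.pivot_table(sales, values='amount', index='region',
                       columns='product', aggfunc=np.sum,
                       margins=True, margins_name='Total')
print(table)
# product   x   y  Total
# region
# N        10  20     30
# S        30  40     70
# Total    40  60    100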

In [940]:
df = pd.DataFrame({
                    'A': {0: 'a', 1: 'b', 2: 'c'}
                  , 'B': {0: 1, 1: 3, 2: 5}
                  , 'C': {0: 2, 1: 4, 2: np.NaN}
                  })
print(df)
print('-'*50)
df.columns = [list('ABC'), list('DEF')]
print(df)
print('-'*50)
df_m=pd.melt(
      df
    , id_vars=[('A','D')]
    , value_vars=[('B','E'),('C','F')]
)
df_m.columns = list('ABCD')
print(df_m)
print('-'*50)
table = pd.pivot_table(
      df_m
    , values='D'
    , index=['B','C']
    , columns=['A']
    , aggfunc=np.mean
    , fill_value=100
)
table
   A  B    C
0  a  1  2.0
1  b  3  4.0
2  c  5  NaN
--------------------------------------------------
   A  B    C
   D  E    F
0  a  1  2.0
1  b  3  4.0
2  c  5  NaN
--------------------------------------------------
   A  B  C    D
0  a  B  E  1.0
1  b  B  E  3.0
2  c  B  E  5.0
3  a  C  F  2.0
4  b  C  F  4.0
5  c  C  F  NaN
--------------------------------------------------
Out[940]:
A a b c
B C
B E 1 3 5
C F 2 4 100

pd.crosstab()

  • pandas.crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)
  • Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed
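A minimal sketch (not an original cell) of the normalize argument, which is listed in the signature but not demonstrated in the cells below:

import numpy as np
import pandas as pd

a = np.array(["foo", "foo", "bar", "bar", "foo"], dtype=object)
b = np.array(["one", "two", "one", "one", "two"], dtype=object)
# normalize='index' converts the counts into row proportions
print(pd.crosstab(a, b, rownames=['a'], colnames=['b'], normalize='index'))
#           one       two
# a                      
# bar  1.000000  0.000000
# foo  0.333333  0.666667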
In [941]:
a = np.array(["foo", "foo", "foo", "foo", "nam", "bar", "bar", "nam", "foo", "foo", "foo"], dtype=object)
b = np.array(["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], dtype=object)
c = np.array(["dul", "dul", "shi", "dul", "jas", "shi", "shi", "cas", "shi", "shi", "shi"], dtype=object)
d = np.array([100,1,1,1,1,1,1,1,1,1,1], dtype=int)
In [942]:
pd.crosstab(
      a
    , [b, c]
    , values=d
    , rownames=['a']
    , colnames=['Jasper', 'Casper']
    , aggfunc=np.sum
    , dropna=True
    , margins=True
    , margins_name='Total'
)
Out[942]:
Jasper    one             two            Total
Casper    dul  jas  shi   cas  dul  shi       
a                                             
bar       NaN  NaN  2.0   NaN  NaN  NaN      2
foo     101.0  NaN  2.0   NaN  1.0  2.0    106
nam       NaN  1.0  NaN   1.0  NaN  NaN      2
Total   101.0  1.0  4.0   1.0  1.0  2.0    110
In [943]:
# swapping the positions of the index and the columns (and of their names) is equivalent to transposing
# here a count is computed; counting frequencies is the more common use case
pd.crosstab(
      [b, c]
    , a
    , rownames=['Jasper', 'Casper']
    , colnames=['a']
)
Out[943]:
a              bar  foo  nam
Jasper Casper               
one    dul       0    2    0
       jas       0    0    1
       shi       2    2    0
two    cas       0    0    1
       dul       0    1    0
       shi       0    2    0

pd.cut()

  • pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')
  • Converts numeric values into intervals (bins)
  • Returns a pandas.Categorical, Series, or ndarray
  • Typical workflow: bin with pd.cut(), count frequencies with Series.value_counts(), then sort with Series.sort_values()
  • Bin values into discrete intervals.
  • Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. For example, cut could convert ages to groups of age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins.
  • Returns: pandas.Categorical, Series, or ndarray
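A brief sketch (not an original cell) of binning with explicit edges and right=False, which the cells below do not cover; the ages and labels are made up:

import numpy as np
import pandas as pd

ages = np.array([3, 17, 25, 42, 70])
# explicit bin edges; right=False makes the bins left-closed: [0, 18), [18, 65), [65, 100)
groups = pd.cut(ages, bins=[0, 18, 65, 100], right=False,
                labels=['child', 'adult', 'senior'])
print(list(groups))   # ['child', 'child', 'adult', 'adult', 'senior']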
In [944]:
print(type(pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)))
print(type(pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)))
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
<class 'pandas.core.arrays.categorical.Categorical'>
<class 'tuple'>
Out[944]:
([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))
In [945]:
print(pd.cut(np.array([1, 7, 5, 4, 6, 3, 9, 9]), 3, labels=["bad", "medium", "good"]))
pd.cut(np.array([1, 7, 5, 4, 6, 3, 9, 9]), 3, labels=["bad", "medium", "good"]).value_counts().sort_values(ascending=False)
[bad, good, medium, medium, medium, bad, good, good]
Categories (3, object): [bad < medium < good]
Out[945]:
good      3
medium    3
bad       2
dtype: int64
In [946]:
s = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])
print(s)
print(pd.cut(s, 3))
pd.cut(s, 3).value_counts()
a     2
b     4
c     6
d     8
e    10
dtype: int64
a    (1.992, 4.667]
b    (1.992, 4.667]
c    (4.667, 7.333]
d     (7.333, 10.0]
e     (7.333, 10.0]
dtype: category
Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, 7.333] < (7.333, 10.0]]
Out[946]:
(7.333, 10.0]     2
(1.992, 4.667]    2
(4.667, 7.333]    1
dtype: int64

pd.qcut()

  • pandas.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
  • Keeps the number of values in each bin (approximately) equal
  • Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example 1000 values for 10 quantiles would produce a Categorical object indicating quantile membership for each data point.
  • Comparing pd.cut() and pd.qcut() below makes the difference between the two obvious
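A small sketch (not an original cell) showing that q may also be a list of quantile edges and that labels can name the buckets; the scores are invented:

import pandas as pd

scores = pd.Series([55, 60, 65, 70, 75, 80, 85, 90, 95, 100])
# q as explicit quantile edges, with labels naming the resulting buckets
buckets = pd.qcut(scores, q=[0, .25, .5, .75, 1.], labels=['Q1', 'Q2', 'Q3', 'Q4'])
print(buckets.value_counts(sort=False))   # Q1 3, Q2 2, Q3 2, Q4 3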
In [947]:
pd.qcut(np.random.randn(1000000), 5).value_counts() # each bin contains the same number of values
Out[947]:
(-4.53, -0.844]     200000
(-0.844, -0.256]    200000
(-0.256, 0.251]     200000
(0.251, 0.84]       200000
(0.84, 4.777]       200000
dtype: int64
In [948]:
pd.cut(np.random.randn(1000000), 5).value_counts() # equal-width bins, so the counts follow the normal distribution of the data
Out[948]:
(-4.81, -2.902]       1830
(-2.902, -1.004]    156230
(-1.004, 0.894]     656875
(0.894, 2.793]      182473
(2.793, 4.691]        2592
dtype: int64

pd.merge()

  • pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
  • Merge DataFrame objects by performing a database-style join operation by columns or indexes.
  • If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on.
  • Options for how: inner, outer, left, right
  • Equivalent to a SQL join
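A short sketch (not an original cell) of the suffixes and indicator arguments from the signature above, which the cells below do not use; the frames are made up:

import pandas as pd

left = pd.DataFrame({'key': ['a', 'b', 'c'], 'val': [1, 2, 3]})
right = pd.DataFrame({'key': ['a', 'b', 'd'], 'val': [10, 20, 40]})
# overlapping column names get the suffixes; indicator=True adds a '_merge' column
out = pd.merge(left, right, on='key', how='outer',
               suffixes=('_l', '_r'), indicator=True)
print(out)
#   key  val_l  val_r      _merge
# 0   a    1.0   10.0        both
# 1   b    2.0   20.0        both
# 2   c    3.0    NaN   left_only
# 3   d    NaN   40.0  right_only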
In [949]:
df1=pd.DataFrame({'name':['kate','herz','catherine','sally'], 'age':[25,28,39,35]}) 
df2=pd.DataFrame({'name':['kate','herz','sally'], 'score':[70,60,90]}) 
print(df1)
print('-'*50)
print(df2)
print('-'*50)
print(pd.merge(df1,df2))
print('-'*50)
print(pd.merge(df1,df2, how='outer'))
        name  age
0       kate   25
1       herz   28
2  catherine   39
3      sally   35
--------------------------------------------------
    name  score
0   kate     70
1   herz     60
2  sally     90
--------------------------------------------------
    name  age  score
0   kate   25     70
1   herz   28     60
2  sally   35     90
--------------------------------------------------
        name  age  score
0       kate   25   70.0
1       herz   28   60.0
2  catherine   39    NaN
3      sally   35   90.0
In [950]:
df1.merge(df2)
Out[950]:
name age score
0 kate 25 70
1 herz 28 60
2 sally 35 90
In [951]:
df1=pd.DataFrame({'name1':['kate','herz','catherine','sally'], 'age':[25,28,39,35]}) 
df2=pd.DataFrame({'name2':['kate','herz','sally'], 'score':[70,60,90]}) 
print(df1.merge(df2, left_on='name1', right_on='name2', how='outer'))
df1.merge(df2, left_on='name1', right_on='name2', how='outer').drop('name2',axis=1).fillna(0)
       name1  age  name2  score
0       kate   25   kate   70.0
1       herz   28   herz   60.0
2  catherine   39    NaN    NaN
3      sally   35  sally   90.0
Out[951]:
name1 age score
0 kate 25 70.0
1 herz 28 60.0
2 catherine 39 0.0
3 sally 35 90.0

pd.concat()

  • pandas.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=None, copy=True)
  • Simple concatenation of data along a given axis
  • concat does not drop duplicate rows; use the drop_duplicates method afterwards if deduplication is needed (see the sketch after this list)
  • Concatenate pandas objects along a particular axis with optional set logic along the other axes.
  • Can also add a layer of hierarchical indexing on the concatenation axis, which may be useful if the labels are the same (or overlapping) on the passed axis number.
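A tiny sketch (not an original cell) of the concat-then-drop_duplicates point above; df2 is altered here so that one row actually repeats:

import pandas as pd

df1 = pd.DataFrame([['a', 1], ['b', 2], ['c', 5]], columns=['letter', 'number'])
df2 = pd.DataFrame([['c', 5], ['d', 4]], columns=['letter', 'number'])
stacked = pd.concat([df1, df2], ignore_index=True)   # keeps the duplicate ('c', 5) row
deduped = stacked.drop_duplicates()                  # removes it afterwards
print(len(stacked), len(deduped))   # 5 4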
In [952]:
df1 = pd.DataFrame([['a', 1], ['b', 2], ['c', 5]], columns=['letter', 'number'])
print(df1)
print('-'*50)
df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])
print(df2)
print('-'*50)
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], columns=['letter', 'number', 'animal'])
print(df3)
print('-'*50)
print(pd.concat([df1, df2]))
print('-'*50)
print(pd.concat([df1, df3], sort=False, keys=['oneblock', 'twoblock'], names=['block', 'number']))
print('-'*50)
print(pd.concat([df1, df3], sort=False, ignore_index=True)) # discard the original index and build a new one
  letter  number
0      a       1
1      b       2
2      c       5
--------------------------------------------------
  letter  number
0      c       3
1      d       4
--------------------------------------------------
  letter  number animal
0      c       3    cat
1      d       4    dog
--------------------------------------------------
  letter  number
0      a       1
1      b       2
2      c       5
0      c       3
1      d       4
--------------------------------------------------
                letter  number animal
block    number                      
oneblock 0           a       1    NaN
         1           b       2    NaN
         2           c       5    NaN
twoblock 0           c       3    cat
         1           d       4    dog
--------------------------------------------------
  letter  number animal
0      a       1    NaN
1      b       2    NaN
2      c       5    NaN
3      c       3    cat
4      d       4    dog
In [953]:
print(pd.concat([df1, df2], axis=1, join='inner'))
print('-'*50)
print(pd.concat([df1, df2], axis=1, join='outer'))
  letter  number letter  number
0      a       1      c       3
1      b       2      d       4
--------------------------------------------------
  letter  number letter  number
0      a       1      c     3.0
1      b       2      d     4.0
2      c       5    NaN     NaN

DataFrame.join()

  • DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
  • join merges on the index by default, and the default join type is a left join (how='left')
  • Join columns with other DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list.
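A minimal sketch (not an original cell) of the lsuffix/rsuffix arguments, which are needed when both frames share a column name; the frames are invented:

import pandas as pd

df1 = pd.DataFrame({'value': [1, 2, 3]}, index=list('abc'))
df2 = pd.DataFrame({'value': [10, 20]}, index=list('ac'))
# both frames have a 'value' column, so suffixes are required
print(df1.join(df2, lsuffix='_left', rsuffix='_right'))
#    value_left  value_right
# a           1         10.0
# b           2          NaN
# c           3         20.0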
In [954]:
df1=pd.DataFrame({'Red':[1,3,5],'Green':[5,0,3]},index=list('abd'))
df2=pd.DataFrame({'Blue':[1,9],'Yellow':[6,6]},index=list('ce'))
df3=pd.DataFrame({'Brown':[3,4,5],'White':[1,1,2]},index=list('aed'))
print(df1.join([df2,df3]))  # the default is a left join
#df1.join([df2,df3], sort=False, how='outer') # A future version of pandas will change to not sort by default.
df1.join(df2, sort=False, how='outer').join(df3, sort=False, how='outer')
   Red  Green  Blue  Yellow  Brown  White
a    1      5   NaN     NaN    3.0    1.0
b    3      0   NaN     NaN    NaN    NaN
d    5      3   NaN     NaN    5.0    2.0
Out[954]:
Red Green Blue Yellow Brown White
a 1.0 5.0 NaN NaN 3.0 1.0
b 3.0 0.0 NaN NaN NaN NaN
c NaN NaN 1.0 6.0 NaN NaN
d 5.0 3.0 NaN NaN 5.0 2.0
e NaN NaN 9.0 6.0 4.0 1.0
In [955]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                    'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'],
                      'D': ['D0', 'D1']},
                      index=['K0', 'K1'])
right2 = pd.DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2'])
print(left)
print(right)
print(right2)
print(left.join(right, on='key').join(right2, on='key'))
result = left.join([right, right2], how='outer')
result
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K0
3  A3  B3  K1
     C   D
K0  C0  D0
K1  C1  D1
    v
K1  7
K1  8
K2  9
    A   B key   C   D    v
0  A0  B0  K0  C0  D0  NaN
1  A1  B1  K1  C1  D1  7.0
1  A1  B1  K1  C1  D1  8.0
2  A2  B2  K0  C0  D0  NaN
3  A3  B3  K1  C1  D1  7.0
3  A3  B3  K1  C1  D1  8.0
Out[955]:
A B key C D v
0 A0 B0 K0 NaN NaN NaN
1 A1 B1 K1 NaN NaN NaN
2 A2 B2 K0 NaN NaN NaN
3 A3 B3 K1 NaN NaN NaN
K0 NaN NaN NaN C0 D0 NaN
K1 NaN NaN NaN C1 D1 7.0
K1 NaN NaN NaN C1 D1 8.0
K2 NaN NaN NaN NaN NaN 9.0

pd.get_dummies()

  • pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
  • Convert categorical variable into dummy/indicator variables
  • One-hot encodes (vectorizes) categorical variables
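A short sketch (not an original cell) of the prefix_sep and drop_first arguments, which the cells below leave at their defaults; the series is made up and the 0/1 output matches the older pandas used in this notebook:

import pandas as pd

s = pd.Series(['b', 'b', 'a', 'c'])
# drop_first=True drops the first level to avoid a redundant (collinear) column
print(pd.get_dummies(s, prefix='key', prefix_sep='-', drop_first=True))
#    key-b  key-c
# 0      1      0
# 1      1      0
# 2      0      0
# 3      0      1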
In [956]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'], 'data':np.random.randn(6)})
print(df)
print(pd.get_dummies(df['key']))
  key      data
0   b -0.194219
1   b  0.591529
2   a  0.435119
3   c  0.345278
4   a  0.324429
5   b -0.866458
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
In [957]:
dummies = pd.get_dummies(df['key'],prefix = 'key')
dummies
Out[957]:
key_a key_b key_c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [958]:
df_with_dummy = df[['data']].join(dummies) # join the dummies back to the numeric column of the original df
print(df_with_dummy)
       data  key_a  key_b  key_c
0 -0.194219      0      1      0
1  0.591529      0      1      0
2  0.435119      1      0      0
3  0.345278      0      0      1
4  0.324429      1      0      0
5 -0.866458      0      1      0
In [959]:
s = ['a', 'b', np.nan, 'a']
print(s)
print('-'*50)
print(pd.get_dummies(s))
print('-'*50)
print(pd.get_dummies(s, dummy_na=True)) # add a column that flags NaN
['a', 'b', nan, 'a']
--------------------------------------------------
   a  b
0  1  0
1  0  1
2  0  0
3  1  0
--------------------------------------------------
   a  b  NaN
0  1  0    0
1  0  1    0
2  0  0    1
3  1  0    0

pandas.DataFrame.reorder_levels

In [960]:
index = pd.MultiIndex.from_tuples([('bird', 'falcon'),  # works nicely together with zip()
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
                   ( 24.0, 'fly'),
                   ( 80.5, 'run'),
                   (np.nan, 'jump')],
                  index=index,
                  columns=columns)
print(df)
print('-'*30)
order=[1,0] # passing 3 elements raises: Too many levels: Index has only 2 levels, not 3
print(df.reorder_levels(order)) # swaps the class and name levels of the row index
print('-'*30)
print(df.reorder_levels(order, axis=1)) # swaps the upper and lower levels of the column labels
               speed species
                 max    type
class  name                 
bird   falcon  389.0     fly
       parrot   24.0     fly
mammal lion     80.5     run
       monkey    NaN    jump
------------------------------
               speed species
                 max    type
name   class                
falcon bird    389.0     fly
parrot bird     24.0     fly
lion   mammal   80.5     run
monkey mammal    NaN    jump
------------------------------
                 max    type
               speed species
class  name                 
bird   falcon  389.0     fly
       parrot   24.0     fly
mammal lion     80.5     run
       monkey    NaN    jump

pandas.DataFrame.sort_values

In [961]:
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'A'],
    'col2': [3, 1, 9, 8, 7, 2],
    'col3': [0, 1, 9, 4, 2, 3],
})
print(df)
print('-'*30)
print(df.sort_values(by=['col1']))
print('-'*30)
print(df.sort_values(by=['col1', 'col2']))
print('-'*30)
print(df.sort_values(by='col1', ascending=False)) # NaN always sorts to the end
print('-'*30)
df.sort_values(by='col1', ascending=False, na_position='first') # na_position='first' moves NaN to the front
  col1  col2  col3
0    A     3     0
1    A     1     1
2    B     9     9
3  NaN     8     4
4    D     7     2
5    A     2     3
------------------------------
  col1  col2  col3
0    A     3     0
1    A     1     1
5    A     2     3
2    B     9     9
4    D     7     2
3  NaN     8     4
------------------------------
  col1  col2  col3
1    A     1     1
5    A     2     3
0    A     3     0
2    B     9     9
4    D     7     2
3  NaN     8     4
------------------------------
  col1  col2  col3
4    D     7     2
2    B     9     9
0    A     3     0
1    A     1     1
5    A     2     3
3  NaN     8     4
------------------------------
Out[961]:
col1 col2 col3
3 NaN 8 4
4 D 7 2
2 B 9 9
0 A 3 0
1 A 1 1
5 A 2 3

pandas.DataFrame.sort_index

In [962]:
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'A'],
    'col2': [3, 1, 9, 8, 7, 2],
    'col3': [0, 1, 9, 4, 2, 3],
})
print(df.sort_values(by='col1', ascending=False, na_position='first'))
df.sort_values(by='col1', ascending=False, na_position='first').sort_index()
  col1  col2  col3
3  NaN     8     4
4    D     7     2
2    B     9     9
0    A     3     0
1    A     1     1
5    A     2     3
Out[962]:
col1 col2 col3
0 A 3 0
1 A 1 1
2 B 9 9
3 NaN 8 4
4 D 7 2
5 A 2 3

pandas.DataFrame.nlargest

  • DataFrame.nlargest(n, columns, keep='first')
  • Return the first n rows ordered by columns in descending order.
  • Return the first n rows with the largest values in columns, in descending order. The columns that are not specified are returned as well, but not used for ordering.
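A small sketch (not an original cell), following the pandas documentation, of how a list of columns is used: the first column ranks the rows and later columns are only consulted to break ties; the frame is invented:

import pandas as pd

df = pd.DataFrame({'c': [2.0, 2.0, 21.0, 5.0], 'a': [10, 5, 1, 10]})
# ranked by 'c'; 'a' only breaks ties where 'c' is equal
print(df.nlargest(3, ['c', 'a']))
#       c   a
# 2  21.0   1
# 3   5.0  10
# 0   2.0  10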
In [963]:
df = pd.DataFrame({'a': [1, 10, 8, 10, -1, 5, 7],
                   'b': list('abdcefg'),
                   'c': [21.0, 2.0, np.nan, 5.0, 4.0, 2, 1]})
print(df)
print('-'*30)
df['rows_total']=df.apply(lambda x: x['a']+x['c'], axis=1)
print(df.sort_values(by='rows_total', ascending=False))
df.nlargest(5, ['c', 'a'], keep='first')  # with a list of columns, rows are ranked by the first column; later columns are only used to break ties
    a  b     c
0   1  a  21.0
1  10  b   2.0
2   8  d   NaN
3  10  c   5.0
4  -1  e   4.0
5   5  f   2.0
6   7  g   1.0
------------------------------
    a  b     c  rows_total
0   1  a  21.0        22.0
3  10  c   5.0        15.0
1  10  b   2.0        12.0
6   7  g   1.0         8.0
5   5  f   2.0         7.0
4  -1  e   4.0         3.0
2   8  d   NaN         NaN
Out[963]:
a b c rows_total
1 10 b 2.0 12.0
5 5 f 2.0 7.0
0 1 a 21.0 22.0
3 10 c 5.0 15.0
4 -1 e 4.0 3.0

pandas.DataFrame.nsmallest

In [964]:
df = pd.DataFrame({'a': [1, 10, 8, 10, -1, 5, 7],
                   'b': list('abdcefg'),
                   'c': [21.0, 2.0, np.nan, 5.0, 4.0, 2, 1]})
print(df)
print('-'*30)
df['rows_total']=df.apply(lambda x: x['a']+x['c'], axis=1)
print(df.sort_values(by='rows_total', ascending=True))
df.nsmallest(5, ['c', 'a'], keep='first')
    a  b     c
0   1  a  21.0
1  10  b   2.0
2   8  d   NaN
3  10  c   5.0
4  -1  e   4.0
5   5  f   2.0
6   7  g   1.0
------------------------------
    a  b     c  rows_total
4  -1  e   4.0         3.0
5   5  f   2.0         7.0
6   7  g   1.0         8.0
1  10  b   2.0        12.0
3  10  c   5.0        15.0
0   1  a  21.0        22.0
2   8  d   NaN         NaN
Out[964]:
a b c rows_total
6 7 g 1.0 8.0
4 -1 e 4.0 3.0
3 10 c 5.0 15.0
5 5 f 2.0 7.0
1 10 b 2.0 12.0

pandas.DataFrame.swaplevel

  • i, j : int, string (can be mixed)
    • Level of index to be swapped. Can pass level name as string.
In [965]:
index = pd.MultiIndex.from_tuples([('bird', 'falcon'),  # works nicely together with zip()
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
                   ( 24.0, 'fly'),
                   ( 80.5, 'run'),
                   (np.nan, 'jump')],
                  index=index,
                  columns=columns)
print(df)
print('-'*30)
print(df.swaplevel(i=1, j=0, axis=1))
df.swaplevel(i='name', j='class', axis=0)
               speed species
                 max    type
class  name                 
bird   falcon  389.0     fly
       parrot   24.0     fly
mammal lion     80.5     run
       monkey    NaN    jump
------------------------------
                 max    type
               speed species
class  name                 
bird   falcon  389.0     fly
       parrot   24.0     fly
mammal lion     80.5     run
       monkey    NaN    jump
Out[965]:
               speed species
                 max    type
name   class                
falcon bird    389.0     fly
parrot bird     24.0     fly
lion   mammal   80.5     run
monkey mammal    NaN    jump

pandas.DataFrame.stack

In [966]:
df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
                                    index=['cat', 'dog'],
                                    columns=['weight', 'height'])
print(df_single_level_cols)
print('-'*30)
print(df_single_level_cols.stack().index)
df_single_level_cols.stack()  # 列轉行(索引)
     weight  height
cat       0       1
dog       2       3
------------------------------
MultiIndex(levels=[['cat', 'dog'], ['weight', 'height']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
Out[966]:
cat  weight    0
     height    1
dog  weight    2
     height    3
dtype: int64
In [967]:
multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
                                       ('height', 'm')])
df_multi_level_cols2 = pd.DataFrame([[np.NaN, 2.0], [3.0, 4.0]],
                                    index=['cat', 'dog'],
                                    columns=multicol2)
print(df_multi_level_cols2)
print('-'*30)
print(df_multi_level_cols2.stack().index)
print('-'*30)
print(df_multi_level_cols2.stack(dropna=False)) # keep NaN rows; they are dropped by default
df_multi_level_cols2.stack(dropna=True)
    weight height
        kg      m
cat    NaN    2.0
dog    3.0    4.0
------------------------------
MultiIndex(levels=[['cat', 'dog'], ['kg', 'm']],
           labels=[[0, 1, 1], [1, 0, 1]])
------------------------------
        height  weight
cat kg     NaN     NaN
    m      2.0     NaN
dog kg     NaN     3.0
    m      4.0     NaN
Out[967]:
        height  weight
cat m      2.0     NaN
dog kg     NaN     3.0
    m      4.0     NaN

pandas.DataFrame.unstack

In [968]:
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.unstack(level=0))
print('-'*30)
print(df.unstack(level=1))
print('-'*30)
print(df.unstack(level=-1)) # -1 means the last level, which here is level 1
        x   y
one a  27  61
    b  25  34
two a  86  92
    b   6   7
------------------------------
    x       y    
  one two one two
a  27  86  61  92
b  25   6  34   7
------------------------------
      x       y    
      a   b   a   b
one  27  25  61  34
two  86   6  92   7
------------------------------
      x       y    
      a   b   a   b
one  27  25  61  34
two  86   6  92   7

pandas.DataFrame.swapaxes

In [969]:
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.swapaxes(1,0)) # swap the row and column axes
        x   y
one a  35  57
    b  53   3
two a  39   6
    b  90  27
------------------------------
  one     two    
    a   b   a   b
x  35  53  39  90
y  57   3   6  27
In [970]:
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.swapaxes(1,0))
     weight  height
cat       0       1
dog       2       3
------------------------------
        cat  dog
weight    0    2
height    1    3

pandas.DataFrame.T

In [971]:
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.T) # swap the row and column axes
        x   y
one a  44  25
    b  15  73
two a  93  90
    b  59  19
------------------------------
  one     two    
    a   b   a   b
x  44  15  93  59
y  25  73  90  19
In [972]:
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.T)
     weight  height
cat       0       1
dog       2       3
------------------------------
        cat  dog
weight    0    2
height    1    3

pandas.DataFrame.transpose

In [973]:
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.transpose())
     weight  height
cat       0       1
dog       2       3
------------------------------
        cat  dog
weight    0    2
height    1    3
In [974]:
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.transpose()) # swap the row and column axes
        x   y
one a   7  89
    b  37  15
two a  71  86
    b  43  16
------------------------------
  one     two    
    a   b   a   b
x   7  37  71  43
y  89  15  86  16

pandas.DataFrame.append

In [975]:
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
print(df1)
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
print(df2)
df1.append(df2, sort=False, ignore_index=True)
   A  B
0  1  2
1  3  4
   A  B
0  5  6
1  7  8
Out[975]:
A B
0 1 2
1 3 4
2 5 6
3 7 8

pandas.DataFrame.assign

In [976]:
df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
print(df)
print('-'*30)
print(df.assign(total = lambda x:x['A']+ x['B'])) # column-wise arithmetic; for row-wise operations, shift() first, or transpose, compute, then transpose back
print('-'*30)
print(df.assign(total = lambda x:np.sum([x.B]))) # useful for computing each value's share of a column total (composition percentages)
print('-'*30)
print(df.assign(ln_A = lambda x: np.log(x.A)))
    A         B
0   1  1.591474
1   2 -0.057835
2   3  1.953241
3   4 -1.062957
4   5  1.014981
5   6 -0.767510
6   7  0.773790
7   8 -0.311985
8   9  0.315448
9  10  0.604212
------------------------------
    A         B      total
0   1  1.591474   2.591474
1   2 -0.057835   1.942165
2   3  1.953241   4.953241
3   4 -1.062957   2.937043
4   5  1.014981   6.014981
5   6 -0.767510   5.232490
6   7  0.773790   7.773790
7   8 -0.311985   7.688015
8   9  0.315448   9.315448
9  10  0.604212  10.604212
------------------------------
    A         B     total
0   1  1.591474  4.052859
1   2 -0.057835  4.052859
2   3  1.953241  4.052859
3   4 -1.062957  4.052859
4   5  1.014981  4.052859
5   6 -0.767510  4.052859
6   7  0.773790  4.052859
7   8 -0.311985  4.052859
8   9  0.315448  4.052859
9  10  0.604212  4.052859
------------------------------
    A         B      ln_A
0   1  1.591474  0.000000
1   2 -0.057835  0.693147
2   3  1.953241  1.098612
3   4 -1.062957  1.386294
4   5  1.014981  1.609438
5   6 -0.767510  1.791759
6   7  0.773790  1.945910
7   8 -0.311985  2.079442
8   9  0.315448  2.197225
9  10  0.604212  2.302585

pandas.DataFrame.update

In [977]:
df = pd.DataFrame({'A': [1, 2, 3],
                   'B': [400, 500, 600]})
new_df = pd.DataFrame({'B': [4, 5, 6],
                       'C': [7, 8, 9]})
print(df)
print(new_df)
df.update(new_df)
df
   A    B
0  1  400
1  2  500
2  3  600
   B  C
0  4  7
1  5  8
2  6  9
Out[977]:
A B
0 1 4
1 2 5
2 3 6
In [978]:
df = pd.DataFrame({'A': [1, 2, 3],
                   'B': [400, 500, 600]})
new_df = pd.DataFrame({'B': [4, np.NAN, 6],
                       'C': [7, 8, 9]})
print(df)
print(new_df)
df.update(new_df) # where new_df contains NaN, the original value is kept
df
   A    B
0  1  400
1  2  500
2  3  600
     B  C
0  4.0  7
1  NaN  8
2  6.0  9
Out[978]:
A B
0 1 4.0
1 2 500.0
2 3 6.0

Missing data handling

pandas.isnull(obj)

  • Detect missing values for an array-like object.
  • This function takes a scalar or array-like object and indicates whether values are missing (NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike).
In [979]:
print(pd.isnull('dog'))
print(pd.isnull(np.nan))
print(pd.isnull(None))
False
True
True
In [980]:
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.isnull(df)
     0     1    2
0  ant   bee  NaN
1  dog  None  fly
Out[980]:
0 1 2
0 False False True
1 False True False

pandas.isna(obj)

  • Detect missing values for an array-like object.
  • This function takes a scalar or array-like object and indicates whether values are missing (NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike).
In [981]:
print(pd.isna('dog'))
print(pd.isna(np.nan))
print(pd.isna(None))
False
True
True
In [982]:
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.isna(df)
     0     1    2
0  ant   bee  NaN
1  dog  None  fly
Out[982]:
0 1 2
0 False False True
1 False True False

pandas.notnull(obj)

  • Detect non-missing values for an array-like object.
  • This function takes a scalar or array-like object and indicates whether values are valid (not missing, which is NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike).
In [983]:
print(pd.notnull('dog'))
print(pd.notnull(np.nan))
print(pd.notnull(None))
True
False
False
In [984]:
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.notnull(df)
     0     1    2
0  ant   bee  NaN
1  dog  None  fly
Out[984]:
0 1 2
0 True True False
1 True False True

pandas.notna(obj)

  • Detect non-missing values for an array-like object.
  • This function takes a scalar or array-like object and indicates whether values are valid (not missing, which is NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike).
In [985]:
print(pd.notna('dog'))
print(pd.notna(np.nan))
print(pd.notna(None))
True
False
False
In [986]:
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.notna(df)
     0     1    2
0  ant   bee  NaN
1  dog  None  fly
Out[986]:
0 1 2
0 True True False
1 True False True

pandas.DataFrame.dropna

In [987]:
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT]
                  })
print(df)
print('~-'*25)
print(df.dropna())
print('~-'*25)
# Drop the columns where at least one element is missing.
print(df.dropna(axis='columns'))
print('~-'*25)
# Drop the rows where all elements are missing.
print(df.dropna(how='all'))
print('~-'*25)
# Keep only the rows with at least 2 non-NA values.
print('thresh=2\n',df.dropna(thresh=2)) # keep rows with at least 2 non-NA values
print('~-'*25)
# Define in which columns to look for missing values.
print(df.dropna(subset=['name', 'toy']))
print('~-'*25)
# Keep the DataFrame with valid entries in the same variable.
df.dropna(inplace=True) # inplace=True modifies df itself
df
       name        toy       born
0    Alfred        NaN        NaT
1    Batman  Batmobile 1940-04-25
2  Catwoman   Bullwhip        NaT
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
     name        toy       born
1  Batman  Batmobile 1940-04-25
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
       name
0    Alfred
1    Batman
2  Catwoman
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
       name        toy       born
0    Alfred        NaN        NaT
1    Batman  Batmobile 1940-04-25
2  Catwoman   Bullwhip        NaT
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
thresh=2
        name        toy       born
1    Batman  Batmobile 1940-04-25
2  Catwoman   Bullwhip        NaT
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
       name        toy       born
1    Batman  Batmobile 1940-04-25
2  Catwoman   Bullwhip        NaT
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Out[987]:
name toy born
1 Batman Batmobile 1940-04-25

pandas.DataFrame.fillna

  • method : {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None
  • Method to use for filling holes in reindexed Series. pad / ffill: propagate the last valid observation forward to the next valid one. backfill / bfill: use the NEXT valid observation to fill the gap.
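A tiny sketch (not an original cell) contrasting ffill and bfill on a Series, since the cell below only uses bfill along axis=1:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0, np.nan])
print(s.fillna(method='ffill').tolist())  # [1.0, 1.0, 1.0, 4.0, 4.0]
print(s.fillna(method='bfill').tolist())  # [1.0, 4.0, 4.0, 4.0, nan]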
In [988]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                   columns=list('ABCD'))
print(df)
print('~-'*25)
print(df.fillna(0))
print('~-'*25)
# We can also propagate non-null values forward or backward.
print('use method:\n', df.fillna(method='bfill', axis=1))
print('~-'*25)
#Replace all NaN elements in column ‘A’, ‘B’, ‘C’, and ‘D’, with 0, 1, 2, and 3 respectively.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
print(df.fillna(value=values))
print('~-'*25)
#Only replace the first NaN element.
df.fillna(value=values, limit=1)
     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5
3  NaN  3.0 NaN  4
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
     A    B    C  D
0  0.0  2.0  0.0  0
1  3.0  4.0  0.0  1
2  0.0  0.0  0.0  5
3  0.0  3.0  0.0  4
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
use method:
      A    B    C    D
0  2.0  2.0  0.0  0.0
1  3.0  4.0  1.0  1.0
2  5.0  5.0  5.0  5.0
3  3.0  3.0  4.0  4.0
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
     A    B    C  D
0  0.0  2.0  2.0  0
1  3.0  4.0  2.0  1
2  0.0  1.0  2.0  5
3  0.0  3.0  2.0  4
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Out[988]:
A B C D
0 0.0 2.0 2.0 0
1 3.0 4.0 NaN 1
2 NaN 1.0 NaN 5
3 NaN 3.0 NaN 4

pandas.DataFrame.replace

  • DataFrame.replace(to_replace=None, value=None, inplace=False, limit=None, regex=False, method='pad')
  • Replace values given in to_replace with value.
  • to_replace may also be a regular expression (used with regex=True)
In [989]:
df = pd.DataFrame({'A': [9, 0, 8, 7, 4],
                   'B': [5, 0, 1, 3, 3],
                   'C': ['a', 'b', 'c', 'd', 'e']})
print(df)
print('~-'*25)
print(df.replace(0, 5))
print('~-'*25)
print(df.replace([0, 1, 2, 3], 4)) # replace every 0, 1, 2 or 3 with 4
print(df.replace([0, 1, 2, 3], [4, 3, 2, 1]))
print('~-'*25)
print(df.replace({0: 10, 1: 100}))
print('~-'*25)
#Regular expression `to_replace`
df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
print(df)
print(df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}))
df.replace(to_replace=r'^ba.*$', value='new', regex=True)
   A  B  C
0  9  5  a
1  0  0  b
2  8  1  c
3  7  3  d
4  4  3  e
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A  B  C
0  9  5  a
1  5  5  b
2  8  1  c
3  7  3  d
4  4  3  e
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A  B  C
0  9  5  a
1  4  4  b
2  8  4  c
3  7  4  d
4  4  4  e
   A  B  C
0  9  5  a
1  4  4  b
2  8  3  c
3  7  1  d
4  4  1  e
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
    A    B  C
0   9    5  a
1  10   10  b
2   8  100  c
3   7    3  d
4   4    3  e
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
      A    B
0   bat  abc
1   foo  bar
2  bait  xyz
      A    B
0   new  abc
1   xyz  new
2  bait  xyz
Out[989]:
A B
0 new abc
1 foo new
2 new xyz

Type conversions (Top-level conversions)

pd.to_numeric(arg, errors='raise', downcast=None)

  • Convert argument to a numeric type.
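A small sketch (not an original cell) of the errors and downcast arguments from the signature above; the series is made up:

import pandas as pd

s = pd.Series(['1', '2', 'oops', '4'])
# errors='coerce' turns unparseable entries into NaN instead of raising
print(pd.to_numeric(s, errors='coerce').tolist())   # [1.0, 2.0, nan, 4.0]
# downcast='integer' picks the smallest integer dtype that fits
print(pd.to_numeric(pd.Series(['1', '2', '3']), downcast='integer').dtype)  # int8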
In [990]:
df = pd.DataFrame([['1', '5', np.NaN], [None, '9', '7']])
print(df)
print('-'*50)
print(pd.to_numeric(df[0]))     # to_numeric takes a list or Series (a single column), not a whole DataFrame
print('-'*50)
print(pd.to_numeric(df.T[0].T)) # to_numeric takes a list or Series (a single column), not a whole DataFrame
      0  1    2
0     1  5  NaN
1  None  9    7
--------------------------------------------------
0    1.0
1    NaN
Name: 0, dtype: float64
--------------------------------------------------
0    1.0
1    5.0
2    NaN
Name: 0, dtype: float64

Datetime handling (Top-level dealing with datetimelike)

pandas.to_datetime()

  • pandas.to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=False)
  • Convert argument to datetime.
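A short sketch (not an original cell) of the format and errors arguments, which the cells below do not use:

import pandas as pd

# an explicit format avoids ambiguous day/month parsing
print(pd.to_datetime(['05/11/2018', '06/11/2018'], format='%d/%m/%Y'))
# DatetimeIndex(['2018-11-05', '2018-11-06'], dtype='datetime64[ns]', freq=None)

# errors='coerce' turns unparseable strings into NaT instead of raising
print(pd.to_datetime(['2018-11-06', 'not a date'], errors='coerce'))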
In [991]:
df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],
                   'day': [4, 5]
                  })
pd.to_datetime(df)
Out[991]:
0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]
In [992]:
pd.to_datetime(
      np.arange(8)
    , unit='ns'
    , origin=pd.Timestamp('2018-11-06')
)
# unit : string, default ‘ns’
# unit of the arg (D,s,ms,us,ns) denote the unit, 
# which is an integer or float number. 
# This will be based off the origin. 
# Example, with unit=’ms’ and origin=’unix’ (the default), 
# this would calculate the number of milliseconds to the unix epoch start.
Out[992]:
DatetimeIndex([          '2018-11-06 00:00:00',
               '2018-11-06 00:00:00.000000001',
               '2018-11-06 00:00:00.000000002',
               '2018-11-06 00:00:00.000000003',
               '2018-11-06 00:00:00.000000004',
               '2018-11-06 00:00:00.000000005',
               '2018-11-06 00:00:00.000000006',
               '2018-11-06 00:00:00.000000007'],
              dtype='datetime64[ns]', freq=None)

pandas.to_timedelta

  • pandas.to_timedelta(arg, unit='ns', box=True, errors='raise')
  • Convert argument to timedelta
In [993]:
pd.to_timedelta(np.arange(8), unit='D')
# unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number
Out[993]:
TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days', '5 days',
                '6 days', '7 days'],
               dtype='timedelta64[ns]', freq=None)

pandas.date_range()

  • Return a fixed-frequency DatetimeIndex.

Syntax

pandas.date_range(
          start=None
        , end=None
        , periods=None
        , freq='D'
        , tz=None
        , normalize=False
        , name=None
        , closed=None
        , **kwargs
)

  • This function is mainly used to generate a fixed-frequency time index (DatetimeIndex)
  • Exactly two of start, end and periods must be specified when calling it, otherwise an error is raised.

Key parameters

periods : number of periods to generate; integer or None
freq : date offset; string or DateOffset, default 'D'
normalize : if True, normalize the start and end values to midnight timestamps
name : name of the resulting time index; string or None
closed : relative to the result returned with closed=None, closed='left' additionally drops the right boundary, and closed='right' additionally drops the left boundary
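A brief sketch (not an original cell) of the start+periods form and the closed argument described above (closed is the parameter name in the pandas version this notebook uses; newer versions call it inclusive):

import pandas as pd

# start + periods, no end: five hourly timestamps
print(pd.date_range(start='2018-11-06', periods=5, freq='H'))

# closed='left' drops the right boundary from the default (both ends included) result
print(pd.date_range(start='2018-01-01', end='2018-01-04', closed='left'))
# DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq='D')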

Full parameter reference

start : str or datetime-like, optional
Left bound for generating dates.
end : str or datetime-like, optional
Right bound for generating dates.
periods : integer, optional
Number of periods to generate.
freq : str or DateOffset, default ‘D’ (calendar daily)
Frequency strings can have multiples, e.g. ‘5H’. See here for a list of frequency aliases.
tz : str or tzinfo, optional
Time zone name for returning localized DatetimeIndex, for example ‘Asia/Hong_Kong’. By default, the resulting DatetimeIndex is timezone-naive.
normalize : bool, default False
Normalize start/end dates to midnight before generating date range.
name : str, default None
Name of the resulting DatetimeIndex.
closed : {None, ‘left’, ‘right’}, optional
Make the interval closed with respect to the given frequency to the ‘left’, ‘right’, or both sides (None, the default).
**kwargs :
For compatibility. Has no effect on the result.

freq Offset Aliases

  • A number of string aliases are given to useful common time series frequencies. We will refer to these aliases as offset aliases.
Alias Description
B business day frequency
C custom business day frequency
D calendar day frequency
W weekly frequency
M month end frequency
SM semi-month end frequency (15th and end of month)
BM business month end frequency
CBM custom business month end frequency
MS month start frequency
SMS semi-month start frequency (1st and 15th)
BMS business month start frequency
CBMS custom business month start frequency
Q quarter end frequency
BQ business quarter end frequency
QS quarter start frequency
BQS business quarter start frequency
A, Y year end frequency
BA, BY business year end frequency
AS, YS year start frequency
BAS, BYS business year start frequency
BH business hour frequency
H hourly frequency
T, min minutely frequency
S secondly frequency
L, ms milliseconds
U, us microseconds
N nanoseconds
In [994]:
pd.date_range(start='1/1/2018', end='1/11/2018')
Out[994]:
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10', '2018-01-11'],
              dtype='datetime64[ns]', freq='D')
In [995]:
# freq='W-MON' # weekly intervals, anchored on Mondays
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='W-MON') 
dtindex
Out[995]:
DatetimeIndex(['2018-10-29', '2018-11-05', '2018-11-12', '2018-11-19',
               '2018-11-26'],
              dtype='datetime64[ns]', freq='W-MON')
In [996]:
# freq='W' # weekly intervals; 'W' defaults to W-SUN (here the start, 2018-10-28, happens to be a Sunday)
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='W') 
dtindex
Out[996]:
DatetimeIndex(['2018-10-28', '2018-11-04', '2018-11-11', '2018-11-18',
               '2018-11-25'],
              dtype='datetime64[ns]', freq='W-SUN')
In [997]:
# freq='M' # monthly intervals, the last day of each month
dtindex = pd.date_range(start='1/28/2018', end='11/30/2018', freq='M') 
dtindex
Out[997]:
DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31',
               '2018-09-30', '2018-10-31', '2018-11-30'],
              dtype='datetime64[ns]', freq='M')
In [998]:
# pd.offsets.MonthBegin(3) # every 3 months, the first day of the month
dtindex = pd.date_range(start='1/28/2018', end='11/30/2018', freq=pd.offsets.MonthBegin(3)) 
dtindex
Out[998]:
DatetimeIndex(['2018-02-01', '2018-05-01', '2018-08-01', '2018-11-01'], dtype='datetime64[ns]', freq='3MS')
In [999]:
# freq='SM' # semi-month intervals: the 15th and the end of the month
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='SM') 
dtindex
Out[999]:
DatetimeIndex(['2018-10-31', '2018-11-15', '2018-11-30'], dtype='datetime64[ns]', freq='SM-15')
In [1000]:
# freq='SM-2' # semi-month intervals anchored on day 2: the 2nd and the end of the month
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='SM-2') 
dtindex
Out[1000]:
DatetimeIndex(['2018-10-31', '2018-11-02', '2018-11-30'], dtype='datetime64[ns]', freq='SM-2')
In [1001]:
# freq='Q' # the last day of each quarter
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q') 
dtindex
Out[1001]:
DatetimeIndex(['2018-06-30', '2018-09-30', '2018-12-31'], dtype='datetime64[ns]', freq='Q-DEC')
In [1002]:
# freq='Q-NOV' # quarters end at the last day of November; earlier quarter ends are stepped back from there, i.e. one month earlier than the default Q-DEC
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q-NOV')
dtindex
Out[1002]:
DatetimeIndex(['2018-05-31', '2018-08-31', '2018-11-30'], dtype='datetime64[ns]', freq='Q-NOV')
In [1003]:
# freq='Q-OCT' # quarters end at the last day of October; earlier quarter ends are stepped back from there, i.e. one month earlier still
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q-OCT')
dtindex
Out[1003]:
DatetimeIndex(['2018-04-30', '2018-07-31', '2018-10-31'], dtype='datetime64[ns]', freq='Q-OCT')
In [1004]:
# pd.date_range().to_period()  # shows periods (months) rather than dates
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='M').to_period()
dtindex
Out[1004]:
PeriodIndex(['2018-04', '2018-05', '2018-06', '2018-07', '2018-08', '2018-09',
             '2018-10', '2018-11', '2018-12'],
            dtype='period[M]', freq='M')
In [1005]:
# pd.date_range().to_timestamp() # shows the 1st of each month
# December is missing because date_range(..., freq='M') only yields month ends on or before the end date (12/30),
# so 2018-12-31 is excluded; with no December period there is also no 2018-12-01 after to_timestamp()
dtindex = pd.date_range(start='4/28/2018', end='12/30/2018', freq='M').to_period().to_timestamp()
dtindex
Out[1005]:
DatetimeIndex(['2018-04-01', '2018-05-01', '2018-06-01', '2018-07-01',
               '2018-08-01', '2018-09-01', '2018-10-01', '2018-11-01'],
              dtype='datetime64[ns]', freq='MS')

pandas.period_range()

  • pandas.period_range(start=None, end=None, periods=None, freq='D', name=None)
  • Return a fixed frequency PeriodIndex, with day (calendar) as the default frequency
In [1006]:
pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
Out[1006]:
PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
             '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
             '2018-01'],
            dtype='period[M]', freq='M')

pandas.timedelta_range()

  • pandas.timedelta_range(start=None, end=None, periods=None, freq=None, name=None, closed=None)
  • Return a fixed frequency TimedeltaIndex, with day as the default frequency
In [1007]:
pd.timedelta_range(start='1 day', periods=4)
Out[1007]:
TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D')

区间设置 Top-level dealing with intervals

pandas.interval_range()

  • pandas.interval_range(start=None, end=None, periods=None, freq=None, name=None, closed='right')
  • Return a fixed frequency IntervalIndex
In [1008]:
pd.interval_range(start=0, end=6)
Out[1008]:
IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5], (5, 6]]
              closed='right',
              dtype='interval[int64]')
In [1009]:
pd.interval_range(start=0, end=6, periods=4)
Out[1009]:
IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]]
              closed='right',
              dtype='interval[float64]')
In [1010]:
pd.interval_range(start=pd.Timestamp('2017-01-01'),end=pd.Timestamp('2017-01-04'))
Out[1010]:
IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]]
              closed='right',
              dtype='interval[datetime64[ns]]')
In [1011]:
pd.interval_range(start=0, periods=4, freq=1.5)
Out[1011]:
IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]]
              closed='right',
              dtype='interval[float64]')

Attributes and underlying data

pandas.DataFrame.index

In [1012]:
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.index
     X    Y    Z
a  1.0  2.0  3.0
b  7.0  8.0  NaN
Out[1012]:
Index(['a', 'b'], dtype='object')

pandas.DataFrame.columns

In [1013]:
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.columns
     X    Y    Z
a  1.0  2.0  3.0
b  7.0  8.0  NaN
Out[1013]:
Index(['X', 'Y', 'Z'], dtype='object')

pandas.DataFrame.dtypes

In [1014]:
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.dtypes
     X    Y    Z
a  1.0  2.0  3.0
b  7.0  8.0  NaN
Out[1014]:
X    float64
Y    float64
Z    float64
dtype: object

pandas.DataFrame.ftypes

In [1015]:
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.ftypes
     X    Y    Z
a  1.0  2.0  3.0
b  7.0  8.0  NaN
Out[1015]:
X    float64:dense
Y    float64:dense
Z    float64:dense
dtype: object

pandas.DataFrame.get_dtype_counts

In [1016]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.get_dtype_counts()
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
Out[1016]:
float64    1
int64      1
object     1
dtype: int64
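
get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0; counting the dtypes Series directly gives the same information. A minimal sketch, assuming a pandas version where the method is gone:

import numpy as np
import pandas as pd

df = pd.DataFrame([['1', 2, 3.], ['7', 8, np.nan]], index=['a', 'b'], columns=list('XYZ'))
# df.dtypes is a Series of per-column dtypes; value_counts() tallies them,
# reproducing the old get_dtype_counts() result
print(df.dtypes.value_counts())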

pandas.DataFrame.select_dtypes

  • DataFrame.select_dtypes(include=None, exclude=None)
In [1017]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.select_dtypes(include='float'))
print('-'*50)
print(df.select_dtypes(include='int'))
print('-'*50)
print(df.select_dtypes(include='bool'))
print('-'*50)
print(df.select_dtypes(exclude=['int','bool']))
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
     Z
a  3.0
b  NaN
--------------------------------------------------
   Y
a  2
b  8
--------------------------------------------------
Empty DataFrame
Columns: []
Index: [a, b]
--------------------------------------------------
   X    Z
a  1  3.0
b  7  NaN

pandas.DataFrame.values

  • 矩阵,不包含column和index
In [1018]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.values
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1018]:
array([['1', 2, 3.0],
       ['7', 8, nan]], dtype=object)

pandas.DataFrame.get_values

  • 矩阵,不包含column和index
  • This is the same as .values for non-sparse data.
  • For sparse data contained in a pandas.SparseArray, the data are first converted to a dense representation.
In [1019]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.get_values()
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1019]:
array([['1', 2, 3.0],
       ['7', 8, nan]], dtype=object)
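
get_values() (and plain .values, for that matter) has been superseded by to_numpy(), which was added in pandas 0.24; get_values() itself was removed in 1.0. A minimal sketch assuming pandas >= 0.24:

import numpy as np
import pandas as pd

df = pd.DataFrame([['1', 2, 3.], ['7', 8, np.nan]], index=['a', 'b'], columns=list('XYZ'))
# to_numpy() returns the same object-dtype ndarray as .values / .get_values()
print(df.to_numpy())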

pandas.DataFrame.axes

In [1020]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.axes
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1020]:
[Index(['a', 'b'], dtype='object'), Index(['X', 'Y', 'Z'], dtype='object')]

pandas.DataFrame.ndim

In [1021]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.ndim
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1021]:
2

pandas.DataFrame.size

In [1022]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.size
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1022]:
6

pandas.DataFrame.shape

In [1023]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.shape
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1023]:
(2, 3)

pandas.DataFrame.memory_usage

  • Return the memory usage of each column in bytes.
In [1024]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.memory_usage())
print('-'*50)
print(df.memory_usage(index=False))
print('-'*50)
print(df.memory_usage(deep=True))
print('-'*50)
df.info()
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Index    16
X        16
Y        16
Z        16
dtype: int64
--------------------------------------------------
X    16
Y    16
Z    16
dtype: int64
--------------------------------------------------
Index    132
X        132
Y         16
Z         16
dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, a to b
Data columns (total 3 columns):
X    2 non-null object
Y    2 non-null int64
Z    1 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 64.0+ bytes

pandas.DataFrame.empty

In [1025]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.empty
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
Out[1025]:
False

转换 Conversion

pandas.DataFrame.astype()

  • DataFrame.astype(dtype, copy=True, errors='raise', **kwargs)
  • Cast a pandas object to a specified dtype dtype.
In [1026]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.dtypes)
print('-'*50)
print(df.fillna(0).astype('int64').dtypes)
df.fillna(0).astype('int64')
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
X     object
Y      int64
Z    float64
dtype: object
--------------------------------------------------
X    int64
Y    int64
Z    int64
dtype: object
Out[1026]:
X Y Z
a 1 2 3
b 7 8 0

pandas.DataFrame.copy

  • deep : bool, default True
  • Make a deep copy, including a copy of the data and the indices.
  • With deep=False neither the indices nor the data are copied.
In [1027]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df_c=df.copy(deep=True)
print(df)
print(df_c)
print('-'*50)
df_c.iloc[0,0]=100
print(df)
print(df_c)
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
     X  Y    Z
a  100  2  3.0
b    7  8  NaN
In [1028]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df_c=df.copy(deep=False)
print(df)
print(df_c)
print('-'*50)
df_c.iloc[0,0]=100
print(df)
print(df_c)
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
   X  Y    Z
a  1  2  3.0
b  7  8  NaN
--------------------------------------------------
     X  Y    Z
a  100  2  3.0
b    7  8  NaN
     X  Y    Z
a  100  2  3.0
b    7  8  NaN

索引,迭代 Indexing, iteration

pandas.DataFrame.head

In [1029]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.head(1)
Out[1029]:
X Y Z
a 1 2 3.0

pandas.DataFrame.at

  • 按行列的名称定位元素
In [1030]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.at['b','Y']
Out[1030]:
8

pandas.DataFrame.iat

  • 按行列的索引值定位元素
In [1031]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.iat[1,1]
Out[1031]:
8

pandas.DataFrame.loc

  • Access a group of rows and columns by label(s) or a boolean array.
  • .loc[] is primarily label based, but may also be used with a boolean array.
In [1032]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df.loc['b'])
print('-'*50)
print(df.loc[['b','a']])
print('-'*50)
print(df.loc['a':'b','Z'])
print('-'*50)
print(
    df.loc[
        df['Y']>5,['Z']
          ]
     )
print('-'*50)
print(
    df.loc[
        df['Y']>5
          ]
     )
print('-'*50)
print(
    df.loc[
        ['b','a'],['Y']
          ]
     )
print('-'*50)
print(
    df.loc[
        ['b','a'],'X':'Z'
          ]
     )
print('-'*50)
print(
    df.loc[
        :,'Y':'Z'
          ]
     )
X      7
Y      8
Z    NaN
Name: b, dtype: object
--------------------------------------------------
   X  Y    Z
b  7  8  NaN
a  1  2  3.0
--------------------------------------------------
a    3.0
b    NaN
Name: Z, dtype: float64
--------------------------------------------------
    Z
b NaN
--------------------------------------------------
   X  Y   Z
b  7  8 NaN
--------------------------------------------------
   Y
b  8
a  2
--------------------------------------------------
   X  Y    Z
b  7  8  NaN
a  1  2  3.0
--------------------------------------------------
   Y    Z
a  2  3.0
b  8  NaN

pandas.DataFrame.iloc

In [1033]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df.iloc[1])
print('-'*50)
print(df.iloc[[2,1]])
print('-'*50)
print(df.iloc[0:1,1])
print('-'*50)
print(
    df.iloc[
        [0,1],[2]
          ]
     )
print('-'*50)
print(
    df.iloc[
        :,0:2
          ]
     )
print('-'*50)
print(
    df.iloc[
        [2,1],0:3
          ]
     )
print('-'*50)
print(
    df[df.Z > 5]
     )
print('-'*50)
print(
    df[df.Z > 1].X
     )
X      7
Y      8
Z    NaN
Name: b, dtype: object
--------------------------------------------------
    X    Y    Z
c  10  NaN  9.0
b   7  8.0  NaN
--------------------------------------------------
a    2.0
Name: Y, dtype: float64
--------------------------------------------------
     Z
a  3.0
b  NaN
--------------------------------------------------
    X    Y
a   1  2.0
b   7  8.0
c  10  NaN
--------------------------------------------------
    X    Y    Z
c  10  NaN  9.0
b   7  8.0  NaN
--------------------------------------------------
    X   Y    Z
c  10 NaN  9.0
--------------------------------------------------
a     1
c    10
Name: X, dtype: object

pandas.DataFrame.insert

  • DataFrame.insert(loc, column, value, allow_duplicates=False)
  • Insert column into DataFrame at specified location.
  • Raises a ValueError if column is already contained in the DataFrame, unless allow_duplicates is set to True.
  • 在指定位置插入列
  • 若是插入行,採用在最後位置concat新行
In [1034]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*50)
df.insert(1,'AAA+',['100',np.NaN,300.])
print(df)
pd.concat([df, pd.DataFrame([['1',2,3.,4]], columns=list('XYZJ')) ], ignore_index=True, sort=False)
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
--------------------------------------------------
    X AAA+    Y    Z
a   1  100  2.0  3.0
b   7  NaN  8.0  NaN
c  10  300  NaN  9.0
Out[1034]:
X AAA+ Y Z J
0 1 100 2.0 3.0 NaN
1 7 NaN 8.0 NaN NaN
2 10 300 NaN 9.0 NaN
3 1 NaN 2.0 3.0 4.0
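
The allow_duplicates flag from the signature above can be shown with a small sketch: inserting a column whose name already exists raises ValueError unless allow_duplicates=True.

import pandas as pd

df = pd.DataFrame({'X': [1, 2, 3]})
try:
    df.insert(0, 'X', [7, 8, 9])                      # duplicate column name -> ValueError
except ValueError as err:
    print('refused:', err)
df.insert(0, 'X', [7, 8, 9], allow_duplicates=True)   # now there are two columns named 'X'
print(df)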

pandas.DataFrame.items

  • pandas.DataFrame.iteritems
In [1035]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
for i in df.items():   # 迭代列
    print(i)
    print('-'*30)
('X', a     1
b     7
c    10
Name: X, dtype: object)
------------------------------
('Y', a    2.0
b    8.0
c    NaN
Name: Y, dtype: float64)
------------------------------
('Z', a    3.0
b    NaN
c    9.0
Name: Z, dtype: float64)
------------------------------

pandas.DataFrame.iterrows

In [1036]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
for i in df.iterrows():    # 迭代行
    print(i[0])
    print('-'*30)
    print(i[1])
    print('~'*30)
a
------------------------------
X    1
Y    2
Z    3
Name: a, dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
b
------------------------------
X      7
Y      8
Z    NaN
Name: b, dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
c
------------------------------
X     10
Y    NaN
Z      9
Name: c, dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas.DataFrame.keys

In [1037]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
df.keys()
Out[1037]:
Index(['X', 'Y', 'Z'], dtype='object')

pandas.DataFrame.lookup

  • DataFrame.lookup(row_labels, col_labels)
In [1038]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print(type(df.lookup('c','Z')))
df.lookup('c','Z')
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
<class 'numpy.ndarray'>
Out[1038]:
array([9.])
In [1039]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
result = []
for row, col in zip(df.index,df.columns):
    print(row, col)
    result.append(df.at[row, col])
result
a X
b Y
c Z
Out[1039]:
['1', 8.0, 9.0]
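
DataFrame.lookup() was later deprecated (pandas 1.2) and removed (2.0); the loop above is one replacement, and a vectorized equivalent translates the labels to positions and fancy-indexes the underlying array. A minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame([['1', 2, 3.], ['7', 8, np.nan], ['10', np.nan, 9]],
                  index=['a', 'b', 'c'], columns=list('XYZ'))
rows, cols = ['a', 'b', 'c'], ['X', 'Y', 'Z']
# get_indexer maps labels to integer positions; pairing the two position arrays
# picks one element per (row, col) pair, just like lookup()
print(df.to_numpy()[df.index.get_indexer(rows), df.columns.get_indexer(cols)])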

pandas.DataFrame.pop

  • DataFrame.pop(item)
  • Return item and drop from frame. Raise KeyError if not found.
In [1040]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*30)
y=df.pop('Y')
print(type(y),'\n', 'y=\n', y)
df
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
------------------------------
<class 'pandas.core.series.Series'> 
 y=
 a    2.0
b    8.0
c    NaN
Name: Y, dtype: float64
Out[1040]:
X Z
a 1 3.0
b 7 NaN
c 10 9.0

pandas.DataFrame.tail

In [1041]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
df.tail(1)
Out[1041]:
X Y Z
c 10 NaN 9.0

pandas.DataFrame.xs

  • DataFrame.xs(key, axis=0, level=None, drop_level=True)
  • 返回指定行或列
  • Returns a cross-section (row(s) or column(s)) from the Series/DataFrame. Defaults to cross-section on the rows (axis=0).
In [1042]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*30)
print(df.xs('a'))
print('-'*30)
print(df.xs('Z', axis=1))
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
------------------------------
X    1
Y    2
Z    3
Name: a, dtype: object
------------------------------
a    3.0
b    NaN
c    9.0
Name: Z, dtype: float64
In [1043]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
          ['1', '1', '1', '1', '2', '2', '2', '2']
         ]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second', 'third'])
print(len(index))
df = pd.DataFrame(np.random.randn(8,5), index=index, columns=list('ABCDE'))
print(df)
print('-'*50)
print(df.xs(('foo', 'two')))
print('='*50)
print(df.xs('qux', level=0))
print('-'*50)
print(df.xs('one', level=1))
print('*'*50)
print(df.xs('1', level=2))
print('*'*50)
print(df.xs(('baz', 'one'), level=[0, 'second']))
df.xs(('baz', '1'), level=[0, 'third']) # level的列表和之前的元組,存在映射關系,表示對第0,2列加限制條件進行篩選
8
                           A         B         C         D         E
first second third                                                  
bar   one    1      1.209468  1.290964 -0.376996 -0.107501  0.835098
      two    1     -0.834633  0.171100 -0.808162 -0.043249 -0.125283
baz   one    1      0.191019  0.115292 -1.219449  0.326575  1.170924
      two    1     -0.497769  0.247957 -0.116349 -1.485385  0.224618
foo   one    2     -1.155593 -0.248171 -0.238434 -0.238925  0.468336
      two    2     -0.282162  0.203174  0.278302 -0.454708 -1.855569
qux   one    2     -0.321771 -1.595031 -1.876123 -0.812769 -0.349010
      two    2     -0.683977 -0.423224 -0.644195 -1.602555  0.568056
--------------------------------------------------
              A         B         C         D         E
third                                                  
2     -0.282162  0.203174  0.278302 -0.454708 -1.855569
==================================================
                     A         B         C         D         E
second third                                                  
one    2     -0.321771 -1.595031 -1.876123 -0.812769 -0.349010
two    2     -0.683977 -0.423224 -0.644195 -1.602555  0.568056
--------------------------------------------------
                    A         B         C         D         E
first third                                                  
bar   1      1.209468  1.290964 -0.376996 -0.107501  0.835098
baz   1      0.191019  0.115292 -1.219449  0.326575  1.170924
foo   2     -1.155593 -0.248171 -0.238434 -0.238925  0.468336
qux   2     -0.321771 -1.595031 -1.876123 -0.812769 -0.349010
**************************************************
                     A         B         C         D         E
first second                                                  
bar   one     1.209468  1.290964 -0.376996 -0.107501  0.835098
      two    -0.834633  0.171100 -0.808162 -0.043249 -0.125283
baz   one     0.191019  0.115292 -1.219449  0.326575  1.170924
      two    -0.497769  0.247957 -0.116349 -1.485385  0.224618
**************************************************
              A         B         C         D         E
third                                                  
1      0.191019  0.115292 -1.219449  0.326575  1.170924
Out[1043]:
A B C D E
second
one 0.191019 0.115292 -1.219449 0.326575 1.170924
two -0.497769 0.247957 -0.116349 -1.485385 0.224618

pandas.DataFrame.get

In [1044]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df.get('X')  # 只能獲取列
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
Out[1044]:
a     1
b     7
c    10
Name: X, dtype: object

pandas.DataFrame.isin

In [1045]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df.isin([8, '7', np.NaN])  # np.NaN不參與比較,所以也是False,此外對比與位置無關
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
Out[1045]:
X Y Z
a False False False
b True True False
c False False False
In [1046]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = pd.DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
print(df)
print(other)
df.isin(other) # Column A in `other` has a 3, but not at index 1.
# When values is a DataFrame, a cell only matches where both the index label and the column label line up, not purely by position
   A  B
0  1  a
1  2  b
2  3  f
   A  B
0  1  e
1  3  f
2  3  f
3  2  e
Out[1046]:
A B
0 True False
1 False False
2 True True
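
Because matching against a DataFrame requires both the index and the column labels to line up, relabelling other makes every comparison fail. A minimal sketch:

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = pd.DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
other2 = other.copy()
other2.index = [10, 11, 12, 13]   # same values, different index labels
print(df.isin(other2))            # all False: no index labels overlap, so nothing matches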

pandas.DataFrame.where

In [1047]:
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df2=df.fillna(0).astype('float')
print(df2)
print(df2.where(df2['X']>7)) # 先對列進行判斷,然後取該列中符合條件記錄所在的行的所有行記錄
df2.where(df2.iloc[2:]>7) # 直接對行判斷,並取行記錄
    X    Y    Z
a   1  2.0  3.0
b   7  8.0  NaN
c  10  NaN  9.0
      X    Y    Z
a   1.0  2.0  3.0
b   7.0  8.0  0.0
c  10.0  0.0  9.0
      X    Y    Z
a   NaN  NaN  NaN
b   NaN  NaN  NaN
c  10.0  0.0  9.0
Out[1047]:
X Y Z
a NaN NaN NaN
b NaN NaN NaN
c 10.0 NaN 9.0

pandas.DataFrame.mask

  • DataFrame.mask(cond, other=nan, inplace=False, axis=None, level=None, errors='raise', try_cast=False, raise_on_error=None)
  • Return an object of same shape as self and whose corresponding entries are from self where cond is False and otherwise are from other.
In [1048]:
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
# When one reshape argument is -1, numpy infers that dimension from the total size and the other argument
print(df)
m = df % 3 == 0
df.mask(m, -df) # 不符合條件的,返回self的元素,符合條件的,返回other,即此處的-df的相應位置的元素
   A  B
0  0  1
1  2  3
2  4  5
3  6  7
4  8  9
Out[1048]:
A B
0 0 1
1 2 -3
2 4 5
3 -6 7
4 8 -9
In [1049]:
# 與where()對照的理解其功能
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
m = df % 3 == 0
df.where(m, -df)  # 與mask()的值恰好相反
Out[1049]:
A B
0 0 -1
1 -2 3
2 -4 -5
3 6 -7
4 -8 9
In [1050]:
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
print(df)
print('-'*50)
print(df.where(df>=5))
df.mask(df>=5)
   A  B
0  0  1
1  2  3
2  4  5
3  6  7
4  8  9
--------------------------------------------------
     A    B
0  NaN  NaN
1  NaN  NaN
2  NaN  5.0
3  6.0  7.0
4  8.0  9.0
Out[1050]:
A B
0 0.0 1.0
1 2.0 3.0
2 4.0 NaN
3 NaN NaN
4 NaN NaN

pandas.DataFrame.query

In [1051]:
df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
df.query('a > b')
Out[1051]:
a b
3 1.692565 -0.367258
4 -1.228971 -1.649726
5 -0.935311 -1.194938
7 0.383545 0.216440
8 1.320212 -2.022758
9 0.670143 -0.378956
In [1052]:
df[df.a > df.b]  # same result as the previous expression
Out[1052]:
a b
3 1.692565 -0.367258
4 -1.228971 -1.649726
5 -0.935311 -1.194938
7 0.383545 0.216440
8 1.320212 -2.022758
9 0.670143 -0.378956
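
query() can also reference Python variables from the calling scope with the @ prefix, which keeps thresholds out of the expression string. A minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
threshold = 0.5
print(df.query('a > @threshold'))   # '@threshold' pulls in the local variable
print(df[df.a > threshold])         # equivalent boolean-mask form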

Binary operator functions

pandas.DataFrame.add

In [1053]:
a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], two=[np.nan, 2, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
a.add(b, fill_value=100) # fill_value只對其中一個df有效,如果兩個都是NaN,那麼結果仍然是NaN
   one
a  1.0
b  1.0
c  1.0
d  NaN
------------------------------
   one  two
a  1.0  NaN
b  NaN  2.0
d  1.0  NaN
e  NaN  2.0
Out[1053]:
one two
a 2.0 NaN
b 101.0 102.0
c 101.0 NaN
d 101.0 NaN
e NaN 102.0

pandas.DataFrame.radd

In [1054]:
a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], two=[np.nan, 2, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
a.radd(b, fill_value=100) # fill_value只對其中一個df有效,如果兩個都是NaN,那麼結果仍然是NaN
# radd()與add()的區別,在於兩個df的順序,對於相加,a+b=b+a,對於sub,mul,div,調換a和b的順序,則結果會不一樣.
   one
a  1.0
b  1.0
c  1.0
d  NaN
------------------------------
   one  two
a  1.0  NaN
b  NaN  2.0
d  1.0  NaN
e  NaN  2.0
Out[1054]:
one two
a 2.0 NaN
b 101.0 102.0
c 101.0 NaN
d 101.0 NaN
e NaN 102.0

pandas.DataFrame.floordiv

In [1055]:
a = pd.DataFrame([4, 6, 8, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(10%3, '10除以3,餘是1')
print(10//3, '10除以3,商是3')
a.floordiv(b, fill_value=100)  # 取商
   one
a  4.0
b  6.0
c  8.0
d  NaN
------------------------------
   one  two
a  2.0  NaN
b  NaN  3.0
d  3.0  NaN
e  NaN  2.0
------------------------------
1 10除以3,餘是1
3 10除以3,商是3
Out[1055]:
one two
a 2.0 NaN
b 0.0 33.0
c 0.0 NaN
d 33.0 NaN
e NaN 50.0

pandas.DataFrame.mod

In [1056]:
a = pd.DataFrame([4, 6, 8, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(10%3, '10除以3,餘是1')
print(10//3, '10除以3,商是3')
a.mod(b, fill_value=100)  # 取餘(求模)
   one
a  4.0
b  6.0
c  8.0
d  NaN
------------------------------
   one  two
a  2.0  NaN
b  NaN  3.0
d  3.0  NaN
e  NaN  2.0
------------------------------
1 10除以3,餘是1
3 10除以3,商是3
Out[1056]:
one two
a 0.0 NaN
b 6.0 1.0
c 8.0 NaN
d 1.0 NaN
e NaN 0.0

pandas.DataFrame.pow

In [1057]:
a = pd.DataFrame([2, 2, 2, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(2**2)
print(2**3)
a.pow(b, fill_value=3)  # exponentiation (a ** b), i.e. raising to a power, not taking a root
   one
a  2.0
b  2.0
c  2.0
d  NaN
------------------------------
   one  two
a  2.0  NaN
b  NaN  3.0
d  3.0  NaN
e  NaN  2.0
------------------------------
4
8
Out[1057]:
one two
a 4.0 NaN
b 8.0 27.0
c 8.0 NaN
d 27.0 NaN
e NaN 9.0

pandas.DataFrame.dot

In [1058]:
a = pd.DataFrame([[2, 2, 2, 2]], columns=list('abcd'))
print(a.shape, '\n', a)
print('-'*30)
b = pd.DataFrame([[2, 2, 2, 2]], columns=list('abcd'))
print(b.T.shape, '\n', b.T)
print('-'*30)
a.dot(b.T)
(1, 4) 
    a  b  c  d
0  2  2  2  2
------------------------------
(4, 1) 
    0
a  2
b  2
c  2
d  2
------------------------------
Out[1058]:
0
0 16
In [1059]:
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a.shape, '\n', a)
print('-'*30)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b.T.shape, '\n', b.T)
print('-'*30)
a.dot(b.T) # a的第一行分別乘以b的第一列和第二列,作爲新的df的第一行的兩個元素
(2, 4) 
    a  b  c  d
0  2  4  6  8
1  1  1  1  1
------------------------------
(4, 2) 
    0  1
a  1  1
b  1  3
c  1  5
d  1  7
------------------------------
Out[1059]:
0 1
0 20 100
1 4 16
In [1060]:
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a.T.shape, '\n', a.T)
print('-'*30)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b.shape, '\n', b)
print('-'*30)
a.T.dot(b)
(4, 2) 
    0  1
a  2  1
b  4  1
c  6  1
d  8  1
------------------------------
(2, 4) 
    a  b  c  d
0  1  1  1  1
1  1  3  5  7
------------------------------
Out[1060]:
a b c d
a 3 5 7 9
b 5 7 9 11
c 7 9 11 13
d 9 11 13 15

pandas.DataFrame.eq

In [1061]:
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b)
a.eq(b)
   a  b  c  d
0  2  4  6  8
1  1  1  1  1
   a  b  c  d
0  1  1  1  1
1  1  3  5  7
Out[1061]:
a b c d
0 False False False False
1 True False False False

pandas.DataFrame.combine

In [1062]:
df1 = pd.DataFrame({'A': [0, 2], 'B': [np.NaN, 4], 'C': [0, 0]})
print(df1)
print('~'*50)
print(df1.sum())
print('-'*50)
df2 = pd.DataFrame({'A': [7, 5], 'B': [3, np.NaN]})
print(df2)
print('~'*50)
print(df2.sum())
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2) # 比較列的sum()
   A    B  C
0  0  NaN  0
1  2  4.0  0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A    2.0
B    4.0
C    0.0
dtype: float64
--------------------------------------------------
   A    B
0  7  3.0
1  5  NaN
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A    12.0
B     3.0
dtype: float64
Out[1062]:
A B C
0 0 3.0 NaN
1 2 NaN NaN

pandas.DataFrame.combine_first

In [1063]:
df1 = pd.DataFrame({'A': [0, np.NaN], 'B': [np.NaN, 6]})
print(df1)
print('~'*50)
print(df1.sum())
print('-'*50)
df2 = pd.DataFrame({'A': [7, 5], 'B': [3, 1]})
print(df2)
print('~'*50)
print(df2.sum())
df1.combine_first(df2) # df1’s values prioritized, use values from df2 to fill holes:
     A    B
0  0.0  NaN
1  NaN  6.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A    0.0
B    6.0
dtype: float64
--------------------------------------------------
   A  B
0  7  3
1  5  1
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A    12
B     4
dtype: int64
Out[1063]:
A B
0 0.0 3.0
1 5.0 6.0

函數調用和聚合 Function application & GroupBy

pandas.DataFrame.apply

  • DataFrame.apply(func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds)
  • Apply a function along an axis of the DataFrame.
In [1064]:
df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
print(df)
print('-'*50)
print(df.apply(np.sqrt))
print('-'*50)
print(df.apply(np.sum, axis=0))
print('-'*50)
print(df.apply(np.sum, axis=1))
print('-'*50)
print(type(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1)))
print(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1))
print('-'*50)
# result_type='expand'时,类型为DataFrame
# result_type='reduce'时,类型为Series
print(type(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1, result_type='broadcast')))
print(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1, result_type='broadcast'))
# result_type : {‘expand’, ‘reduce’, ‘broadcast’, None}, default None
#   These only act when axis=1 (columns):
#       ‘expand’ : list-like results will be turned into columns.
#       ‘reduce’ : returns a Series if possible rather than expanding list-like results. This is the opposite of ‘expand’.
#       ‘broadcast’ : results will be broadcast to the original shape of the DataFrame, the original index and columns will be retained.
#   The default behaviour (None) depends on the return value of the applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns.
#   New in version 0.23.0.
print('-'*50)
print(type(df.apply(lambda x: str(x))[0]))
print(df.apply(lambda x: str(x))) # apply()是对整行或整列進行處理,獲得的是整行或整列的數據
print('~'*30)
for i, element in enumerate(df.apply(lambda x: str(x))[0]):
     print(i, element)# 验证apply()与applymap()的区别,一个是对整行或列進行處理,另一个是对行或列中的元素逐一处理
df.apply(lambda x: len(str(x)))
   A  B
0  4  9
1  4  9
2  4  9
--------------------------------------------------
     A    B
0  2.0  3.0
1  2.0  3.0
2  2.0  3.0
--------------------------------------------------
A    12
B    27
dtype: int64
--------------------------------------------------
0    13
1    13
2    13
dtype: int64
--------------------------------------------------
<class 'pandas.core.series.Series'>
0    [4, 81]
1    [4, 81]
2    [4, 81]
dtype: object
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
   A   B
0  4  81
1  4  81
2  4  81
--------------------------------------------------
<class 'str'>
A    0    4\n1    4\n2    4\nName: A, dtype: int64
B    0    9\n1    9\n2    9\nName: B, dtype: int64
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0 0
1  
2  
3  
4  
5 4
6 

7 1
8  
9  
10  
11  
12 4
13 

14 2
15  
16  
17  
18  
19 4
20 

21 N
22 a
23 m
24 e
25 :
26  
27 A
28 ,
29  
30 d
31 t
32 y
33 p
34 e
35 :
36  
37 i
38 n
39 t
40 6
41 4
Out[1064]:
A    42
B    42
dtype: int64
In [1065]:
df = pd.DataFrame(np.random.randn(4, 5), columns=['A', 'B', 'C', 'D', 'E'])
df['Col_sum'] = df.apply(lambda x: x.sum(), axis=1)
df.loc['Row_sum'] = df.apply(lambda x: x.sum())
df
Out[1065]:
A B C D E Col_sum
0 0.576207 1.424097 -0.237222 1.205378 0.467291 3.435751
1 0.015648 -1.629934 0.398383 0.397965 -0.567927 -1.385865
2 0.234611 -0.453310 -2.221688 -1.048314 0.062570 -3.426131
3 0.143311 -1.669318 0.108048 -0.186586 -0.093079 -1.697625
Row_sum 0.969777 -2.328465 -1.952480 0.368443 -0.131144 -3.073870

pandas.DataFrame.applymap

  • Apply a function to a Dataframe elementwise.
In [1066]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.5678]])
print(df)
print(df.applymap(lambda x: str(x)))# applymap()是对行或列裏面的元素進行處理,獲得的是使用函數逐一對元素處理後的結果
df.applymap(lambda x: len(str(x)))
       0       1
0  1.000  2.1200
1  3.356  4.5678
       0       1
0    1.0    2.12
1  3.356  4.5678
Out[1066]:
0 1
0 3 4
1 5 6
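
In pandas 2.1 applymap was renamed to DataFrame.map (applymap still works but emits a deprecation warning); the call is otherwise identical. A minimal sketch, assuming pandas >= 2.1:

import pandas as pd

df = pd.DataFrame([[1, 2.12], [3.356, 4.5678]])
# DataFrame.map applies the function elementwise, exactly like applymap above
print(df.map(lambda x: len(str(x))))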

pandas.DataFrame.pipe

  • DataFrame.pipe(func, *args, **kwargs)
  • Apply func(self, *args, **kwargs)
  • Unlike apply, which calls func once per column (or per row), pipe hands the whole DataFrame to func in a single call; its main purpose is readable method chaining. In the comparison below the functions are simple vectorized operations, so pipe and apply happen to produce the same result.
In [1067]:
fruits = ['ap1', 'or2', 'le3', 'bb4'] * 2
N = len(fruits)
df = pd.DataFrame(
    {
        'fruit': fruits
      , 'basket_id': np.arange(N)
      , 'count': np.random.randint(3, 15, size=N)
      , 'weight': np.random.uniform(0, 4, size=N)
    }
    , columns=['basket_id', 'fruit', 'count', 'weight']
)
print(df)
print('-'*70)

def first(x):
    return x*2
def second(x, sec):
    return x*sec
def third(x, t1, t2, t3=-1):
    return x*t1*t2

print(df.pipe(first).pipe(second,2).pipe(third,t1=1,t2=2,t3=0))
print('-'*70)
print(df.apply(first).apply(second,args=(2,)).apply(third,args=(1,2),t3=0))
   basket_id fruit  count    weight
0          0   ap1      7  0.336429
1          1   or2     11  3.921252
2          2   le3      7  1.776452
3          3   bb4      4  3.948572
4          4   ap1      7  1.104207
5          5   or2      4  1.063009
6          6   le3      3  0.594073
7          7   bb4     12  3.029662
----------------------------------------------------------------------
   basket_id                     fruit  count     weight
0          0  ap1ap1ap1ap1ap1ap1ap1ap1     56   2.691432
1          8  or2or2or2or2or2or2or2or2     88  31.370014
2         16  le3le3le3le3le3le3le3le3     56  14.211620
3         24  bb4bb4bb4bb4bb4bb4bb4bb4     32  31.588578
4         32  ap1ap1ap1ap1ap1ap1ap1ap1     56   8.833655
5         40  or2or2or2or2or2or2or2or2     32   8.504072
6         48  le3le3le3le3le3le3le3le3     24   4.752583
7         56  bb4bb4bb4bb4bb4bb4bb4bb4     96  24.237294
----------------------------------------------------------------------
   basket_id                     fruit  count     weight
0          0  ap1ap1ap1ap1ap1ap1ap1ap1     56   2.691432
1          8  or2or2or2or2or2or2or2or2     88  31.370014
2         16  le3le3le3le3le3le3le3le3     56  14.211620
3         24  bb4bb4bb4bb4bb4bb4bb4bb4     32  31.588578
4         32  ap1ap1ap1ap1ap1ap1ap1ap1     56   8.833655
5         40  or2or2or2or2or2or2or2or2     32   8.504072
6         48  le3le3le3le3le3le3le3le3     24   4.752583
7         56  bb4bb4bb4bb4bb4bb4bb4bb4     96  24.237294
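
pipe also accepts a (callable, keyword) tuple when the DataFrame should be passed under a specific keyword rather than as the first positional argument. A minimal sketch with a hypothetical scale() helper:

import pandas as pd

def scale(factor, data):
    # hypothetical helper that expects the DataFrame as the 'data' keyword
    return data * factor

df = pd.DataFrame({'count': [7, 11], 'weight': [0.3, 3.9]})
# (scale, 'data') tells pipe to call scale(2, data=df)
print(df.pipe((scale, 'data'), 2))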

pandas.DataFrame.agg

  • DataFrame.agg(func, axis=0, *args, **kwargs)
  • Aggregate using one or more operations over the specified axis.
In [1068]:
df = pd.DataFrame([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9],
                   [np.nan, np.nan, np.nan]],
                   columns=['A', 'B', 'C'])
print(df)
# Aggregate these functions over the rows.
print(df.fillna(0).agg(['sum', 'min', 'mean'])) # 做3种运算
print(df.agg(['sum', 'min', 'mean']))
print(df.agg(('mean'), axis=1)) # 只做一种运算
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max'], 'C' :'mean'}) # 不同列做不同运算
     A    B    C
0  1.0  2.0  3.0
1  4.0  5.0  6.0
2  7.0  8.0  9.0
3  NaN  NaN  NaN
         A      B     C
sum   12.0  15.00  18.0
min    0.0   0.00   0.0
mean   3.0   3.75   4.5
         A     B     C
sum   12.0  15.0  18.0
min    1.0   2.0   3.0
mean   4.0   5.0   6.0
0    2.0
1    5.0
2    8.0
3    NaN
dtype: float64
Out[1068]:
A B C
max NaN 8.0 NaN
mean NaN NaN 6.0
min 1.0 2.0 NaN
sum 12.0 NaN NaN

pandas.DataFrame.transform

  • To apply to column
  • DataFrame.transform(func, *args, **kwargs)
  • Call function producing a like-indexed NDFrame and return a NDFrame with the transformed values
  • transform works on one Series (one column) at a time; a function that needs two columns at once (e.g. column 'a' minus column 'b') raises an error
  • the function must return a result of the same length as the input column/group (a single row-wise dimension)
  • in the groupby case a function that returns a single scalar per group is also accepted, e.g. .transform(sum), and the scalar is broadcast back over the group
In [1069]:
df = pd.DataFrame(np.random.randn(4, 4), columns=['A', 'B', 'C', 'D'], index=pd.date_range('1/1/2018', periods=4))
df.iloc[1:2] = np.nan
print(df)
df.transform(lambda x: (x - x.mean()) / x.std())
                   A         B         C         D
2018-01-01 -0.796542 -1.414477  0.300034  0.156123
2018-01-02       NaN       NaN       NaN       NaN
2018-01-03 -1.535938 -0.169594 -1.565617 -1.673637
2018-01-04  2.602354  0.735667 -0.364206  0.507863
Out[1069]:
A B C D
2018-01-01 -0.401679 -1.048301 0.891787 0.420646
2018-01-02 NaN NaN NaN NaN
2018-01-03 -0.736705 0.104867 -1.081139 -1.141609
2018-01-04 1.138384 0.943434 0.189352 0.720963
In [1070]:
df.transform(lambda x: x-x+x.mean()) # 列的平均數
Out[1070]:
A B C D
2018-01-01 0.089958 -0.282801 -0.543263 -0.336551
2018-01-02 NaN NaN NaN NaN
2018-01-03 0.089958 -0.282801 -0.543263 -0.336551
2018-01-04 0.089958 -0.282801 -0.543263 -0.336551
In [1071]:
df.transform(lambda x: x-x+x.std()) # 列的標準差
Out[1071]:
A B C D
2018-01-01 2.206983 1.079533 0.945626 1.171231
2018-01-02 NaN NaN NaN NaN
2018-01-03 2.206983 1.079533 0.945626 1.171231
2018-01-04 2.206983 1.079533 0.945626 1.171231
In [1072]:
df.transform(max) # sum, max, min, np.mean, np.std # an aggregating function only "works" here because the reduced result (one value per column) happens to have the same length as the index (4 rows, 4 columns); transform is really meant for length-preserving functions
Out[1072]:
A    2.602354
B    0.735667
C    0.300034
D    0.507863
dtype: float64
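
Newer pandas versions reject aggregating functions in transform with a ValueError regardless of the shape coincidence noted above; reductions belong to agg, while transform is reserved for length-preserving functions. A minimal sketch of that split:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(4, 4), columns=list('ABCD'),
                  index=pd.date_range('1/1/2018', periods=4))
print(df.agg(['max', 'min']))               # reductions: one row per function
print(df.transform(lambda x: x - x.min()))  # length-preserving: same shape as df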

pandas.DataFrame.groupby

  • DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs)
  • Group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns.
In [1073]:
df=pd.DataFrame([
     ['exact',9720,'2017-10-01',515,458]
    ,['exact',9720,'2017-10-01',510,896]
    ,['fuzzy',8242,'2017-11-01',122,415]
    ,['fuzzy',8242,'2017-11-01',128,782]
], columns=['type','id','date','code','amount'])
df
Out[1073]:
type id date code amount
0 exact 9720 2017-10-01 515 458
1 exact 9720 2017-10-01 510 896
2 fuzzy 8242 2017-11-01 122 415
3 fuzzy 8242 2017-11-01 128 782
In [1074]:
df.groupby(['type','id','date']).mean()
Out[1074]:
code amount
type id date
exact 9720 2017-10-01 512.5 677.0
fuzzy 8242 2017-11-01 125.0 598.5
In [1075]:
print(type(df.groupby(['type','id'])['amount'].mean()))
print(df.groupby(['type','id'])['amount'].mean().index)
df.groupby(['type','id'])['amount'].mean()
<class 'pandas.core.series.Series'>
MultiIndex(levels=[['exact', 'fuzzy'], [8242, 9720]],
           labels=[[0, 1], [1, 0]],
           names=['type', 'id'])
Out[1075]:
type   id  
exact  9720    677.0
fuzzy  8242    598.5
Name: amount, dtype: float64

Python ' ~ ' (取反) 操作符

In [1076]:
#知识点:
#计算机中的符号数有三种表示方法:原码、反码和补码。
#三种表示方法均有符号位和数值位两部分,符号位都是用0表示“正”,用1表示“负”,而数值位,三种表示方法各不相同。
#在计算机系统中,数值一律用补码来表示和存储。
#原因在于,使用补码,可以将符号位和数值域统一处理;同时,加法和减法也可以统一处理。
#正整数的补码是其二进制表示,与原码相同
#负整数的补码,将其对应正数二进制表示所有位取反(包括符号位,0变1,1变0)后加1。
#运算分析:
#-6的补码是+6(0000 0110)取反后再+1,为(1111 1001)+(0000 0001)=(1111 1010),也就是计算机中-6是用(1111 1010)来存储的,(1111 1010) 按位取反得到(0000 0101)这就是答案5
#-4的补码,是其對應的正數4(0000 0100)取反后再+1,为(1111 1011)+(0000 0001)=(1111 1100),也就是计算机中-4是用(1111 1100)来存储的,(1111 1100) 按位取反得到(0000 0011)这就是答案3
#Python按位取反运算:
a=-4
print('~-4 =',~a)
# 4的补码與原碼相同是(0000 0100), 取反即为(1111 1011)
#对于计算机来说,二进制以1开头表示的是负数
#所以(1111 1011)是一個負數
#但是(1111 1011)是十進制的多少呢?
#想要知道这个值,可以求它的补码,即先取反码 :0000 0100,再加1:0000 0101,
#(0000 0101),是5,並且之前知道這個數是一個負數,所以答案是-5
b=4
print('~4 =',~b)
~-4 = 3
~4 = -5
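
For any Python int the identity ~x == -x - 1 holds, and on a boolean Series ~ simply flips each element, which is exactly how it is used for row selection in the cells below. A minimal sketch:

import pandas as pd

for x in (-6, -4, 0, 4):
    assert ~x == -x - 1                      # two's-complement bitwise NOT
print(~pd.Series([True, False, True]))       # elementwise negation: False, True, False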

聚合函數組合使用示例

In [1077]:
df=pd.DataFrame([
     ['exact',9720,'2018-10-01',515,458]
    ,['exact',9720,'2018-10-01',510,896]
    ,['fuzzy',8242,'2018-11-01',122,415]
    ,['fuzzy',8242,'2018-11-01',128,782]
    ,['fuzzy',8243,'2018-11-02',128,782]
    ,['fuzzy',8243,'2018-11-03',128,782]
], columns=['type','id','date','code','amount'])
df
Out[1077]:
type id date code amount
0 exact 9720 2018-10-01 515 458
1 exact 9720 2018-10-01 510 896
2 fuzzy 8242 2018-11-01 122 415
3 fuzzy 8242 2018-11-01 128 782
4 fuzzy 8243 2018-11-02 128 782
5 fuzzy 8243 2018-11-03 128 782
In [1078]:
exact_rows = df['type'] != 'fuzzy'
exact_rows
Out[1078]:
0     True
1     True
2    False
3    False
4    False
5    False
Name: type, dtype: bool
In [1079]:
df.loc[exact_rows]
Out[1079]:
type id date code amount
0 exact 9720 2018-10-01 515 458
1 exact 9720 2018-10-01 510 896
In [1080]:
df.loc[~exact_rows]
Out[1080]:
type id date code amount
2 fuzzy 8242 2018-11-01 122 415
3 fuzzy 8242 2018-11-01 128 782
4 fuzzy 8243 2018-11-02 128 782
5 fuzzy 8243 2018-11-03 128 782
In [1081]:
df.loc[~exact_rows].groupby('id').apply(lambda x:x.sum())
Out[1081]:
type id date code amount
id
8242 fuzzyfuzzy 16484 2018-11-012018-11-01 250 1197
8243 fuzzyfuzzy 16486 2018-11-022018-11-03 256 1564
In [1082]:
df.loc[~exact_rows].groupby('id').apply(lambda x:x.sum())['amount']
Out[1082]:
id
8242    1197
8243    1564
Name: amount, dtype: int64
In [1083]:
df.loc[~exact_rows].groupby('id').transform('nunique') # 取非重復元素的個數
# 譬如date列,判斷是否唯一,是分組進行的
Out[1083]:
type date code amount
2 1 1 2 2
3 1 1 2 2
4 1 2 1 1
5 1 2 1 1
In [1084]:
grouped = df.loc[~exact_rows].groupby('id').apply(lambda g: g.sort_values('code', ascending=False))
print(grouped)
print('~'*50)
grouped = df.loc[~exact_rows].sort_values(['id','date'], ascending=True).groupby('id')
print(grouped)
print(type(grouped))
for i in grouped:
    print(i[0])
    print(i[1])
         type    id        date  code  amount
id                                           
8242 3  fuzzy  8242  2018-11-01   128     782
     2  fuzzy  8242  2018-11-01   122     415
8243 4  fuzzy  8243  2018-11-02   128     782
     5  fuzzy  8243  2018-11-03   128     782
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f78f8d8ae80>
<class 'pandas.core.groupby.groupby.DataFrameGroupBy'>
8242
    type    id        date  code  amount
2  fuzzy  8242  2018-11-01   122     415
3  fuzzy  8242  2018-11-01   128     782
8243
    type    id        date  code  amount
4  fuzzy  8243  2018-11-02   128     782
5  fuzzy  8243  2018-11-03   128     782
In [1085]:
for i in grouped:
    print(i[0])
    print(i[1])
8242
    type    id        date  code  amount
2  fuzzy  8242  2018-11-01   122     415
3  fuzzy  8242  2018-11-01   128     782
8243
    type    id        date  code  amount
4  fuzzy  8243  2018-11-02   128     782
5  fuzzy  8243  2018-11-03   128     782
In [1086]:
for i in grouped['code']:
    print(i[0])
    print(i[1])
8242
2    122
3    128
Name: code, dtype: int64
8243
4    128
5    128
Name: code, dtype: int64
In [1087]:
grouped['code'].transform('nunique') # 判斷是否唯一時,是分組進行的,而不是全部列元素進行判斷.
Out[1087]:
2    2
3    2
4    1
5    1
Name: code, dtype: int64
In [1088]:
a = np.where(grouped['code'].transform('nunique') == 2, 18, 81)
# where(cond, option1, option2)
a
Out[1088]:
array([18, 18, 81, 81])

pd.apply()與pd.transform區別示例

  • 针对dataframe完成特征的计算,并且常常与groupby()方法一起使用
In [1089]:
data = pd.DataFrame({'state':['Florida','Florida','Texas','Texas'], 'a':[4,5,1,3], 'b':[6,10,3,11] }) 
print(data)
print('-'*50)
def sub_two(X):
    return X['a'] - X['b']
data1 = data.groupby(data['state']).apply(sub_two) # 此处使用transform 会出现错误
print(data1)
print('-'*50)
def group_sum(x): 
    return x.sum() 
data3 = data.groupby(data['state']).transform(group_sum) # 返回与原df一样的行數 
print(data3)
print('-~'*30)
data4 = data.groupby(data['state'])['a','b'].apply(group_sum)                                          
print(data4)
     state  a   b
0  Florida  4   6
1  Florida  5  10
2    Texas  1   3
3    Texas  3  11
--------------------------------------------------
state     
Florida  0   -2
         1   -5
Texas    2   -2
         3   -8
dtype: int64
--------------------------------------------------
   a   b
0  9  16
1  9  16
2  4  14
3  4  14
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
         a   b
state         
Florida  9  16
Texas    4  14

Computations / Descriptive Stats

pandas.DataFrame.abs

In [1090]:
df = pd.DataFrame({
    'a': [4, 5, 6, 7],
    'b': [10, 20, 30, 40],
    'c': [100, 50, -30, -50]
})
print(df)
print('-~'*25)
print(df.abs())
print('-~'*25)
print(df.loc[[1,0,2,3]]) # (df.c - 43).abs().argsort() 返回 [1,0,2,3],見下一cell說明
df.loc[(df.c - 43).abs().argsort()]
   a   b    c
0  4  10  100
1  5  20   50
2  6  30  -30
3  7  40  -50
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   a   b    c
0  4  10  100
1  5  20   50
2  6  30   30
3  7  40   50
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   a   b    c
1  5  20   50
0  4  10  100
2  6  30  -30
3  7  40  -50
Out[1090]:
a b c
1 5 20 50
0 4 10 100
2 6 30 -30
3 7 40 -50
In [1091]:
# argsort()函数是将x中的元素从小到大排列,提取其对应的index(索引),然后输出到y
df = pd.DataFrame({
    'a': [4, 5, 6, 7],
    'b': [10, 20, 30, 40],
    'c': [100, 50, -30, -50]
})
print(df)
print('-~'*25)
print((df.c - 43))
print('-~'*25)
print((df.c - 43).abs())
print('-~'*25)
print((df.c - 43).abs().argsort())
   a   b    c
0  4  10  100
1  5  20   50
2  6  30  -30
3  7  40  -50
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0    57
1     7
2   -73
3   -93
Name: c, dtype: int64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0    57
1     7
2    73
3    93
Name: c, dtype: int64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0    1
1    0
2    2
3    3
Name: c, dtype: int64

pandas.DataFrame.all

  • DataFrame.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs)
  • Return whether all elements are True, potentially over an axis.
  • 必須全部是True才會返回True,否則False
In [1092]:
df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
print(df)
print('-~'*25)
print(df.all())
print('-~'*25)
print(df.all(axis='columns'))
print('-~'*25)
print(df.all(axis=None))
   col1   col2
0  True   True
1  True  False
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
col1     True
col2    False
dtype: bool
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0     True
1    False
dtype: bool
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
False

pandas.DataFrame.any

  • DataFrame.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs)
  • Return whether any element is True over requested axis.
  • 正好與all()相反,只要有一個是True就返回True
In [1093]:
df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
print(df)
print('-~'*25)
print(df.any())
print('-~'*25)
print(df.any(axis='columns'))
print('-~'*25)
print(df.any(axis=None))
   col1   col2
0  True   True
1  True  False
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
col1    True
col2    True
dtype: bool
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0    True
1    True
dtype: bool
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
True

pandas.DataFrame.clip

  • pandas.DataFrame.clip_lower
  • pandas.DataFrame.clip_upper
  • DataFrame.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs)
  • Trim values at input threshold(s).
  • 按給定的值確定df元素值的邊界
In [1094]:
data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
df = pd.DataFrame(data)
print(df)
print('-~'*25)
print(df.clip(-4, 6))
print('-~'*25)
print(df.clip_lower(-2))
print('-~'*25)
print(df.clip_upper(3))
   col_0  col_1
0      9     -2
1     -3     -7
2      0      6
3     -1      8
4      5     -5
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   col_0  col_1
0      6     -2
1     -3     -4
2      0      6
3     -1      6
4      5     -4
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   col_0  col_1
0      9     -2
1     -2     -2
2      0      6
3     -1      8
4      5     -2
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   col_0  col_1
0      3     -2
1     -3     -7
2      0      3
3     -1      3
4      3     -5
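
clip_lower() and clip_upper() were deprecated in pandas 0.24 and later removed; passing only lower= or only upper= to clip() covers both cases. A minimal sketch:

import pandas as pd

df = pd.DataFrame({'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]})
print(df.clip(lower=-2))   # same result as the old clip_lower(-2)
print(df.clip(upper=3))    # same result as the old clip_upper(3)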

pandas.DataFrame.compound

  • DataFrame.compound(axis=None, skipna=None, level=None)
  • Return the compound percentage of the values for the requested axis
[source]
    def compound(self, axis=None, skipna=None, level=None):
        if skipna is None:
            skipna = True
        return (1 + self).prod(axis=axis, skipna=skipna, level=level) - 1
In [1095]:
data = {'col_0': [1, 2, 5], 'col_1': [10, 20, 50]}
df = pd.DataFrame(data)
print(df)
print('-~'*25)
print(df.prod())
df.compound() 
# col_0: (1+1)*(1+2)*(1+5)-1=2*3*6-1=35
# col_1: (1+10)*(1+20)*(1+50)-1=11*21*51-1=11780
   col_0  col_1
0      1     10
1      2     20
2      5     50
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
col_0       10
col_1    10000
dtype: int64
Out[1095]:
col_0       35
col_1    11780
dtype: int64

pandas.DataFrame.count

  • DataFrame.count(axis=0, level=None, numeric_only=False)
  • Count non-NA cells for each column or row.
In [1096]:
df = pd.DataFrame({"Person":["John", "Myla", None, "John", "Myla"],
                   "Age": [24., np.nan, 21., 33, 26],
                   "Single": [False, True, True, True, False]
                  })
print(df)
print('-~'*25)
print(df.count())  # Notice the uncounted NA values
print('-~'*25)
print((df.count(axis='columns')))  # Counts for each row
print('-~'*25)
#Counts for one level of a MultiIndex:
df.set_index(["Person", "Single"]).count(level="Person")
  Person   Age  Single
0   John  24.0   False
1   Myla   NaN    True
2   None  21.0    True
3   John  33.0    True
4   Myla  26.0   False
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
Person    4
Age       4
Single    5
dtype: int64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
0    3
1    2
2    2
3    3
4    3
dtype: int64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
Out[1096]:
Age
Person
John 2
Myla 1
In [1097]:
df.groupby('Person')['Age'].count()
Out[1097]:
Person
John    2
Myla    1
Name: Age, dtype: int64

pandas.DataFrame.cov

  • DataFrame.cov(min_periods=None)
  • Compute pairwise covariance of columns, excluding NA/null values.
  • 随机变量的协方差: 與数学期望、方差一样,是分布的一个总体参数. 在概率论和统计中,协方差是对两个随机变量联合分布线性相关程度的一种度量。两个随机变量越线性相关,协方差越大,完全线性无关,协方差为零。
  • 样本的协方差: 是样本集的一个统计量,可作为联合分布总体参数的一个估计。在实际中计算的通常是样本的协方差.
  • 计算各维度两两之间的协方差,各协方差组成一个n×n的矩阵,称为协方差矩阵。协方差矩阵是个对称矩阵,对角线上的元素是各维度上随机变量的方差
In [1098]:
df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=['dogs', 'cats'])
print(df)
df.cov()
   dogs  cats
0     1     2
1     0     3
2     2     0
3     1     1
Out[1098]:
dogs cats
dogs 0.666667 -1.000000
cats -1.000000 1.666667
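
The entries above can be reproduced by hand from the sample-covariance definition cov(x, y) = Σ(x - x̄)(y - ȳ) / (n - 1). A minimal sketch:

import pandas as pd

df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=['dogs', 'cats'])
x, y = df['dogs'], df['cats']
n = len(df)
print(((x - x.mean()) * (y - y.mean())).sum() / (n - 1))  # -1.0, the off-diagonal entry
print(x.var())                                            # ~0.667, the diagonal entry for dogs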

pandas.DataFrame.cummax

  • Return cumulative maximum over a DataFrame or Series axis.
In [1099]:
df = pd.DataFrame([[2.0, 1.0],
                   [3.0, np.nan],
                   [1.0, 0.0]],
                   columns=list('AB'))
print(df)
print('-~'*25)
# By default, iterates over rows and finds the maximum in each column. 
# This is equivalent to axis=None or axis='index'.
print(df.cummax())
print('-~'*25)
print(df.cummax(axis=1))
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0

pandas.DataFrame.cummin

In [1100]:
df = pd.DataFrame([[2.0, 1.0],
                   [3.0, np.nan],
                   [1.0, 0.0]],
                   columns=list('AB'))
print(df)
print('-~'*25)
print(df.cummin()) # each cell is the running (cumulative) minimum of its column, from the top row down
print('-~'*25)
print(df.cummin(axis=1)) # running minimum across the columns of each row, from left to right
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

pandas.DataFrame.cumprod

In [1101]:
df = pd.DataFrame([[2.0, 1.0, 5],
                   [3.0, np.nan, None],
                   [1.0, 0.0, 3]],
                   columns=list('ABC'))
print(df)
print('-~'*25)
print(df.cumprod())
print('-~'*25)
print(df.cumprod(axis=1))
     A    B    C
0  2.0  1.0  5.0
1  3.0  NaN  NaN
2  1.0  0.0  3.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B     C
0  2.0  1.0   5.0
1  6.0  NaN   NaN
2  6.0  0.0  15.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B     C
0  2.0  2.0  10.0
1  3.0  NaN   NaN
2  1.0  0.0   0.0

pandas.DataFrame.cumsum

In [1102]:
df = pd.DataFrame([[2.0, 1.0, 5],
                   [3.0, np.nan, None],
                   [1.0, 0.0, 3]],
                   columns=list('ABC'))
print(df)
print('-~'*25)
print(df.cumsum())
print('-~'*25)
print(df.cumsum(axis=1))
     A    B    C
0  2.0  1.0  5.0
1  3.0  NaN  NaN
2  1.0  0.0  3.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B    C
0  2.0  1.0  5.0
1  5.0  NaN  NaN
2  6.0  1.0  8.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     A    B    C
0  2.0  3.0  8.0
1  3.0  NaN  NaN
2  1.0  1.0  4.0

pandas.DataFrame.describe

In [1103]:
df = pd.DataFrame({ 'object': ['a', 'b', 'b'],
                    'numeric': [1, 2, 3],
                    'categorical': pd.Categorical(['g','e','f'])
                  })
print(df)
print('-~'*25)
print(df.describe())
print('-~'*25)
df.describe(include='all')
  object  numeric categorical
0      a        1           g
1      b        2           e
2      b        3           f
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
       numeric
count      3.0
mean       2.0
std        1.0
min        1.0
25%        1.5
50%        2.0
75%        2.5
max        3.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
Out[1103]:
object numeric categorical
count 3 3.0 3
unique 2 NaN 3
top b NaN g
freq 2 NaN 1
mean NaN 2.0 NaN
std NaN 1.0 NaN
min NaN 1.0 NaN
25% NaN 1.5 NaN
50% NaN 2.0 NaN
75% NaN 2.5 NaN
max NaN 3.0 NaN

pandas.DataFrame.diff

In [1104]:
df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
                   'b': [1, 1, 2, 3, 5, 8],
                   'c': [1, 4, 9, 16, 25, 36]})
print(df)
print('-~'*25)
print(df.diff()) # 與前一條記錄的比較
print('-~'*25)
print(df.diff(axis=1)) # 與前一列記錄的比較
print('-~'*25)
print(df.diff(periods=3)) # 與第前3條記錄的比較(相減)
   a  b   c
0  1  1   1
1  2  1   4
2  3  2   9
3  4  3  16
4  5  5  25
5  6  8  36
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     a    b     c
0  NaN  NaN   NaN
1  1.0  0.0   3.0
2  1.0  1.0   5.0
3  1.0  1.0   7.0
4  1.0  2.0   9.0
5  1.0  3.0  11.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
    a    b     c
0 NaN  0.0   0.0
1 NaN -1.0   3.0
2 NaN -1.0   7.0
3 NaN -1.0  13.0
4 NaN  0.0  20.0
5 NaN  2.0  28.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
     a    b     c
0  NaN  NaN   NaN
1  NaN  NaN   NaN
2  NaN  NaN   NaN
3  3.0  2.0  15.0
4  3.0  4.0  21.0
5  3.0  6.0  27.0

pandas.DataFrame.mad

  • DataFrame.mad(axis=None, skipna=None, level=None)
  • Return the mean absolute deviation of the values for the requested axis
  • 每一個元素與平均數的差,取絕對數,再平均,得到平均絕對偏差the mean absolute deviation
In [1105]:
df = pd.DataFrame({'A': [2, 2, 4]})
print(df)
print('-~'*25)
print(df.mean())
print('-~'*25)
print((df-df.mean()).abs().mean())
print('-~'*25)
print(df.mad())
   A
0  2
1  2
2  4
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
A    2.666667
dtype: float64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
A    0.888889
dtype: float64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
A    0.888889
dtype: float64

pandas.DataFrame.median

  • 中位數
In [1106]:
df = pd.DataFrame({'A': [1, 2, 1, 102, 1, 3, 8]})
print(df)
print('-~'*25)
print(df.sort_values(by=['A']).reset_index())
print('-~'*25)
print(type(df.median()))
print('-~'*25)
print(df.median())
print('-~'*25)
print(df.median()[0])
print('-~'*25)
print(df[df['A']==df.median()[0]])
print('-~'*25)
df[df['A']==df.median()[0]].index.tolist()
     A
0    1
1    2
2    1
3  102
4    1
5    3
6    8
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   index    A
0      0    1
1      2    1
2      4    1
3      1    2
4      5    3
5      6    8
6      3  102
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
<class 'pandas.core.series.Series'>
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
A    2.0
dtype: float64
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
2.0
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
   A
1  2
-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
Out[1106]:
[1]

pandas.DataFrame.mode

  • 衆數
In [1107]:
df = pd.DataFrame({'A': [1, 1, 3, 2, 3, 3, 3], 'B': [9, 9, 9, 9, 3, 3, 3]})
print(df)
df.mode()
   A  B
0  1  9
1  1  9
2  3  9
3  2  9
4  3  3
5  3  3
6  3  3
Out[1107]:
A B
0 3 9
In [1108]:
# Group by column 'a' first, then take the mode of column 'b' within each group (group A and group B separately).
df = pd.DataFrame({'a':['A','A','A','A','B','B','B','B','B', 'B'],'b':[2,1,2,3,1,2,2,3,3, 3]})
print(df)
# dir(df.groupby('a')) 檢查可用的方法
df.groupby('a')['b'].apply(lambda x:x.mode())
   a  b
0  A  2
1  A  1
2  A  2
3  A  3
4  B  1
5  B  2
6  B  2
7  B  3
8  B  3
9  B  3
Out[1108]:
a   
A  0    2
B  0    3
Name: b, dtype: int64

pandas.DataFrame.pct_change

  • DataFrame.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs)
  • Percentage change between the current and a prior element.
  • 可用於計算環比
In [1109]:
df = pd.DataFrame({
    'FR': [4.0405, 4.0963, 4.3149],
    'GR': [1.7246, 1.7482, 1.8519],
    'IT': [804.74, 810.01, 860.13]},
    index=['2018-11-01', '2018-12-01', '2019-01-01'])
print(df)
print('~-'*25)
print(df.diff())    # 先計算與上一行的差,若索引爲時間(如月),就是環比.
print('~-'*25)
print(df.shift(1))  # 移位,對齊
print('~-'*25)
print(df.diff().div(df.shift(1))) # 即將每一行與上一行的差,再除以上一行,得到的百分比,就是 pct_change()的值
# 當periods=12時,表示與上年同期的同比
df.pct_change()
                FR      GR      IT
2018-11-01  4.0405  1.7246  804.74
2018-12-01  4.0963  1.7482  810.01
2019-01-01  4.3149  1.8519  860.13
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
                FR      GR     IT
2018-11-01     NaN     NaN    NaN
2018-12-01  0.0558  0.0236   5.27
2019-01-01  0.2186  0.1037  50.12
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
                FR      GR      IT
2018-11-01     NaN     NaN     NaN
2018-12-01  4.0405  1.7246  804.74
2019-01-01  4.0963  1.7482  810.01
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
                  FR        GR        IT
2018-11-01       NaN       NaN       NaN
2018-12-01  0.013810  0.013684  0.006549
2019-01-01  0.053365  0.059318  0.061876
Out[1109]:
FR GR IT
2018-11-01 NaN NaN NaN
2018-12-01 0.013810 0.013684 0.006549
2019-01-01 0.053365 0.059318 0.061876
In [1110]:
df = pd.DataFrame({
     '2016': [1769950, 30586265],
     '2015': [1500923, 40912316],
     '2014': [1371819, 41403351]},
     index=['GOOG', 'APPL'])
print(df)
df.pct_change(axis='columns')
          2016      2015      2014
GOOG   1769950   1500923   1371819
APPL  30586265  40912316  41403351
Out[1110]:
      2016      2015      2014
GOOG   NaN -0.151997 -0.086016
APPL   NaN  0.337604  0.012002
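
  • A minimal sketch (not part of the original notebook) of the periods argument mentioned above: on monthly data, periods=12 compares each value with the same month one year earlier, i.e. a year-over-year change. The index and values below are invented for illustration.
idx = pd.date_range('2018-01-01', periods=13, freq='MS')          # 13 month-start timestamps
sales = pd.DataFrame({'sales': range(100, 230, 10)}, index=idx)   # 100, 110, ..., 220
sales.pct_change(periods=12)   # first 12 rows are NaN; the last row is (220 - 100) / 100 = 1.2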

pandas.DataFrame.prod

  • Multiply the elements along each column (default) or along each row (axis='columns'); a skipna/min_count sketch follows the example
In [1111]:
df = pd.DataFrame({
     'A': [1, 3],
     'B': [3, 4],
     'C': [2, 4]},
     index=['GOOG', 'APPL'])
print(df)
print('~-'*25)
print(df.prod())
print('~-'*25)
print(df.prod(axis='columns'))
      A  B  C
GOOG  1  3  2
APPL  3  4  4
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
A     3
B    12
C     8
dtype: int64
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
GOOG     6
APPL    48
dtype: int64
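
  • A small sketch (assuming the defaults skipna=True, min_count=0) of how NaN interacts with prod:
df_nan = pd.DataFrame({'A': [1.0, np.nan], 'B': [3.0, 4.0]})
print(df_nan.prod())              # NaN is skipped: A -> 1.0, B -> 12.0
print(df_nan.prod(min_count=2))   # column A has only one valid value, so its product becomes NaN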

pandas.DataFrame.quantile

  • The p-quantile: p can be any value between 0 and 1
  • Position of the p-quantile (1-based):
    • pos = 1 + (n - 1) * p
    • value = i + (j - i) * fraction
  • To find the p-quantile, first sort all the values in ascending order;
  • if pos computed from the formula above is an integer, the quantile is simply the value at that position;
  • if pos has a fractional part, the quantile lies between two values and the value formula interpolates between them,
  • where fraction is the fractional part of pos,
  • and i, j are the values just before and after that position (a quick check against NumPy follows the example below)
In [1112]:
df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=['a', 'b'])
print(df)
print('~-'*25)
print(df.quantile(.1))  
# column a:
# pos = 1 + (4 - 1)*0.1 = 1.3 
# fraction = 0.3
# ret = 1 + (2 - 1) * 0.3 = 1.3
# column b:
# pos = 1.3 
# ret = 1 + (10 - 1) * 0.3 = 3.7
print('~-'*25)
print(type(df.quantile([.1, .5, .1, .5, .3, .2])))
print(df.quantile([.1, .5, .1, .5, .3, .2]))
   a    b
0  1    1
1  2   10
2  3  100
3  4  100
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
a    1.3
b    3.7
Name: 0.1, dtype: float64
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
<class 'pandas.core.frame.DataFrame'>
       a     b
0.1  1.3   3.7
0.5  2.5  55.0
0.1  1.3   3.7
0.5  2.5  55.0
0.3  1.9   9.1
0.2  1.6   6.4
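
  • A quick check (a sketch, not from the original notebook) of the position formula against NumPy's default linear interpolation:
a = np.array([1, 2, 3, 4])
p = 0.1
pos = 1 + (len(a) - 1) * p                             # 1.3 -> between the 1st and 2nd sorted values
i, j = np.sort(a)[int(pos) - 1], np.sort(a)[int(pos)]
print(i + (j - i) * (pos - int(pos)))                  # 1 + (2 - 1) * 0.3 = 1.3
print(np.percentile(a, p * 100))                       # 1.3 as well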

pandas.DataFrame.rank

  • rank gives each value its position in the sorted order of its column of the original Series or DataFrame; tied values receive the average of their ranks by default (other tie-breaking methods are sketched after the example)
In [1113]:
df = pd.DataFrame(np.array([[1, 3], [2., 2], [3, 1], [3, 1]]), columns=['a', 'b'])
print(df)
print('~-'*25)
print(df.rank())  
     a    b
0  1.0  3.0
1  2.0  2.0
2  3.0  1.0
3  3.0  1.0
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
     a    b
0  1.0  4.0
1  2.0  3.0
2  3.5  1.5
3  3.5  1.5
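
  • A sketch of tie-breaking methods other than the default 'average', reusing the frame above: 'min' gives every tied value the lowest rank of its group, 'first' ranks ties in order of appearance.
print(df.rank(method='min'))    # a: 1.0 2.0 3.0 3.0   b: 4.0 3.0 1.0 1.0
print(df.rank(method='first'))  # a: 1.0 2.0 3.0 4.0   b: 4.0 3.0 1.0 2.0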

pandas.DataFrame.round

  • DataFrame.round(decimals=0, *args, **kwargs)
  • Round a DataFrame to a variable number of decimal places.
  • Round each value to the given number of decimal places; decimals may also be a dict of per-column precisions, as in the example
In [1114]:
df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
print(df)
print('~-'*25)
print(df.round(2))
print('~-'*25)
df.round({'A': 1, 'C': 2})
               A         B         C
first   0.704474  0.686108  0.084263
second  0.834782  0.348655  0.857906
third   0.627151  0.743559  0.115237
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
           A     B     C
first   0.70  0.69  0.08
second  0.83  0.35  0.86
third   0.63  0.74  0.12
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Out[1114]:
          A         B     C
first   0.7  0.686108  0.08
second  0.8  0.348655  0.86
third   0.6  0.743559  0.12

pandas.DataFrame.nunique

  • Return the number of distinct values in each column (or in each row with axis=1); NaN handling is sketched after the example
In [1115]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
print(df)
print('~-'*25)
print(df.nunique())
df.nunique(axis=1)
   A  B
0  1  1
1  2  1
2  3  1
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
A    3
B    1
dtype: int64
Out[1115]:
0    1
1    2
2    2
dtype: int64
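
  • A small sketch (not in the original notebook) of NaN handling: NaN is ignored by default; pass dropna=False to count it as a distinct value.
df_nan = pd.DataFrame({'A': [1, 2, np.nan], 'B': [1, 1, 1]})
print(df_nan.nunique())               # A -> 2, B -> 1
print(df_nan.nunique(dropna=False))   # A -> 3, B -> 1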

Reindexing / Selection / Label manipulation

pandas.DataFrame.add_prefix

pandas.DataFrame.add_suffix

In [1116]:
df = pd.DataFrame({'A': [1, 2, 3, 4],  'B': [3, 4, 5, 6]})
print(df)
print('~-'*25)
print(df.add_prefix('col_'))
print('~-'*25)
print(df.add_suffix('_col'))
   A  B
0  1  3
1  2  4
2  3  5
3  4  6
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   col_A  col_B
0      1      3
1      2      4
2      3      5
3      4      6
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A_col  B_col
0      1      3
1      2      4
2      3      5
3      4      6

pandas.DataFrame.align

In [1117]:
df1 = pd.DataFrame({'A': [1, 2, 3, 4],  'B': [3, 4, 5, 6]})
df2 = pd.DataFrame({'X': [1, 2, 3, 4],  'Y': [3, 4, 5, 6]}, index=[3,2,1,4])
print(df1)
print('~-'*25)
print(df2)
print('~-'*25)
print(type(df1.align(df2, join='inner', axis=0)))
print(df1.align(df2, join='outer', axis=0)[0])
print('~-'*25)
print(df1.align(df2, join='outer', axis=0)[1])
   A  B
0  1  3
1  2  4
2  3  5
3  4  6
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   X  Y
3  1  3
2  2  4
1  3  5
4  4  6
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
<class 'tuple'>
     A    B
0  1.0  3.0
1  2.0  4.0
2  3.0  5.0
3  4.0  6.0
4  NaN  NaN
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
     X    Y
0  NaN  NaN
1  3.0  5.0
2  2.0  4.0
3  1.0  3.0
4  4.0  6.0

pandas.DataFrame.at_time

In [1118]:
i = pd.date_range('2018-12-09', periods=4, freq='12H')
ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
print(ts)
ts.at_time('12:00')
                     A
2018-12-09 00:00:00  1
2018-12-09 12:00:00  2
2018-12-10 00:00:00  3
2018-12-10 12:00:00  4
Out[1118]:
                     A
2018-12-09 12:00:00  2
2018-12-10 12:00:00  4

pandas.DataFrame.between_time

In [1119]:
i = pd.date_range('2018-12-09', periods=6, freq='4H')
ts = pd.DataFrame({'A': [1,2,3,4, 5, 6]}, index=i)
print(ts)
ts.between_time('4:15', '16:45')
                     A
2018-12-09 00:00:00  1
2018-12-09 04:00:00  2
2018-12-09 08:00:00  3
2018-12-09 12:00:00  4
2018-12-09 16:00:00  5
2018-12-09 20:00:00  6
Out[1119]:
                     A
2018-12-09 08:00:00  3
2018-12-09 12:00:00  4
2018-12-09 16:00:00  5

pandas.DataFrame.drop

In [1120]:
df = pd.DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'])
print(df)
print('~-'*25)
print(df.drop(['B', 'C'], axis=1))
print('~-'*25)
print(df.drop(columns=['B', 'C']))
print('~-'*25)
df.drop([0, 1])
   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A   D
0  0   3
1  4   7
2  8  11
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A   D
0  0   3
1  4   7
2  8  11
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Out[1120]:
   A  B   C   D
2  8  9  10  11
In [1121]:
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
                             ['speed', 'weight', 'length']],
                     labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx, columns=['big', 'small'],
                  data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
                        [250, 150], [1.5, 0.8], [320, 250],
                        [1, 0.8], [0.3,0.2]])
print(df)
print('~-'*25)
print(df.drop(index='cow', columns='small'))
print('~-'*25)
print(df.drop(index='length', level=1)) # level must match the level that holds the label, otherwise nothing is dropped; 'length' lives on level 1
                 big  small
lama   speed    45.0   30.0
       weight  200.0  100.0
       length    1.5    1.0
cow    speed    30.0   20.0
       weight  250.0  150.0
       length    1.5    0.8
falcon speed   320.0  250.0
       weight    1.0    0.8
       length    0.3    0.2
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
                 big
lama   speed    45.0
       weight  200.0
       length    1.5
falcon speed   320.0
       weight    1.0
       length    0.3
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
                 big  small
lama   speed    45.0   30.0
       weight  200.0  100.0
cow    speed    30.0   20.0
       weight  250.0  150.0
falcon speed   320.0  250.0
       weight    1.0    0.8

pandas.DataFrame.drop_duplicates

  • Remove duplicate rows (a keep=False sketch follows the examples below)
  • DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)
  • Return DataFrame with duplicate rows removed, optionally only considering certain columns
    • keep : {‘first’, ‘last’, False}, default ‘first’
      • first : Drop duplicates except for the first occurrence.
      • last : Drop duplicates except for the last occurrence.
      • False : Drop all duplicates.
In [1122]:
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
                             ['speed', 'weight', 'length']],
                     labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx, columns=['big', 'small'],
                  data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
                        [250, 150], [1.5, 0.8], [45, 250],
                        [1.5, 0.8], [0.3,0.2]])
print(df)
print('~'*25)
print(df.drop_duplicates(subset='big', keep='last', inplace=False))
                 big  small
lama   speed    45.0   30.0
       weight  200.0  100.0
       length    1.5    1.0
cow    speed    30.0   20.0
       weight  250.0  150.0
       length    1.5    0.8
falcon speed    45.0  250.0
       weight    1.5    0.8
       length    0.3    0.2
~~~~~~~~~~~~~~~~~~~~~~~~~
                 big  small
lama   weight  200.0  100.0
cow    speed    30.0   20.0
       weight  250.0  150.0
falcon speed    45.0  250.0
       weight    1.5    0.8
       length    0.3    0.2
In [1123]:
df = pd.DataFrame({'A': [1, 2, 3, 4],  'B': [3, 4, 3, 3]})
print(df)
print('~-'*25)
print(df.drop_duplicates('B', keep='last'))
   A  B
0  1  3
1  2  4
2  3  3
3  4  3
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
   A  B
1  2  4
3  4  3
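
  • A sketch of keep=False, reusing the small frame above: every row whose value in 'B' occurs more than once is dropped, instead of one occurrence being kept.
print(df.drop_duplicates('B', keep=False))   # only the row with B == 4 survives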

pandas.DataFrame.duplicated

In [1124]:
df = pd.DataFrame({'A': [1, 2, 3, 1],  'B': [3, 4, 3, 3]})
print(df)
print('~-'*25)
print(df.duplicated(['A','B'], keep='last'))
   A  B
0  1  3
1  2  4
2  3  3
3  1  3
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
0     True
1    False
2    False
3    False
dtype: bool

pandas.DataFrame.equals

  • Returns True only when the two objects have the same shape, labels and values; NaNs in the same positions count as equal (contrast with element-wise ==, sketched after the example)
In [1125]:
df1 = pd.DataFrame({'A': [1, 2, 3, 1],  'B': [3, 4, np.NAN, 3]})
df2 = pd.DataFrame({'A': [1, 2, 3, 1],  'B': [3, 4, np.NAN, 3]})
print(df1)
print('~-'*25)
df1.equals(df2)
   A    B
0  1  3.0
1  2  4.0
2  3  NaN
3  1  3.0
~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-
Out[1125]:
True
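
  • A sketch contrasting equals() with element-wise comparison, reusing df1/df2 above: == treats NaN as unequal to NaN, while equals() treats NaNs in the same positions as equal.
print((df1 == df2).all().all())   # False, because NaN != NaN element-wise
print(df1.equals(df2))            # True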

pandas.DataFrame.filter

  • DataFrame.filter(items=None, like=None, regex=None, axis=None)
  • Subset rows or columns of dataframe according to labels in the specified index.
  • Note that this routine does not filter a dataframe on its contents. The filter is applied to the labels of the index.
In [1126]:
df=pd.DataFrame([
     [1,2,3]
    ,[4,5,6]
], index=['mouse','rabbit'], columns=['one','two','three'])
print(df)
print('-'*30)
print(df.filter(items=['one','two']))  # keep only the columns listed in items
print('-'*30)
print(df.filter(regex='e$', axis=1))   # filter on column labels: keep columns whose label ends with 'e'
print('-'*30)
print(df.filter(like='bbi', axis=0))   # filter on row labels: keep rows whose index label contains 'bbi'
        one  two  three
mouse     1    2      3
rabbit    4    5      6
------------------------------
        one  two
mouse     1    2
rabbit    4    5
------------------------------
        one  three
mouse     1      3
rabbit    4      6
------------------------------
        one  two  three
rabbit    4    5      6

pandas.DataFrame.first

In [1127]:
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
# Get the rows within the first 4 calendar days:
print(ts.first('4D'))
print('-'*30)
ts.first('5D')
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
2018-04-17  2
2018-04-19  3
2018-04-21  4
2018-04-23  2
------------------------------
            A
2018-04-09  1
2018-04-11  2
------------------------------
Out[1127]:
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3

pandas.DataFrame.head

In [1128]:
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
print(ts.head())
ts.head(3)
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
2018-04-17  2
2018-04-19  3
2018-04-21  4
2018-04-23  2
------------------------------
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
2018-04-17  2
Out[1128]:
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3

pandas.DataFrame.idxmax

pandas.DataFrame.idxmin

In [1129]:
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,16,2,3,4,2], 'B': [5,8,3,9,2,3,6,12]}, index=i)
print(ts)
print('-'*30)
# Index label of the maximum in each column:
print(ts.idxmax(axis=0))
print('-'*30)
print(ts.idxmax(axis=1)) # on a tie, the first column (A) wins
print('-'*30)
print(ts.idxmin(axis=0))
print('-'*30)
ts.idxmin(axis=1) # on a tie, the first column (A) wins
             A   B
2018-04-09   1   5
2018-04-11   2   8
2018-04-13   3   3
2018-04-15  16   9
2018-04-17   2   2
2018-04-19   3   3
2018-04-21   4   6
2018-04-23   2  12
------------------------------
A   2018-04-15
B   2018-04-23
dtype: datetime64[ns]
------------------------------
2018-04-09    B
2018-04-11    B
2018-04-13    A
2018-04-15    A
2018-04-17    A
2018-04-19    A
2018-04-21    B
2018-04-23    B
Freq: 2D, dtype: object
------------------------------
A   2018-04-09
B   2018-04-17
dtype: datetime64[ns]
------------------------------
Out[1129]:
2018-04-09    A
2018-04-11    A
2018-04-13    A
2018-04-15    B
2018-04-17    A
2018-04-19    A
2018-04-21    A
2018-04-23    A
Freq: 2D, dtype: object

pandas.DataFrame.last

In [1130]:
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
# Get the rows within the last 4 calendar days:
print(ts.last('4D'))
print('-'*30)
ts.last('5D')
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
2018-04-17  2
2018-04-19  3
2018-04-21  4
2018-04-23  2
------------------------------
            A
2018-04-21  4
2018-04-23  2
------------------------------
Out[1130]:
            A
2018-04-19  3
2018-04-21  4
2018-04-23  2

pandas.DataFrame.reindex

  • DataFrame.reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None)
  • Conform DataFrame to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index.
  • A new object is produced unless the new index is equivalent to the current one and copy=False
In [1131]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame({
    'http_status': [200,200,404,404,301],
    'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
    index=index)
print(df)
print('-'*30)
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']
print(df.reindex(new_index))
df.reindex(new_index, fill_value=0)
           http_status  response_time
Firefox            200           0.04
Chrome             200           0.02
Safari             404           0.07
IE10               404           0.08
Konqueror          301           1.00
------------------------------
               http_status  response_time
Safari               404.0           0.07
Iceweasel              NaN            NaN
Comodo Dragon          NaN            NaN
IE10                 404.0           0.08
Chrome               200.0           0.02
Out[1131]:
               http_status  response_time
Safari                 404           0.07
Iceweasel                0           0.00
Comodo Dragon            0           0.00
IE10                   404           0.08
Chrome                 200           0.02
In [1132]:
date_index1 = pd.date_range('11/1/2018', periods=6, freq='D')
date_index2 = pd.date_range('10/29/2018', periods=10, freq='D')
print(date_index1)
print('-'*30)
print(date_index2)
df = pd.DataFrame({"prices": [100, 105, np.nan, 100, 89, 88]},
                    index=date_index1)
print(df)
print('~'*50)
print(df.reindex(date_index2)) # align to the new index; labels missing from the original index become NaN
df.reindex(date_index2, method='bfill') # backfill: each newly introduced label takes the next valid value (a forward-fill variant is sketched after this output)
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06'],
              dtype='datetime64[ns]', freq='D')
------------------------------
DatetimeIndex(['2018-10-29', '2018-10-30', '2018-10-31', '2018-11-01',
               '2018-11-02', '2018-11-03', '2018-11-04', '2018-11-05',
               '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
            prices
2018-11-01   100.0
2018-11-02   105.0
2018-11-03     NaN
2018-11-04   100.0
2018-11-05    89.0
2018-11-06    88.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            prices
2018-10-29     NaN
2018-10-30     NaN
2018-10-31     NaN
2018-11-01   100.0
2018-11-02   105.0
2018-11-03     NaN
2018-11-04   100.0
2018-11-05    89.0
2018-11-06    88.0
2018-11-07     NaN
Out[1132]:
            prices
2018-10-29   100.0
2018-10-30   100.0
2018-10-31   100.0
2018-11-01   100.0
2018-11-02   105.0
2018-11-03     NaN
2018-11-04   100.0
2018-11-05    89.0
2018-11-06    88.0
2018-11-07     NaN
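
  • A sketch of the forward-fill variant: with method='ffill' each newly introduced label takes the last valid value before it, so 2018-11-07 is filled with 88.0, while labels before the start of the original index stay NaN; the pre-existing NaN on 2018-11-03 is untouched because filling only applies to labels missing from the original index.
df.reindex(date_index2, method='ffill')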
In [1133]:
df = pd.DataFrame({"A": [100, 105, np.nan],"B": [100, 105, 100],"C": [100, 105, np.nan]}, index=['a', 'b', 'c'])
print(df)
print('~'*50)
df.reindex(['B', 'C', 'D'], axis=1)
       A    B      C
a  100.0  100  100.0
b  105.0  105  105.0
c    NaN  100    NaN
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Out[1133]:
     B      C   D
a  100  100.0 NaN
b  105  105.0 NaN
c  100    NaN NaN

pandas.DataFrame.reindex_like

In [1134]:
df1 = pd.DataFrame({"X": [10000, 10500, np.nan],"B": [10000, 10500, 10000],"C": [10000, 10500, np.nan]}, index=['x', 'b', 'c'])
df2 = pd.DataFrame({"A": [1, 1.5, np.nan],"B": [1, 1.5, 1],"C": [1, 1.5, np.nan]}, index=['a', 'b', 'c'])
print(df1)
print('~'*50)
print(df2)
print('~'*50)
df1.reindex_like(df2) # take df2's index and columns, but keep df1's data; anything without a match becomes NaN
         X      B        C
x  10000.0  10000  10000.0
b  10500.0  10500  10500.0
c      NaN  10000      NaN
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     A    B    C
a  1.0  1.0  1.0
b  1.5  1.5  1.5
c  NaN  1.0  NaN
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Out[1134]:
    A        B        C
a NaN      NaN      NaN
b NaN  10500.0  10500.0
c NaN  10000.0      NaN

pandas.DataFrame.rename

In [1135]:
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
print(df)
#df.rename(index={0:'a', 1:'b', 2:'c'}, columns={"A": "a", "B": "c"})
df.rename(index={1:'a', 0:'b', 2:'啊'}, columns={"A": "a", "B": "c"}).rename(str.upper) # (str.lower, axis='columns')
   A  B
0  1  4
1  2  5
2  3  6
Out[1135]:
   a  c
B  1  4
A  2  5
啊  3  6

pandas.DataFrame.reset_index

In [1136]:
df = pd.DataFrame([('bird',    389.0),
                   ('bird',     24.0),
                   ('mammal',   80.5),
                   ('mammal', np.nan)],
                  index=['falcon', 'parrot', 'lion', 'monkey'],
                  columns=('class', 'max_speed'))
print(df)
print('~'*50)
print(df.reset_index())
print(df.reset_index().columns)
df.reset_index(drop=True) # use the drop parameter to avoid the old index being added as a column
         class  max_speed
falcon    bird      389.0
parrot    bird       24.0
lion    mammal       80.5
monkey  mammal        NaN
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    index   class  max_speed
0  falcon    bird      389.0
1  parrot    bird       24.0
2    lion  mammal       80.5
3  monkey  mammal        NaN
Index(['index', 'class', 'max_speed'], dtype='object')
Out[1136]:
    class  max_speed
0    bird      389.0
1    bird       24.0
2  mammal       80.5
3  mammal        NaN
In [1137]:
index = pd.MultiIndex.from_tuples([('bird', 'falcon'),  # from_tuples pairs nicely with zip()
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal',