import numpy as np
import pandas as pd
pandas.DataFrame(
data=None
, index=None
, columns=None
, dtype=None
, copy=False
)
data : numpy ndarray (structured or homogeneous), dict, or DataFrame
Dict can contain Series, arrays, constants, or list-like objects
Changed in version 0.23.0: If data is a dict, argument order is maintained for Python 3.6 and later.
index : Index or array-like
Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided
columns : Index or array-like
Column labels to use for resulting frame. Will default to RangeIndex (0, 1, 2, …, n) if no column labels are provided
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer
copy : boolean, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input
s=pd.Series(np.random.randn(5))
pd.DataFrame(s) # Series的每一個元素是DataFrame的一條行記錄
s=pd.Series([1,2,3,4,5],name='Jasper')
pd.DataFrame(s, columns=['Jasper','Casper'])
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])
}
pd.DataFrame(d)
pd.DataFrame(d, index=['d', 'b', 'a'])
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
df=pd.DataFrame(d)
df.index
pd.DataFrame(d)
df.columns
# 也可直接根據列表創建
l=[1., 2., 3., 4.]
pd.DataFrame(l, columns=['one'])
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame(d)
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame.from_dict(d)
# orient='index'
# If you pass orient='index', the keys will be the row labels.
# In this case, you can also pass the desired column names
d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]}
pd.DataFrame.from_dict(d
, orient='index'
, columns=['one', 'two', 'three', 'four']
)
d = {'one': np.array([1.,2.,3.,4.]), 'two': np.array([4.,2.,3.,1.])}
pd.DataFrame(d)
d = {'one': np.array([1.,2.,3.,4.]), 'two': np.array([4.,2.,3.,1.])}
pd.DataFrame(d, index=['A','B','C','D'])
l = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(l, index=['first', 'second'], columns=['c', 'a', 'b'])
t={ # 字典里面嵌套字典, key里面嵌套tuples, values里面嵌套字典
('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}
}
for i in t.keys(): # 獲取字典key的方法
print(type(i), i)
print('-'*100)
for v in t.values(): # 獲取字典value的方法
print(type(v), v)
print('-'*100)
for j in t.keys(): # 獲取字典key中的元組的元素的方法
print(j[0], j[1])
print('-'*100)
for x, y in t.keys(): # 獲取字典key中的元組的元素的第二種方法
print(x, y)
print('-'*100)
for v2 in t.values(): # 獲取字典value中的字典的元素的key和Value
for k3, v3 in v2.items():
print(k3, ' | ', v3)
print('-'*20)
pd.DataFrame(t)
pd.DataFrame({ # key里面的tuple成为列组合,
# value里面的dict里面key里面的tuple成为行组合
# value里面的dict里面value成为DataFrame的元素
('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
})
# 再舉一例
pd.DataFrame({
('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
})
# 注意與數組字典的區別:直接使用數組(而非字典)生成的DataFrame
ar=np.array([[1.,2.,3.,4.],[3,4,5,6]])
print(pd.DataFrame(ar, dtype=int, columns=list('ABCD'))) # 數組元素(列表)內的元素形成各列
print(pd.DataFrame(ar).index)
pd.DataFrame(ar)
data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])
data[:] = [(1,2.,'Hello'), (2,3.,"World")]
print(type(data))
print(data)
pd.DataFrame(data, index=['first', 'second'], columns=['C', 'A', 'B'])
pd.DataFrame.from_records(data, index='C')
arrays = [[1, 1, 2, 2], ['Oct', 'Nov', 'Oct', 'Nov']]
index = pd.MultiIndex.from_arrays(arrays, names=('Quarter', 'periods'))
print(index)
columns = pd.MultiIndex.from_tuples([('company1', 'area1'),
('company2', 'area2')])
print(columns)
df = pd.DataFrame([(389.0, 18),
( 24.0, 59),
( 80.5, None),
(np.nan, 550)],
index=index,
columns=columns)
df
index = pd.MultiIndex.from_tuples([('bird', 'falcon'), # 與zip()搭配使用效果會很好
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
('species', 'type')])
print(index)
df = pd.DataFrame([(389.0, 'fly'),
( 24.0, 'fly'),
( 80.5, 'run'),
(np.nan, 'jump')],
index=index,
columns=columns)
df
m_index=pd.Index([("A","x1"),("A","x2"),("B","y1"),("B","y2"),("B","y3"),("B","y4"),("B","y5"),("B","y6"),("B","y7")],name=["class1","class2"])
print(m_index)
df = pd.DataFrame(
data=[[45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8], [320, 250], [1, 0.8], [0.3,0.2]]
, index=m_index
, columns=['big', 'small']
)
df
m_index=pd.MultiIndex.from_product([["Y1","Y2"],['Q1','Q2','Q3','Q4']],names=["Year","Quarter"])
print(m_index)
df=pd.DataFrame(
np.random.randint(5,15,(3,8))
, index=['one','two','three']
, columns=m_index
)
df
frame : 要处理的数据集 DataFrame
id_vars : tuple, list, or ndarray, optional
不需要被转换的列名,用作标识变量 Column(s) to use as identifier variables.
value_vars : tuple, list, or ndarray, optional
需要转换的列名,如果剩下的列全部都要转换,就不用写了 Column(s) to unpivot . If not specified, uses all columns that are not set as id_vars.
var_name : scalar
自定义设置对应的列名 Name to use for the ‘variable’ column. If None it uses frame.columns.name or ‘variable’.
value_name : scalar, default ‘value’
自定义设置对应的列名 Name to use for the ‘value’ column.
col_level : int or string, optional
如果列是MultiIndex,则使用此级别 If columns are a MultiIndex then use this level to melt.
df = pd.DataFrame({
'A': {0: 'a', 1: 'b', 2: 'c'}
, 'B': {0: 1, 1: 3, 2: 5}
, 'C': {0: 2, 1: 4, 2: 6}
})
print(df)
pd.melt(df, id_vars=['A'], value_vars=['B'])
df.columns = [list('ABC'), list('DEF')]
print(df)
pd.melt(
df
, id_vars=[('A','D')]
, value_vars=[('B','E'),('C','F')]
)
values : column to aggregate, optional
index : column, Grouper, array, or list of the previous
If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values.
columns : column, Grouper, array, or list of the previous
If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values.
aggfunc : function, list of functions, dict, default numpy.mean
If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) If dict is passed, the key is column to aggregate and value is function or list of functions
fill_value : scalar, default None
Value to replace missing values with
margins : boolean, default False
Add all row / columns (e.g. for subtotal / grand totals)
dropna : boolean, default True
Do not include columns whose entries are all NaN
margins_name : string, default ‘All’
Name of the row / column that will contain the totals when margins is True.
df = pd.DataFrame({
'A': {0: 'a', 1: 'b', 2: 'c'}
, 'B': {0: 1, 1: 3, 2: 5}
, 'C': {0: 2, 1: 4, 2: np.NaN}
})
print(df)
print('-'*50)
df.columns = [list('ABC'), list('DEF')]
print(df)
print('-'*50)
df_m=pd.melt(
df
, id_vars=[('A','D')]
, value_vars=[('B','E'),('C','F')]
)
df_m.columns = list('ABCD')
print(df_m)
print('-'*50)
table = pd.pivot_table(
df_m
, values='D'
, index=['B','C']
, columns=['A']
, aggfunc=np.mean
, fill_value=100
)
table
a = np.array(["foo", "foo", "foo", "foo", "nam", "bar", "bar", "nam", "foo", "foo", "foo"], dtype=object)
b = np.array(["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], dtype=object)
c = np.array(["dul", "dul", "shi", "dul", "jas", "shi", "shi", "cas", "shi", "shi", "shi"], dtype=object)
d = np.array([100,1,1,1,1,1,1,1,1,1,1], dtype=int)
pd.crosstab(
a
, [b, c]
, values=d
, rownames=['a']
, colnames=['Jasper', 'Casper']
, aggfunc=np.sum
, dropna=True
, margins=True
, margins_name='Total'
)
# 交换index和column的位置(及其名称的位置),就相当于转置
# 此处计算的是count,计算count的应用场景会更多一些
pd.crosstab(
[b, c]
, a
, rownames=['Jasper', 'Casper']
, colnames=['a']
)
print(type(pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)))
print(type(pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)))
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
print(pd.cut(np.array([1, 7, 5, 4, 6, 3, 9, 9]), 3, labels=["bad", "medium", "good"]))
pd.cut(np.array([1, 7, 5, 4, 6, 3, 9, 9]), 3, labels=["bad", "medium", "good"]).value_counts().sort_values(ascending=False)
s = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])
print(s)
print(pd.cut(s, 3))
pd.cut(s, 3).value_counts()
pd.qcut(np.random.randn(1000000), 5).value_counts() # 区间变量数相同
pd.cut(np.random.randn(1000000), 5).value_counts() # 区间变量符合正态分布
df1=pd.DataFrame({'name':['kate','herz','catherine','sally'], 'age':[25,28,39,35]})
df2=pd.DataFrame({'name':['kate','herz','sally'], 'score':[70,60,90]})
print(df1)
print('-'*50)
print(df2)
print('-'*50)
print(pd.merge(df1,df2))
print('-'*50)
print(pd.merge(df1,df2, how='outer'))
df1.merge(df2)
df1=pd.DataFrame({'name1':['kate','herz','catherine','sally'], 'age':[25,28,39,35]})
df2=pd.DataFrame({'name2':['kate','herz','sally'], 'score':[70,60,90]})
print(df1.merge(df2, left_on='name1', right_on='name2', how='outer'))
df1.merge(df2, left_on='name1', right_on='name2', how='outer').drop('name2',axis=1).fillna(0)
df1 = pd.DataFrame([['a', 1], ['b', 2], ['c', 5]], columns=['letter', 'number'])
print(df1)
print('-'*50)
df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])
print(df2)
print('-'*50)
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], columns=['letter', 'number', 'animal'])
print(df3)
print('-'*50)
print(pd.concat([df1, df2]))
print('-'*50)
print(pd.concat([df1, df3], sort=False, keys=['oneblock', 'twoblock'], names=['block', 'number']))
print('-'*50)
print(pd.concat([df1, df3], sort=False, ignore_index=True)) # 忽略原有的index,重新建立index
print(pd.concat([df1, df2], axis=1, join='inner'))
print('-'*50)
print(pd.concat([df1, df2], axis=1, join='outer'))
df1=pd.DataFrame({'Red':[1,3,5],'Green':[5,0,3]},index=list('abd'))
df2=pd.DataFrame({'Blue':[1,9],'Yellow':[6,6]},index=list('ce'))
df3=pd.DataFrame({'Brown':[3,4,5],'White':[1,1,2]},index=list('aed'))
print(df1.join([df2,df3])) # 默认是left join
#df1.join([df2,df3], sort=False, how='outer') # A future version of pandas will change to not sort by default.
df1.join(df2, sort=False, how='outer').join(df3, sort=False, how='outer')
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
right2 = pd.DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2'])
print(left)
print(right)
print(right2)
print(left.join(right, on='key').join(right2, on='key'))
result = left.join([right, right2], how='outer')
result
df = pd.DataFrame({'key':['b','b','a','c','a','b'], 'data':np.random.randn(6)})
print(df)
print(pd.get_dummies(df['key']))
dummies = pd.get_dummies(df['key'],prefix = 'key')
dummies
df_with_dummy = df[['data']].join(dummies) # 與原df的數值結合
print(df_with_dummy)
s = ['a', 'b', np.nan, 'a']
print(s)
print('-'*50)
print(pd.get_dummies(s))
print('-'*50)
print(pd.get_dummies(s, dummy_na=True)) # 添加顯示nan的列
index = pd.MultiIndex.from_tuples([('bird', 'falcon'), # 與zip()搭配使用效果會很好
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
( 24.0, 'fly'),
( 80.5, 'run'),
(np.nan, 'jump')],
index=index,
columns=columns)
print(df)
print('-'*30)
order=[1,0] # 若提供3個元素,則報錯:Too many levels: Index has only 2 levels, not 3
print(df.reorder_levels(order)) # 交換了行level中class和name的位置
print('-'*30)
print(df.reorder_levels(order, axis=1)) # 交換列標籤的上下位置
df = pd.DataFrame({
'col1': ['A', 'A', 'B', np.nan, 'D', 'A'],
'col2': [3, 1, 9, 8, 7, 2],
'col3': [0, 1, 9, 4, 2, 3],
})
print(df)
print('-'*30)
print(df.sort_values(by=['col1']))
print('-'*30)
print(df.sort_values(by=['col1', 'col2']))
print('-'*30)
print(df.sort_values(by='col1', ascending=False)) # NaN始終排在最後
print('-'*30)
df.sort_values(by='col1', ascending=False, na_position='first') # NaN這樣就排到前面了
df = pd.DataFrame({
'col1': ['A', 'A', 'B', np.nan, 'D', 'A'],
'col2': [3, 1, 9, 8, 7, 2],
'col3': [0, 1, 9, 4, 2, 3],
})
print(df.sort_values(by='col1', ascending=False, na_position='first'))
df.sort_values(by='col1', ascending=False, na_position='first').sort_index()
df = pd.DataFrame({'a': [1, 10, 8, 10, -1, 5, 7],
'b': list('abdcefg'),
'c': [21.0, 2.0, np.nan, 5.0, 4.0, 2, 1]})
print(df)
print('-'*30)
df['rows_total']=df.apply(lambda x: x['a']+x['c'], axis=1)
print(df.sort_values(by='rows_total', ascending=False))
df.nlargest(5, ['c', 'a'], keep='first') # 是按照列表中的列的合計,進行排序,然後選擇其中的n條記錄的
df = pd.DataFrame({'a': [1, 10, 8, 10, -1, 5, 7],
'b': list('abdcefg'),
'c': [21.0, 2.0, np.nan, 5.0, 4.0, 2, 1]})
print(df)
print('-'*30)
df['rows_total']=df.apply(lambda x: x['a']+x['c'], axis=1)
print(df.sort_values(by='rows_total', ascending=True))
df.nsmallest(5, ['c', 'a'], keep='first')
index = pd.MultiIndex.from_tuples([('bird', 'falcon'), # 與zip()搭配使用效果會很好
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
( 24.0, 'fly'),
( 80.5, 'run'),
(np.nan, 'jump')],
index=index,
columns=columns)
print(df)
print('-'*30)
print(df.swaplevel(i=1, j=0, axis=1))
df.swaplevel(i='name', j='class', axis=0)
df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
index=['cat', 'dog'],
columns=['weight', 'height'])
print(df_single_level_cols)
print('-'*30)
print(df_single_level_cols.stack().index)
df_single_level_cols.stack() # 列轉行(索引)
multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
('height', 'm')])
df_multi_level_cols2 = pd.DataFrame([[np.NaN, 2.0], [3.0, 4.0]],
index=['cat', 'dog'],
columns=multicol2)
print(df_multi_level_cols2)
print('-'*30)
print(df_multi_level_cols2.stack().index)
print('-'*30)
print(df_multi_level_cols2.stack(dropna=False)) # NaN 保留, 缺省是不保留的
df_multi_level_cols2.stack(dropna=True)
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.unstack(level=0))
print('-'*30)
print(df.unstack(level=1))
print('-'*30)
print(df.unstack(level=-1)) # -1表示最後一個,在這裏也就是1
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.swapaxes(1,0)) # 交換行軸與列軸
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.swapaxes(1,0))
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.T) # 交換行軸與列軸
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.T)
df = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
print(df)
print('-'*30)
print(df.transpose())
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
('two', 'a'), ('two', 'b')])
df = pd.DataFrame(np.random.randint(1, 100, (4,2)), index=index, columns=['x', 'y'])
print(df)
print('-'*30)
print(df.transpose()) # 交換行軸與列軸
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
print(df1)
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
print(df2)
df1.append(df2, sort=False, ignore_index=True)
df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
print(df)
print('-'*30)
print(df.assign(total = lambda x:x['A']+ x['B'])) # 列之間的運算,如果想對行運算,可以移位(shift)後運算,也可以轉置後運算然後再轉置
print('-'*30)
print(df.assign(total = lambda x:np.sum([x.B]))) # 這個可以用來算結構百分比
print('-'*30)
print(df.assign(ln_A = lambda x: np.log(x.A)))
df = pd.DataFrame({'A': [1, 2, 3],
'B': [400, 500, 600]})
new_df = pd.DataFrame({'B': [4, 5, 6],
'C': [7, 8, 9]})
print(df)
print(new_df)
df.update(new_df)
df
df = pd.DataFrame({'A': [1, 2, 3],
'B': [400, 500, 600]})
new_df = pd.DataFrame({'B': [4, np.NAN, 6],
'C': [7, 8, 9]})
print(df)
print(new_df)
df.update(new_df) # 當new_df裏面是NaN時,仍使用該元素之前的值
df
print(pd.isnull('dog'))
print(pd.isnull(np.nan))
print(pd.isnull(None))
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.isnull(df)
print(pd.isna('dog'))
print(pd.isna(np.nan))
print(pd.isna(None))
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.isna(df)
print(pd.notnull('dog'))
print(pd.notnull(np.nan))
print(pd.notnull(None))
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.notnull(df)
print(pd.notna('dog'))
print(pd.notna(np.nan))
print(pd.notna(None))
df = pd.DataFrame([['ant', 'bee', np.NaN], ['dog', None, 'fly']])
print(df)
pd.notna(df)
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
"toy": [np.nan, 'Batmobile', 'Bullwhip'],
"born": [pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT]
})
print(df)
print('~-'*25)
print(df.dropna())
print('~-'*25)
# Drop the columns where at least one element is missing.
print(df.dropna(axis='columns'))
print('~-'*25)
# Drop the rows where all elements are missing.
print(df.dropna(how='all'))
print('~-'*25)
# Keep only the rows with at least 2 non-NA values.
print('thresh=2\n',df.dropna(thresh=2)) # 至少有2個元素非空
print('~-'*25)
# Define in which columns to look for missing values.
print(df.dropna(subset=['name', 'toy']))
print('~-'*25)
# Keep the DataFrame with valid entries in the same variable.
df.dropna(inplace=True) # inplace=True表示對原數組進行更改
df
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
[3, 4, np.nan, 1],
[np.nan, np.nan, np.nan, 5],
[np.nan, 3, np.nan, 4]],
columns=list('ABCD'))
print(df)
print('~-'*25)
print(df.fillna(0))
print('~-'*25)
# We can also propagate non-null values forward or backward.
print('use method:\n', df.fillna(method='bfill', axis=1))
print('~-'*25)
#Replace all NaN elements in column ‘A’, ‘B’, ‘C’, and ‘D’, with 0, 1, 2, and 3 respectively.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
print(df.fillna(value=values))
print('~-'*25)
#Only replace the first NaN element.
df.fillna(value=values, limit=1)
df = pd.DataFrame({'A': [9, 0, 8, 7, 4],
'B': [5, 0, 1, 3, 3],
'C': ['a', 'b', 'c', 'd', 'e']})
print(df)
print('~-'*25)
print(df.replace(0, 5))
print('~-'*25)
print(df.replace([0, 1, 2, 3], 4)) # 將所有的元素爲0,1,2或3的都替換爲4
print(df.replace([0, 1, 2, 3], [4, 3, 2, 1]))
print('~-'*25)
print(df.replace({0: 10, 1: 100}))
print('~-'*25)
#Regular expression `to_replace`
df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
'B': ['abc', 'bar', 'xyz']})
print(df)
print(df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}))
df.replace(to_replace=r'^ba.*$', value='new', regex=True)
df = pd.DataFrame([['1', '5', np.NaN], [None, '9', '7']])
print(df)
print('-'*50)
print(pd.to_numeric(df[0])) # 需要轉換爲list或Series
print('-'*50)
print(pd.to_numeric(df.T[0].T)) # 需要轉換爲list或Series
df = pd.DataFrame({'year': [2015, 2016],
'month': [2, 3],
'day': [4, 5]
})
pd.to_datetime(df)
pd.to_datetime(
np.arange(8)
, unit='ns'
, origin=pd.Timestamp('2018-11-06')
)
# unit : string, default ‘ns’
# unit of the arg (D,s,ms,us,ns) denote the unit,
# which is an integer or float number.
# This will be based off the origin.
# Example, with unit=’ms’ and origin=’unix’ (the default),
# this would calculate the number of milliseconds to the unix epoch start.
pd.to_timedelta(np.arange(8), unit='D')
# unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number
语法
pandas.date_range(
start=None
, end=None
, periods=None
, freq='D'
, tz=None
, normalize=False
, name=None
, closed=None
, **kwargs
)
主要参数说明
periods :固定时期,取值为整数或None
freq :日期偏移量,取值为string或DateOffset,默认为'D'
normalize :若参数为True表示将start、end参数值正则化到午夜时间戳
name :生成时间索引对象的名称,取值为string或None
closed :可以理解成在closed=None情况下返回的结果中,若closed=‘left’表示在返回的结果基础上,再取左开右闭的结果,若closed='right'表示在返回的结果基础上,再取做闭右开的结果
全部参数说明
start : str or datetime-like, optional
Left bound for generating dates.
end : str or datetime-like, optional
Right bound for generating dates.
periods : integer, optional
Number of periods to generate.
freq : str or DateOffset, default ‘D’ (calendar daily)
Frequency strings can have multiples, e.g. ‘5H’. See here for a list of frequency aliases.
tz : str or tzinfo, optional
Time zone name for returning localized DatetimeIndex, for example ‘Asia/Hong_Kong’. By default, the resulting DatetimeIndex is timezone-naive.
normalize : bool, default False
Normalize start/end dates to midnight before generating date range.
name : str, default None
Name of the resulting DatetimeIndex.
closed : {None, ‘left’, ‘right’}, optional
Make the interval closed with respect to the given frequency to the ‘left’, ‘right’, or both sides (None, the default).
**kwargs :
For compatibility. Has no effect on the result.
freq Offset Aliases
| Alias | Description |
|---|---|
| B | business day frequency |
| C | custom business day frequency |
| D | calendar day frequency |
| W | weekly frequency |
| M | month end frequency |
| SM | semi-month end frequency (15th and end of month) |
| BM | business month end frequency |
| CBM | custom business month end frequency |
| MS | month start frequency |
| SMS | semi-month start frequency (1st and 15th) |
| BMS | business month start frequency |
| CBMS | custom business month start frequency |
| Q | quarter end frequency |
| BQ | business quarter end frequency |
| QS | quarter start frequency |
| BQS | business quarter start frequency |
| A, Y | year end frequency |
| BA, BY | business year end frequency |
| AS, YS | year start frequency |
| BAS, BYS | business year start frequency |
| BH | business hour frequency |
| H | hourly frequency |
| T, min | minutely frequency |
| S | secondly frequency |
| L, ms | milliseconds |
| U, us | microseconds |
| N | nanoseconds |
pd.date_range(start='1/1/2018', end='1/11/2018')
# freq='W-MON' # 表示按周间隔,从每周一开始
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='W-MON')
dtindex
# freq='W' # 表示按周间隔,从start那一天开始
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='W')
dtindex
# freq='M' # 表示按月间隔,每月最后一天
dtindex = pd.date_range(start='1/28/2018', end='11/30/2018', freq='M')
dtindex
# freq='M' # 表示按3个月间隔,每月第一天
dtindex = pd.date_range(start='1/28/2018', end='11/30/2018', freq=pd.offsets.MonthBegin(3))
dtindex
# freq='SM' # 月中15天的一次间隔
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='SM')
dtindex
# freq='SM-2' # 月中2天的一次间隔
dtindex = pd.date_range(start='10/28/2018', end='11/30/2018', freq='SM-2')
dtindex
# freq='Q' # 每季度的最后一日
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q')
dtindex
# freq='Q-NOV' # 最后截止日是11月最有一日,然后倒退前面的日期,即提前一个月
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q-NOV')
dtindex
# freq='Q-NOV' # 最后截止日是10月最有一日,然后倒退前面的日期,即提前一个月
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='Q-OCT')
dtindex
# pd.date_range().to_period() # 显示为月份,不显示日期
dtindex = pd.date_range(start='4/28/2018', end='12/31/2018', freq='M').to_period()
dtindex
# pd.date_range().to_timestamp() # 显示为每月1日
# 之所以没有显示12月份,是因为to_period时对于12月30日,不会显示12月份最后一日(12月31日)
# 所以既然没有12月份,所以最终也不会显示12月1日
dtindex = pd.date_range(start='4/28/2018', end='12/30/2018', freq='M').to_period().to_timestamp()
dtindex
pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
pd.timedelta_range(start='1 day', periods=4)
pd.interval_range(start=0, end=6)
pd.interval_range(start=0, end=6, periods=4)
pd.interval_range(start=pd.Timestamp('2017-01-01'),end=pd.Timestamp('2017-01-04'))
pd.interval_range(start=0, periods=4, freq=1.5)
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.index
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.columns
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.dtypes
df=pd.DataFrame([[1.,2.,3.],[7.,8.,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.ftypes
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
df.get_dtype_counts()
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.select_dtypes(include='float'))
print('-'*50)
print(df.select_dtypes(include='int'))
print('-'*50)
print(df.select_dtypes(include='bool'))
print('-'*50)
print(df.select_dtypes(exclude=['int','bool']))
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.values
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.get_values()
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.axes
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.ndim
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.size
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.shape
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.memory_usage())
print('-'*50)
print(df.memory_usage(index=False))
print('-'*50)
print(df.memory_usage(deep=True))
print('-'*50)
df.info()
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
df.empty
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df)
print('-'*50)
print(df.dtypes)
print('-'*50)
print(df.fillna(0).astype('int64').dtypes)
df.fillna(0).astype('int64')
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df_c=df.copy(deep=True)
print(df)
print(df_c)
print('-'*50)
df_c.iloc[0,0]=100
print(df)
print(df_c)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df_c=df.copy(deep=False)
print(df)
print(df_c)
print('-'*50)
df_c.iloc[0,0]=100
print(df)
print(df_c)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.head(1)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.at['b','Y']
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
df.iat[1,1]
df=pd.DataFrame([['1',2,3.],['7',8,np.nan]], index=['a','b'], columns=list('XYZ'))
print(df.loc['b'])
print('-'*50)
print(df.loc[['b','a']])
print('-'*50)
print(df.loc['a':'b','Z'])
print('-'*50)
print(
df.loc[
df['Y']>5,['Z']
]
)
print('-'*50)
print(
df.loc[
df['Y']>5
]
)
print('-'*50)
print(
df.loc[
['b','a'],['Y']
]
)
print('-'*50)
print(
df.loc[
['b','a'],'X':'Z'
]
)
print('-'*50)
print(
df.loc[
:,'Y':'Z'
]
)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df.iloc[1])
print('-'*50)
print(df.iloc[[2,1]])
print('-'*50)
print(df.iloc[0:1,1])
print('-'*50)
print(
df.iloc[
[0,1],[2]
]
)
print('-'*50)
print(
df.iloc[
:,0:2
]
)
print('-'*50)
print(
df.iloc[
[2,1],0:3
]
)
print('-'*50)
print(
df[df.Z > 5]
)
print('-'*50)
print(
df[df.Z > 1].X
)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*50)
df.insert(1,'AAA+',['100',np.NaN,300.])
print(df)
pd.concat([df, pd.DataFrame([['1',2,3.,4]], columns=list('XYZJ')) ], ignore_index=True, sort=False)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
for i in df.items(): # 迭代列
print(i)
print('-'*30)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
for i in df.iterrows(): # 迭代行
print(i[0])
print('-'*30)
print(i[1])
print('~'*30)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
df.keys()
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print(type(df.lookup('c','Z')))
df.lookup('c','Z')
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
result = []
for row, col in zip(df.index,df.columns):
print(row, col)
result.append(df.at[row, col])
result
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*30)
y=df.pop('Y')
print(type(y),'\n', 'y=\n', y)
df
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
df.tail(1)
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
print('-'*30)
print(df.xs('a'))
print('-'*30)
print(df.xs('Z', axis=1))
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
['1', '1', '1', '1', '2', '2', '2', '2']
]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second', 'third'])
print(len(index))
df = pd.DataFrame(np.random.randn(8,5), index=index, columns=list('ABCDE'))
print(df)
print('-'*50)
print(df.xs(('foo', 'two')))
print('='*50)
print(df.xs('qux', level=0))
print('-'*50)
print(df.xs('one', level=1))
print('*'*50)
print(df.xs('1', level=2))
print('*'*50)
print(df.xs(('baz', 'one'), level=[0, 'second']))
df.xs(('baz', '1'), level=[0, 'third']) # level的列表和之前的元組,存在映射關系,表示對第0,2列加限制條件進行篩選
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df.get('X') # 只能獲取列
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df.isin([8, '7', np.NaN]) # np.NaN不參與比較,所以也是False,此外對比與位置無關
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = pd.DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
print(df)
print(other)
df.isin(other) # Column A in `other` has a 3, but not at index 1.
# 當作用與df的時候,是按位置一一判斷isin與否的
df=pd.DataFrame([['1',2,3.],['7',8,np.nan],['10',np.nan,9]], index=['a','b','c'], columns=list('XYZ'))
print(df)
df2=df.fillna(0).astype('float')
print(df2)
print(df2.where(df2['X']>7)) # 先對列進行判斷,然後取該列中符合條件記錄所在的行的所有行記錄
df2.where(df2.iloc[2:]>7) # 直接對行判斷,並取行記錄
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
# 一个参数为-1时,那么reshape函数会根据另一个参数的维度计算出数组的另外一个shape属性值。
print(df)
m = df % 3 == 0
df.mask(m, -df) # 不符合條件的,返回self的元素,符合條件的,返回other,即此處的-df的相應位置的元素
# 與where()對照的理解其功能
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
m = df % 3 == 0
df.where(m, -df) # 與mask()的值恰好相反
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
print(df)
print('-'*50)
print(df.where(df>=5))
df.mask(df>=5)
df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
df.query('a > b')
df[df.a > df.b] # same result as the previous expression
a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], two=[np.nan, 2, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
a.add(b, fill_value=100) # fill_value只對其中一個df有效,如果兩個都是NaN,那麼結果仍然是NaN
a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], two=[np.nan, 2, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
a.radd(b, fill_value=100) # fill_value只對其中一個df有效,如果兩個都是NaN,那麼結果仍然是NaN
# radd()與add()的區別,在於兩個df的順序,對於相加,a+b=b+a,對於sub,mul,div,調換a和b的順序,則結果會不一樣.
a = pd.DataFrame([4, 6, 8, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(10%3, '10除以3,餘是1')
print(10//3, '10除以3,商是3')
a.floordiv(b, fill_value=100) # 取商
a = pd.DataFrame([4, 6, 8, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(10%3, '10除以3,餘是1')
print(10//3, '10除以3,商是3')
a.mod(b, fill_value=100) # 取餘(求模)
a = pd.DataFrame([2, 2, 2, np.nan], index=['a', 'b', 'c', 'd'], columns=['one'])
print(a)
print('-'*30)
b = pd.DataFrame(dict(one=[2, np.nan, 3, np.nan], two=[np.nan, 3, np.nan, 2]), index=['a', 'b', 'd', 'e'])
print(b)
print('-'*30)
print(2**2)
print(2**3)
a.pow(b, fill_value=3) # 求冪(開方) power
a = pd.DataFrame([[2, 2, 2, 2]], columns=list('abcd'))
print(a.shape, '\n', a)
print('-'*30)
b = pd.DataFrame([[2, 2, 2, 2]], columns=list('abcd'))
print(b.T.shape, '\n', b.T)
print('-'*30)
a.dot(b.T)
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a.shape, '\n', a)
print('-'*30)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b.T.shape, '\n', b.T)
print('-'*30)
a.dot(b.T) # a的第一行分別乘以b的第一列和第二列,作爲新的df的第一行的兩個元素
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a.T.shape, '\n', a.T)
print('-'*30)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b.shape, '\n', b)
print('-'*30)
a.T.dot(b)
a = pd.DataFrame([[2, 4, 6, 8],[1, 1, 1, 1]], columns=list('abcd'))
print(a)
b = pd.DataFrame([[1, 1, 1, 1],[1, 3, 5, 7]], columns=list('abcd'))
print(b)
a.eq(b)
df1 = pd.DataFrame({'A': [0, 2], 'B': [np.NaN, 4], 'C': [0, 0]})
print(df1)
print('~'*50)
print(df1.sum())
print('-'*50)
df2 = pd.DataFrame({'A': [7, 5], 'B': [3, np.NaN]})
print(df2)
print('~'*50)
print(df2.sum())
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2) # 比較列的sum()
df1 = pd.DataFrame({'A': [0, np.NaN], 'B': [np.NaN, 6]})
print(df1)
print('~'*50)
print(df1.sum())
print('-'*50)
df2 = pd.DataFrame({'A': [7, 5], 'B': [3, 1]})
print(df2)
print('~'*50)
print(df2.sum())
df1.combine_first(df2) # df1’s values prioritized, use values from df2 to fill holes:
df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
print(df)
print('-'*50)
print(df.apply(np.sqrt))
print('-'*50)
print(df.apply(np.sum, axis=0))
print('-'*50)
print(df.apply(np.sum, axis=1))
print('-'*50)
print(type(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1)))
print(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1))
print('-'*50)
# result_type='expand'时,类型为DataFrame
# result_type='reduce'时,类型为Series
print(type(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1, result_type='broadcast')))
print(df.apply(lambda x: [x[0]**1, x[1]**2], axis=1, result_type='broadcast'))
# result_type : {‘expand’, ‘reduce’, ‘broadcast’, None}, default None
# These only act when axis=1 (columns):
# ‘expand’ : list-like results will be turned into columns.
# ‘reduce’ : returns a Series if possible rather than expanding list-like results. This is the opposite of ‘expand’.
# ‘broadcast’ : results will be broadcast to the original shape of the DataFrame, the original index and columns will be retained.
# The default behaviour (None) depends on the return value of the applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns.
# New in version 0.23.0.
print('-'*50)
print(type(df.apply(lambda x: str(x))[0]))
print(df.apply(lambda x: str(x))) # apply()是对整行或整列進行處理,獲得的是整行或整列的數據
print('~'*30)
for i, element in enumerate(df.apply(lambda x: str(x))[0]):
print(i, element)# 验证apply()与applymap()的区别,一个是对整行或列進行處理,另一个是对行或列中的元素逐一处理
df.apply(lambda x: len(str(x)))
df = pd.DataFrame(np.random.randn(4, 5), columns=['A', 'B', 'C', 'D', 'E'])
df['Col_sum'] = df.apply(lambda x: x.sum(), axis=1)
df.loc['Row_sum'] = df.apply(lambda x: x.sum())
df
df = pd.DataFrame([[1, 2.12], [3.356, 4.5678]])
print(df)
print(df.applymap(lambda x: str(x)))# applymap()是对行或列裏面的元素進行處理,獲得的是使用函數逐一對元素處理後的結果
df.applymap(lambda x: len(str(x)))
fruits = ['ap1', 'or2', 'le3', 'bb4'] * 2
N = len(fruits)
df = pd.DataFrame(
{
'fruit': fruits
, 'basket_id': np.arange(N)
, 'count': np.random.randint(3, 15, size=N)
, 'weight': np.random.uniform(0, 4, size=N)
}
, columns=['basket_id', 'fruit', 'count', 'weight']
)
print(df)
print('-'*70)
def first(x):
return x*2
def second(x, sec):
return x*sec
def third(x, t1, t2, t3=-1):
return x*t1*t2
print(df.pipe(first).pipe(second,2).pipe(third,t1=1,t2=2,t3=0))
print('-'*70)
print(df.apply(first).apply(second,args=(2,)).apply(third,args=(1,2),t3=0))
df = pd.DataFrame([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[np.nan, np.nan, np.nan]],
columns=['A', 'B', 'C'])
print(df)
# Aggregate these functions over the rows.
print(df.fillna(0).agg(['sum', 'min', 'mean'])) # 做3种运算
print(df.agg(['sum', 'min', 'mean']))
print(df.agg(('mean'), axis=1)) # 只做一种运算
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max'], 'C' :'mean'}) # 不同列做不同运算
df = pd.DataFrame(np.random.randn(4, 4), columns=['A', 'B', 'C', 'D'], index=pd.date_range('1/1/2018', periods=4))
df.iloc[1:2] = np.nan
print(df)
df.transform(lambda x: (x - x.mean()) / x.std())
df.transform(lambda x: x-x+x.mean()) # 列的平均數
df.transform(lambda x: x-x+x.std()) # 列的標準差
df.transform(max) # sum, max, min, np.mean, np.std # 只有df行數等於列數的時候才能正常執行
df=pd.DataFrame([
['exact',9720,'2017-10-01',515,458]
,['exact',9720,'2017-10-01',510,896]
,['fuzzy',8242,'2017-11-01',122,415]
,['fuzzy',8242,'2017-11-01',128,782]
], columns=['type','id','date','code','amount'])
df
df.groupby(['type','id','date']).mean()
print(type(df.groupby(['type','id'])['amount'].mean()))
print(df.groupby(['type','id'])['amount'].mean().index)
df.groupby(['type','id'])['amount'].mean()
#知识点:
#计算机中的符号数有三种表示方法:原码、反码和补码。
#三种表示方法均有符号位和数值位两部分,符号位都是用0表示“正”,用1表示“负”,而数值位,三种表示方法各不相同。
#在计算机系统中,数值一律用补码来表示和存储。
#原因在于,使用补码,可以将符号位和数值域统一处理;同时,加法和减法也可以统一处理。
#正整数的补码是其二进制表示,与原码相同
#负整数的补码,将其对应正数二进制表示所有位取反(包括符号位,0变1,1变0)后加1。
#运算分析:
#-6的补码是+6(0000 0110)取反后再+1,为(1111 1001)+(0000 0001)=(1111 1010),也就是计算机中-6是用(1111 1010)来存储的,(1111 1010) 按位取反得到(0000 0101)这就是答案5
#-4的补码,是其對應的正數4(0000 0100)取反后再+1,为(1111 1011)+(0000 0001)=(1111 1100),也就是计算机中-4是用(1111 1100)来存储的,(1111 1100) 按位取反得到(0000 0011)这就是答案3
#Python按位取反运算:
a=-4
print('~-4 =',~a)
# 4的补码與原碼相同是(0000 0100), 取反即为(1111 1011)
#对于计算机来说,二进制以1开头表示的是负数
#所以(1111 1011)是一個負數
#但是(1111 1011)是十進制的多少呢?
#想要知道这个值,可以求它的补码,即先取反码 :0000 0100,再加1:0000 0101,
#(0000 0101),是5,並且之前知道這個數是一個負數,所以答案是-5
b=4
print('~4 =',~b)
df=pd.DataFrame([
['exact',9720,'2018-10-01',515,458]
,['exact',9720,'2018-10-01',510,896]
,['fuzzy',8242,'2018-11-01',122,415]
,['fuzzy',8242,'2018-11-01',128,782]
,['fuzzy',8243,'2018-11-02',128,782]
,['fuzzy',8243,'2018-11-03',128,782]
], columns=['type','id','date','code','amount'])
df
exact_rows = df['type'] != 'fuzzy'
exact_rows
df.loc[exact_rows]
df.loc[~exact_rows]
df.loc[~exact_rows].groupby('id').apply(lambda x:x.sum())
df.loc[~exact_rows].groupby('id').apply(lambda x:x.sum())['amount']
df.loc[~exact_rows].groupby('id').transform('nunique') # 取非重復元素的個數
# 譬如date列,判斷是否唯一,是分組進行的
grouped = df.loc[~exact_rows].groupby('id').apply(lambda g: g.sort_values('code', ascending=False))
print(grouped)
print('~'*50)
grouped = df.loc[~exact_rows].sort_values(['id','date'], ascending=True).groupby('id')
print(grouped)
print(type(grouped))
for i in grouped:
print(i[0])
print(i[1])
for i in grouped:
print(i[0])
print(i[1])
for i in grouped['code']:
print(i[0])
print(i[1])
grouped['code'].transform('nunique') # 判斷是否唯一時,是分組進行的,而不是全部列元素進行判斷.
a = np.where(grouped['code'].transform('nunique') == 2, 18, 81)
# where(cond, option1, option2)
a
data = pd.DataFrame({'state':['Florida','Florida','Texas','Texas'], 'a':[4,5,1,3], 'b':[6,10,3,11] })
print(data)
print('-'*50)
def sub_two(X):
return X['a'] - X['b']
data1 = data.groupby(data['state']).apply(sub_two) # 此处使用transform 会出现错误
print(data1)
print('-'*50)
def group_sum(x):
return x.sum()
data3 = data.groupby(data['state']).transform(group_sum) # 返回与原df一样的行數
print(data3)
print('-~'*30)
data4 = data.groupby(data['state'])['a','b'].apply(group_sum)
print(data4)
df = pd.DataFrame({
'a': [4, 5, 6, 7],
'b': [10, 20, 30, 40],
'c': [100, 50, -30, -50]
})
print(df)
print('-~'*25)
print(df.abs())
print('-~'*25)
print(df.loc[[1,0,2,3]]) # (df.c - 43).abs().argsort() 返回 [1,0,2,3],見下一cell說明
df.loc[(df.c - 43).abs().argsort()]
# argsort()函数是将x中的元素从小到大排列,提取其对应的index(索引),然后输出到y
df = pd.DataFrame({
'a': [4, 5, 6, 7],
'b': [10, 20, 30, 40],
'c': [100, 50, -30, -50]
})
print(df)
print('-~'*25)
print((df.c - 43))
print('-~'*25)
print((df.c - 43).abs())
print('-~'*25)
print((df.c - 43).abs().argsort())
df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
print(df)
print('-~'*25)
print(df.all())
print('-~'*25)
print(df.all(axis='columns'))
print('-~'*25)
print(df.all(axis=None))
df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
print(df)
print('-~'*25)
print(df.any())
print('-~'*25)
print(df.any(axis='columns'))
print('-~'*25)
print(df.any(axis=None))
data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
df = pd.DataFrame(data)
print(df)
print('-~'*25)
print(df.clip(-4, 6))
print('-~'*25)
print(df.clip_lower(-2))
print('-~'*25)
print(df.clip_upper(3))
data = {'col_0': [1, 2, 5], 'col_1': [10, 20, 50]}
df = pd.DataFrame(data)
print(df)
print('-~'*25)
print(df.prod())
df.compound()
# col_0: (1+1)*(1+2)*(1+5)-1=2*3*6-1=35
# col_1: (1+10)*(1+20)*(1+50)-1=11*21*51-1=11780
df = pd.DataFrame({"Person":["John", "Myla", None, "John", "Myla"],
"Age": [24., np.nan, 21., 33, 26],
"Single": [False, True, True, True, False]
})
print(df)
print('-~'*25)
print(df.count()) # Notice the uncounted NA values
print('-~'*25)
print((df.count(axis='columns'))) # Counts for each row
print('-~'*25)
#Counts for one level of a MultiIndex:
df.set_index(["Person", "Single"]).count(level="Person")
df.groupby('Person')['Age'].count()
df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=['dogs', 'cats'])
print(df)
df.cov()
df = pd.DataFrame([[2.0, 1.0],
[3.0, np.nan],
[1.0, 0.0]],
columns=list('AB'))
print(df)
print('-~'*25)
# By default, iterates over rows and finds the maximum in each column.
# This is equivalent to axis=None or axis='index'.
print(df.cummax())
print('-~'*25)
print(df.cummax(axis=1))
df = pd.DataFrame([[2.0, 1.0],
[3.0, np.nan],
[1.0, 0.0]],
columns=list('AB'))
print(df)
print('-~'*25)
print(df.cummin()) # 每一行記錄與上一行比較,取最小的記錄
print('-~'*25)
print(df.cummin(axis=1)) # 每一列記錄與前一(左)列數據比較,取最小記錄
df = pd.DataFrame([[2.0, 1.0, 5],
[3.0, np.nan, None],
[1.0, 0.0, 3]],
columns=list('ABC'))
print(df)
print('-~'*25)
print(df.cumprod())
print('-~'*25)
print(df.cumprod(axis=1))
df = pd.DataFrame([[2.0, 1.0, 5],
[3.0, np.nan, None],
[1.0, 0.0, 3]],
columns=list('ABC'))
print(df)
print('-~'*25)
print(df.cumsum())
print('-~'*25)
print(df.cumsum(axis=1))
df = pd.DataFrame({ 'object': ['a', 'b', 'b'],
'numeric': [1, 2, 3],
'categorical': pd.Categorical(['g','e','f'])
})
print(df)
print('-~'*25)
print(df.describe())
print('-~'*25)
df.describe(include='all')
df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
'b': [1, 1, 2, 3, 5, 8],
'c': [1, 4, 9, 16, 25, 36]})
print(df)
print('-~'*25)
print(df.diff()) # 與前一條記錄的比較
print('-~'*25)
print(df.diff(axis=1)) # 與前一列記錄的比較
print('-~'*25)
print(df.diff(periods=3)) # 與第前3條記錄的比較(相減)
df = pd.DataFrame({'A': [2, 2, 4]})
print(df)
print('-~'*25)
print(df.mean())
print('-~'*25)
print((df-df.mean()).abs().mean())
print('-~'*25)
print(df.mad())
df = pd.DataFrame({'A': [1, 2, 1, 102, 1, 3, 8]})
print(df)
print('-~'*25)
print(df.sort_values(by=['A']).reset_index())
print('-~'*25)
print(type(df.median()))
print('-~'*25)
print(df.median())
print('-~'*25)
print(df.median()[0])
print('-~'*25)
print(df[df['A']==df.median()[0]])
print('-~'*25)
df[df['A']==df.median()[0]].index.tolist()
df = pd.DataFrame({'A': [1, 1, 3, 2, 3, 3, 3], 'B': [9, 9, 9, 9, 3, 3, 3]})
print(df)
df.mode()
# 先按照a列做聚合,然後再對聚合後的a列的值,即A和B的所有值取衆數.
df = pd.DataFrame({'a':['A','A','A','A','B','B','B','B','B', 'B'],'b':[2,1,2,3,1,2,2,3,3, 3]})
print(df)
# dir(df.groupby('a')) 檢查可用的方法
df.groupby('a')['b'].apply(lambda x:x.mode())
df = pd.DataFrame({
'FR': [4.0405, 4.0963, 4.3149],
'GR': [1.7246, 1.7482, 1.8519],
'IT': [804.74, 810.01, 860.13]},
index=['2018-11-01', '2018-12-01', '2019-01-01'])
print(df)
print('~-'*25)
print(df.diff()) # 先計算與上一行的差,若索引爲時間(如月),就是環比.
print('~-'*25)
print(df.shift(1)) # 移位,對齊
print('~-'*25)
print(df.diff().div(df.shift(1))) # 即將每一行與上一行的差,再除以上一行,得到的百分比,就是 pct_change()的值
# 當periods=12時,表示與上年同期的同比
df.pct_change()
df = pd.DataFrame({
'2016': [1769950, 30586265],
'2015': [1500923, 40912316],
'2014': [1371819, 41403351]},
index=['GOOG', 'APPL'])
print(df)
df.pct_change(axis='columns')
df = pd.DataFrame({
'A': [1, 3],
'B': [3, 4],
'C': [2, 4]},
index=['GOOG', 'APPL'])
print(df)
print('~-'*25)
print(df.prod())
print('~-'*25)
print(df.prod(axis='columns'))
df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=['a', 'b'])
print(df)
print('~-'*25)
print(df.quantile(.1))
# 计算a列
# pos = 1 + (4 - 1)*0.1 = 1.3
# fraction = 0.3
# ret = 1 + (2 - 1) * 0.3 = 1.3
# 计算b列
# pos = 1.3
# ret = 1 + (10 - 1) * 0.3 = 3.7
print('~-'*25)
print(type(df.quantile([.1, .5, .1, .5, .3, .2])))
print(df.quantile([.1, .5, .1, .5, .3, .2]))
df = pd.DataFrame(np.array([[1, 3], [2., 2], [3, 1], [3, 1]]), columns=['a', 'b'])
print(df)
print('~-'*25)
print(df.rank())
df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
print(df)
print('~-'*25)
print(df.round(2))
print('~-'*25)
df.round({'A': 1, 'C': 2})
df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
print(df)
print('~-'*25)
print(df.nunique())
df.nunique(axis=1)
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
print(df)
print('~-'*25)
print(df.add_prefix('col_'))
print('~-'*25)
print(df.add_suffix('_col'))
df1 = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
df2 = pd.DataFrame({'X': [1, 2, 3, 4], 'Y': [3, 4, 5, 6]}, index=[3,2,1,4])
print(df1)
print('~-'*25)
print(df2)
print('~-'*25)
print(type(df1.align(df2, join='inner', axis=0)))
print(df1.align(df2, join='outer', axis=0)[0])
print('~-'*25)
print(df1.align(df2, join='outer', axis=0)[1])
i = pd.date_range('2018-12-09', periods=4, freq='12H')
ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
print(ts)
ts.at_time('12:00')
i = pd.date_range('2018-12-09', periods=6, freq='4H')
ts = pd.DataFrame({'A': [1,2,3,4, 5, 6]}, index=i)
print(ts)
ts.between_time('4:15', '16:45')
df = pd.DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'])
print(df)
print('~-'*25)
print(df.drop(['B', 'C'], axis=1))
print('~-'*25)
print(df.drop(columns=['B', 'C']))
print('~-'*25)
df.drop([0, 1])
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
['speed', 'weight', 'length']],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
[0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx, columns=['big', 'small'],
data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
[250, 150], [1.5, 0.8], [320, 250],
[1, 0.8], [0.3,0.2]])
print(df)
print('~-'*25)
print(df.drop(index='cow', columns='small'))
print('~-'*25)
print(df.drop(index='length', level=1)) # level 與 索引的級別需要一致,否則無效, 如length的level是1
- keep : {‘first’, ‘last’, False}, default ‘first’
- first : Drop duplicates except for the first occurrence.
- last : Drop duplicates except for the last occurrence.
- False : Drop all duplicates.
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
['speed', 'weight', 'length']],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
[0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx, columns=['big', 'small'],
data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
[250, 150], [1.5, 0.8], [45, 250],
[1.5, 0.8], [0.3,0.2]])
print(df)
print('~'*25)
print(df.drop_duplicates(subset='big', keep='last', inplace=False))
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 3, 3]})
print(df)
print('~-'*25)
print(df.drop_duplicates('B', keep='last'))
df = pd.DataFrame({'A': [1, 2, 3, 1], 'B': [3, 4, 3, 3]})
print(df)
print('~-'*25)
print(df.duplicated(['A','B'], keep='last'))
df1 = pd.DataFrame({'A': [1, 2, 3, 1], 'B': [3, 4, np.NAN, 3]})
df2 = pd.DataFrame({'A': [1, 2, 3, 1], 'B': [3, 4, np.NAN, 3]})
print(df1)
print('~-'*25)
df1.equals(df2)
df=pd.DataFrame([
[1,2,3]
,[4,5,6]
], index=['mouse','rabbit'], columns=['one','two','three'])
print(df)
print('-'*30)
print(df.filter(items=['one','two'])) # 只選擇列表中的列
print('-'*30)
print(df.filter(regex='e$', axis=1)) # 對列做過濾,選擇索引標籤中含有以"e"結尾的列
print('-'*30)
print(df.filter(like='bbi', axis=0)) # 對行做過濾,選擇索引標籤中包含bbi的行
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
# Get the rows for the first 3 days:
print(ts.first('4D'))
print('-'*30)
ts.first('5D')
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
print(ts.head())
ts.head(3)
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,16,2,3,4,2], 'B': [5,8,3,9,2,3,6,12]}, index=i)
print(ts)
print('-'*30)
# Get the rows for the first 3 days:
print(ts.idxmax(axis=0))
print('-'*30)
print(ts.idxmax(axis=1)) # 若是相同,取前一列(A列)
print('-'*30)
print(ts.idxmin(axis=0))
print('-'*30)
ts.idxmin(axis=1) # 若是相同,取前一列(A列)
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
# Get the rows for the first 3 days:
print(ts.last('4D'))
print('-'*30)
ts.last('5D')
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame({
'http_status': [200,200,404,404,301],
'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
index=index)
print(df)
print('-'*30)
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']
print(df.reindex(new_index))
df.reindex(new_index, fill_value=0)
date_index1 = pd.date_range('11/1/2018', periods=6, freq='D')
date_index2 = pd.date_range('10/29/2018', periods=10, freq='D')
print(date_index1)
print('-'*30)
print(date_index2)
df = pd.DataFrame({"prices": [100, 105, np.nan, 100, 89, 88]},
index=date_index1)
print(df)
print('~'*50)
print(df.reindex(date_index2)) # 按索引對齊,缺失爲NaN
df.reindex(date_index2, method='bfill') #對NaN值替換,是向前替換的(b代表before)
df = pd.DataFrame({"A": [100, 105, np.nan],"B": [100, 105, 100],"C": [100, 105, np.nan]}, index=['a', 'b', 'c'])
print(df)
print('~'*50)
df.reindex(['B', 'C', 'D'], axis=1)
df1 = pd.DataFrame({"X": [10000, 10500, np.nan],"B": [10000, 10500, 10000],"C": [10000, 10500, np.nan]}, index=['x', 'b', 'c'])
df2 = pd.DataFrame({"A": [1, 1.5, np.nan],"B": [1, 1.5, 1],"C": [1, 1.5, np.nan]}, index=['a', 'b', 'c'])
print(df1)
print('~'*50)
print(df2)
print('~'*50)
df1.reindex_like(df2) # 形式上像df2, 但是數據仍然是 df1 的,如果有缺失數據,使用NaN
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
print(df)
#df.rename(index={0:'a', 1:'b', 2:'c'}, columns={"A": "a", "B": "c"})
df.rename(index={1:'a', 0:'b', 2:'啊'}, columns={"A": "a", "B": "c"}).rename(str.upper) # (str.lower, axis='columns')
df = pd.DataFrame([('bird', 389.0),
('bird', 24.0),
('mammal', 80.5),
('mammal', np.nan)],
index=['falcon', 'parrot', 'lion', 'monkey'],
columns=('class', 'max_speed'))
print(df)
print('~'*50)
print(df.reset_index())
print(df.reset_index().columns)
df.reset_index(drop=True) # use the drop parameter to avoid the old index being added as a column
index = pd.MultiIndex.from_tuples([('bird', 'falcon'), # 與zip()搭配使用效果會很好
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
( 24.0, 'fly'),
( 80.5, 'run'),
(np.nan, 'jump')],
index=index,
columns=columns)
print(df)
print('-'*30)
print(df.columns)
print('-'*30)
#If the index has multiple levels, we can reset a subset of them:
print(df.reset_index(level='class', col_level=0, col_fill='genus').index)
print('-'*30)
print(df.reset_index(level='class', col_fill='genus'))
print(df.reset_index(level='class', col_fill='genus').columns)
df = pd.DataFrame(np.random.random((4,6)), index=[['a','a','c','c'],['x','z','z','w']])
df.columns = pd.MultiIndex.from_product([[1,2],['E','C','A']])
print(df)
print('-'*30)
print(df.columns)
print('-'*30)
print(df.reset_index(level=0, col_fill='B'))
print('-'*30)
df.reset_index(level=0, col_fill='B').columns
# 在levels 和 labels中, 索引轉化爲label的部分, 順序上的對應關系很特別
# levels=[[1, 2, 'level_0'], ['A', 'C', 'E', 'B']],
# labels=[[2, 0, 0, 0, 1, 1, 1], [3, 2, 1, 0, 2, 1, 0]]
# 注意這裏的labels中的[2...]和[3...]
frac : float, optional
Fraction of axis items to return. Cannot be used with n.
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
print(df)
df.sample(frac=0.3, replace=True)
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
print(df)
print('-'*50)
print(df.set_axis(['a', 'b', 'c'], axis='index', inplace=False))
print('-'*50)
print(df.set_axis(['I', 'II'], axis='columns', inplace=False))
df = pd.DataFrame({'month': [1, 4, 7, 10],
'year': [2018, 2018, 2019, 2018],
'sale':[55, 40, 84, 31]})
print(df)
df.sort_values(['year', 'month']).set_index(['year', 'month'])
i = pd.date_range('2018-04-09', periods=8, freq='2D')
ts = pd.DataFrame({'A': [1,2,3,4,2,3,4,2]}, index=i)
print(ts)
print('-'*30)
print(ts.tail())
ts.tail(3)
df = pd.DataFrame([('falcon', 'bird', 389.0),
('parrot', 'bird', 24.0),
('lion', 'mammal', 80.5),
('monkey', 'mammal', np.nan)],
columns=['name', 'class', 'max_speed'],
index=[0, 2, 3, 1])
print(df)
print('-'*30)
print(df.take([0, 3])) # 對行記錄切片
print('-'*30)
print(df.take([-1, -2])) # 對行記錄切片
print('-'*30)
print(df.take([1, 2], axis=1)) # 對列記錄切片
df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
'B': ['f', 'g', 'h', 'i', 'j'],
'C': ['k', 'l', 'm', 'n', 'o']},
index=[1, 2, 3, 4, 5])
print(df)
print('-'*30)
print(df.truncate(before=2, after=4)) # include 2 and 4
print('-'*30)
print(df.truncate(before="A", after="B", axis="columns"))
index = pd.date_range('11/15/2018', periods=4, freq='T')
series = pd.Series([0.0, None, 2.0, 3.0], index=index)
df = pd.DataFrame({'s':series})
print(df)
print('-'*30)
print(df.asfreq(freq='30S'))
print('-'*30)
print(df.asfreq(freq='30S', fill_value=9.0))
print('-'*30)
df.asfreq(freq='30S', method='bfill')
index = pd.date_range('11/15/2018', periods=4, freq='T')
series = pd.Series([0.0, 8.5, 2.0, 3.0], index=index)
df = pd.DataFrame({'s':series})
print(df)
print('-'*30)
df.shift(periods=-2)
index = pd.date_range('11/15/2018', periods=4, freq='T')
series = pd.Series([0.0, 8.5, 2.0, 3.0], index=index)
df = pd.DataFrame({'s':series})
print(df)
print('-'*30)
df.slice_shift(periods=2)
index = pd.date_range('11/15/2018', periods=4, freq='T')
series = pd.Series([0.0, 8.5, 2.0, 3.0], index=index)
df = pd.DataFrame({'s':series})
print(df)
print('-'*30)
df.tshift(periods=2)
index = pd.date_range('11/15/2018', periods=5, freq='M')
df = pd.DataFrame({'C': ['k', 'l', 'm', 'n', 'o']
, 'D': ['a', 'c', 'g', 'j', 'r']
},
index=index)
print(df)
print('-'*30)
df.tshift(periods=2) # 移動的是索引
index = pd.date_range('11/15/2018', periods=4, freq='D')
series = pd.Series([0.0, np.NaN, 2.0, 3.0], index=index)
df = pd.DataFrame({'s':series})
print(df)
print('-'*30)
print(df.first_valid_index())
print('-'*30)
print(df.last_valid_index())
index = pd.date_range('1/1/2018', periods=12, freq='M')
df = pd.DataFrame(np.random.randn(12, 3), columns=['Sales','Cost','Profit'], index=index)
print(df)
print('-'*50)
print(df.resample('Q').sum())
df.resample('Y').sum()
def custom_resampler(array_like):
return np.sum(array_like)+5
df.resample('Q').apply(custom_resampler) # 嵌套自定義函數
index = pd.date_range('1/1/2018', periods=2, freq='M')
df = pd.DataFrame(np.random.randn(2, 3), columns=['Sales','Cost','Profit'], index=index)
print(df)
print('-'*50)
print(df.to_period()) # 從年月日轉換爲年月
data : DataFrame
x : label or position, default None
y : label, position or list of label, positions, default None
Allows plotting of one column versus another
kind : str
line : line plot (default)
bar : vertical bar plot
barh : horizontal bar plot
hist : histogram
box : boxplot
kde : Kernel Density Estimation plot
density : same as ‘kde’
area : area plot
pie : pie plot
scatter : scatter plot
hexbin : hexbin plot
ax : matplotlib axes object, default None
subplots : boolean, default False
Make separate subplots for each column
sharex : boolean, default True if ax is None else False
In case subplots=True, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in; Be aware, that passing in both an ax and sharex=True will alter all x axis labels for all axis in a figure!
sharey : boolean, default False
In case subplots=True, share y axis and set some y axis labels to invisible
layout : tuple (optional)
(rows, columns) for the layout of subplots
figsize : a tuple (width, height) in inches
use_index : boolean, default True
Use index as ticks for x axis
title : string or list
Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and subplots is True, print each item in the list above the corresponding subplot.
grid : boolean, default None (matlab style default)
Axis grid lines
legend : False/True/’reverse’
Place legend on axis subplots
style : list or dict
matplotlib line style per column
logx : boolean, default False
Use log scaling on x axis
logy : boolean, default False
Use log scaling on y axis
loglog : boolean, default False
Use log scaling on both x and y axes
xticks : sequence
Values to use for the xticks
yticks : sequence
Values to use for the yticks
xlim_ : 2-tuple/list
ylim : 2-tuple/list
rot : int, default None
Rotation for ticks (xticks for vertical, yticks for horizontal plots)
fontsize : int, default None
Font size for xticks and yticks
colormap : str or matplotlib colormap object, default None
Colormap to select colors from. If string, load colormap with that name from matplotlib.
colorbar : boolean, optional
If True, plot colorbar (only relevant for ‘scatter’ and ‘hexbin’ plots)
position : float
Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
table : boolean, Series or DataFrame, default False
If True, draw a table using the data in the DataFrame and the data will be transposed to meet matplotlib’s default layout. If a Series or DataFrame is passed, use passed data to draw a table.
yerr : DataFrame, Series, array-like, dict and str
See Plotting with Error Bars for detail.
xerr : same types as yerr.
stacked : boolean, default False in line and
bar plots, and True in area plot. If True, create stacked plot.
sort_columns : boolean, default False
Sort column names to determine plot ordering
secondary_y : boolean or sequence, default False
Whether to plot on the secondary y-axis If a list/tuple, which columns to plot on secondary y-axis
mark_right : boolean, default True
When using a secondary_y axis, automatically mark the column labels with “(right)” in the legend
**kwds : keywords
Options to pass to matplotlib plotting method
df = pd.DataFrame(
np.random.rand(12,3),
index=range(1, 13),
columns =['Product_1','Product_2','Product_3']
)
print(df)
ax = df.plot(kind='area')
ax = df.plot.area(x=None, y=None)
#df.plot(kind='bar',stacked=True)
df = pd.DataFrame({'Product Catgory':['Product_1', 'Product_2', 'Product_3'], 'Sales':[32, 45, 20], 'Cost':[25, 30, 15]})
print(df)
ax = df.plot.bar(x='Product Catgory', y=['Sales', 'Cost'], rot=45) # rot 傾斜度
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant', 'rabbit', 'giraffe', 'coyote', 'horse']
df = pd.DataFrame({'speed': speed,
'lifespan': lifespan}
, index=index
)
print(df)
df.plot(kind='bar',rot=0)
ax = df.plot.bar(rot=0)
print('1'*50)
axes = df.plot.bar(rot=0, subplots=True)
print('2'*50)
axes[1].legend(loc=2)
ax = df.plot.bar(y='lifespan', rot=0)
ax = df.plot.bar(x='lifespan', rot=0)
df.plot(kind='barh')
axes = df.plot.barh(rot=0, subplots=True)
df.plot(kind='box')
print(df)
df.describe()
np.random.seed(1234)
df = pd.DataFrame(np.random.randn(10,4), columns=['Col1', 'Col2', 'Col3', 'Col4'])
boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3', 'Col4'])
df
df.plot(kind='line')
df = pd.DataFrame({'Sales': [330, 487, 597],
'profit': [24.397, 60.518, 63.781]},
index=['P1', 'P2', 'P3'])
plot = df.plot.pie(y='Sales', figsize=(5, 5))
df
df = pd.DataFrame(np.random.randn(1000, 3), columns=['length', 'width', 'species'])
ax1 = df.plot.scatter(x='length', y='width', c='DarkBlue')
d_6 = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
d_6
df6=pd.DataFrame(d_6)
df6
df6['one']
df6['three'] = df6['one'] * df6['two']
df6['flag'] = df6['one'] > 2
df6['newtest'] = 666
df6
df6['one_trunc'] = df6['one'][:2] # 将某列(对行有选择性地)拷贝至新列
df6
df6.assign(one_three_ratio=df6['one']/df6['three'])
df6.assign(proc_one=lambda x: df6['one']**2 )
dependent = pd.DataFrame({"A": [1, 1, 1]})
print(dependent)
dependent.assign(A=lambda x: x['A'] + 1).assign(B=lambda x: x['A'] + 2)
dependent.assign(A=lambda x: x["A"] + 1,B=lambda x: x["A"] + 2)
three = df6.pop('three')
print(type(three))
print(three)
df6
df6.insert(2,'one_copy',df6.pop('one')) # pop()返回删除的列
df6
df6_1=df6.drop('flag', axis=1, inplace=False)
print(df6)
df6_1
del df6['two'] #直接在原数据上删除列
df6
| Operation | Syntax | Result |
|---|---|---|
| Select column | df[col] | Series |
| Select row by label | df.loc[label] | Series |
| Select row by integer location | df.iloc[loc] | Series |
| Select columns by integer location | df.iloc[:, loc] | Series |
| Slice rows | df[5:10] | DataFrame |
| Select rows by boolean vector | df[bool_vec] | DataFrame |
df_i=pd.DataFrame({('one'):[1., 2., 3., np.nan]
,('bar'):[1., 2., 3., np.nan]
,('flag'):[False, False, True, False]
,('foo'):['bar', 'bar', 'bar', 'bar']
,('one_trunc'):[1., 2., np.nan, np.nan]
}, index=['a','b','c','d'])
df_i
df_i['bar'] # Select column
df_i.iloc[:, 0:4] # Select column
df_i.loc['b'] # Select row by label
df_i.iloc[-1] # Select row by integer location
df_i[1:3] # Slice rows
from numpy.random import randn
from pandas import DataFrame
df_q = pd.DataFrame(randn(10, 2), columns=list('ab'))
df_q
df_q[df_q['a']>0]
df_q.query('a > b')
df_q[df_q.a > df_q.b] # same result as the previous expression
df_align_1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
print(df_align_1)
df_align_2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df_align_2)
df_align_1+df_align_2
df_align_1 - df_align_1.iloc[9] # 只能对一行做处理,如果是多行,全部变为NaN
df
df*10
1/df
(df*100).round(2)
dfb1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
dfb1
dfb2 = pd.DataFrame({'a' : [0, 0, 1], 'b' : [1, 1, 0] }, dtype=bool)
dfb2
dfb1 & dfb2 # 和, 對應位置,只有兩個都是True,才會得到True,其他全部是False
dfb1 | dfb2 # 或,只有有True,就會得到True
dfb1 ^ dfb2
df_align_1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
print(df_align_1)
df_align_2 = pd.DataFrame(np.random.randn(2, 2), columns=['A', 'B'])
print(df_align_2)
df_align_3 = df_align_1.add(df_align_2, fill_value=0)
print(df_align_3)