# -*- coding: utf-8 -*-
# Build the sample Series and DataFrame that the snippets below operate on.
import numpy as np
import pandas as pd

# Series holding the first ten positive even numbers: 2, 4, ..., 20.
s = pd.Series([i * 2 for i in range(1, 11)])

# Eight consecutive daily timestamps starting at 2017-03-01.
dates = pd.date_range("20170301", periods=8)

# 8 rows x 5 columns of standard-normal samples, indexed by `dates`,
# with columns named A..E.
df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
print(df)
随机生成一个 8 行 5 列的 DataFrame:以 dates 作为行索引(index),以 A、B、C、D、E 作为列名。
# Show the first three rows of the frame.
leading_rows = df.head(3)
print(leading_rows)

# Show the last three rows of the frame.
trailing_rows = df.tail(3)
print(trailing_rows)
# Show the row index of the frame.
row_index = df.index
print(row_index)

# Show the underlying data as a NumPy array.
raw_values = df.values
print(raw_values)
# Transpose: swap the row index with the column labels.
print(df.transpose())
# Sort the rows by the values in column C (ascending).
# `DataFrame.sort(columns=...)` no longer exists in pandas (removed in 0.20);
# `sort_values(by=...)` is the replacement.
print(df.sort_values(by="C"))
我测试 sort 的时候竟然报错。起初以为是 Python 3 对排序函数重新命名了,实际原因是 pandas 从 0.17 起弃用、并在 0.20 中移除了 DataFrame.sort,应改用 df.sort_values(by="C")。
# Reorder the columns by label, in descending order (E, D, C, B, A).
columns_descending = df.sort_index(axis="columns", ascending=False)
print(columns_descending)
# Summary statistics for every column.
summary = df.describe()
print(summary)
# Selecting a single column by label yields a Series.
column_a = df["A"]
print(column_a)
print(type(column_a))
# First three rows, selected by position.
print(df.iloc[:3])
print("=============================================================")
# Rows whose date label lies between the two dates (both ends included).
print(df.loc["20170301":"20170304"])
print("=============================================================")
# The single row labelled with the first date, returned as a Series.
first_day = dates[0]
print(df.loc[first_day])
# Label-based selection: a date range of rows and an explicit list of columns.
wanted_columns = ["B", "D"]
date_window = slice("20170301", "20170304")
print(df.loc[date_window, wanted_columns])
print("=============================================================")
# Scalar lookup by row label and column label.
first_day = dates[0]
print(df.at[first_day, "C"])
# Position-based selection: rows 1-2 and columns 2-3 (end positions excluded).
row_window = slice(1, 3)
column_window = slice(2, 4)
print(df.iloc[row_window, column_window])
# Filter rows with boolean conditions.
# Both conditions are combined into ONE mask with `&`. Chaining two lookups
# (`df[df.B > 0][df.A < 0]`) applies a full-length mask to an already filtered
# frame, which makes pandas emit a "Boolean Series key will be reindexed"
# warning.
print(df[(df.B > 0) & (df.A < 0)])
print("=============================================================")
# Element-wise mask: positive entries are kept, every other cell becomes NaN.
print(df[df > 0])
print("=============================================================")
# Rows whose value in column E is one of the listed values.
print(df[df["E"].isin([1, 2])])
# Overwrite one cell, addressed by integer position (row 1, column 1).
df.iat[1, 1] = 1
print(df)

# Overwrite the whole of column D with an array of 4s, one per row.
column_of_fours = np.array([4] * len(df))
df.loc[:, "D"] = column_of_fours
print(df)

# Work on an independent copy and flip the sign of every positive entry,
# leaving the copy without positive values.
df2 = df.copy()
is_positive = df2 > 0
df2[is_positive] = -df2
print(df2)
# Keep the first four dates and columns A-D, and add a new column G
# (absent from df, so it starts out as NaN).
df1 = df.reindex(index=dates[:4], columns=[*"ABCD", "G"])
# Fill G for the first two dates only; label slices include both ends.
df1.loc[dates[0]:dates[1], "G"] = 1

# Drop every row that contains a NaN.
rows_without_nan = df1.dropna()
print(rows_without_nan)

# Replace every NaN with 2.
nan_filled = df1.fillna(value=2)
print(nan_filled)
http://www.imooc.com/video/14994
# -*- coding: utf-8 -*-
# Parse a web-server access log into a pandas DataFrame, one row per
# matching line.
import sys
import os
import re

import numpy as np
import pandas as pd
# NOTE: `Panel` is deliberately not imported any more. It was removed from
# pandas (1.0 and later raise ImportError on it) and this script never used it.
from pandas import Series, DataFrame

logfile = 'www.xxxxxx.com-access_log-20170521'

# Raw string so the backslashes reach the regex engine unchanged (the pattern
# text itself is unchanged). The five capture groups map, in order, onto the
# column names listed in `indexs` below.
regex = r'([(\d\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-)'
# Compile once instead of re-resolving the pattern for every line.
pattern = re.compile(regex)

log_list = []
with open(logfile, 'r') as fo:
    for line in fo:
        matched = pattern.match(line)
        if matched is None:
            # Lines that do not fit the expected layout are skipped; calling
            # .groups() on a failed match would raise AttributeError.
            continue
        log_list.append(matched.groups())

indexs = ['IP', 'Time', 'Result', 'Status', 'No.']
df = DataFrame(log_list, columns=indexs)
print(df)
# Example filter on a single client address:
# print(df[df['IP'] == '42.120.160.97'])
如果想像写 SQL 一样使用 pandas,可以参考下面这篇文章:
http://www.cnblogs.com/en-heng/p/5630849.html