pandas 小技巧
1、筛选数据
complaints[['Complaint Type', 'Borough']][:10]
is_noise = complaints['Complaint Type'] == "Noise - Street/Sidewalk"
in_brooklyn = complaints['Borough'] == "BROOKLYN"
complaints[is_noise & in_brooklyn][:5]
2、计数
complaint_counts = complaints['Complaint Type'].value_counts()
complaint_counts[:10]
3、分类计数
weekday_counts = berri_bikes.groupby('weekday').aggregate(sum)
4、合并
weather_2012 = pd.concat([f(x) for x in l])
5、字符串操作
weather_2012['Weather'].str.contains('Snow')
6、修改采样频率(可以获得更宏观的图形)
weather_2012['Temp (C)'].resample('M').apply(np.median)
median:中位数 mean:均值
7、清理数据
http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/master/cookbook/Chapter%207%20-%20Cleaning%20up%20messy%20data.ipynb
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()
zips = zips.str.slice(0, 5)
zero_zips = zips == '00000'
zips[zero_zips] = np.nan
zips = requests['Incident Zip']
is_close = zips.str.startswith('0') | zips.str.startswith('1')
is_far = ~(is_close) & zips.notnull()
8、时间转换
popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')
popcon = popcon[popcon['atime'] > '1970-01-01']
nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]
9、与数据库交互
con = sqlite3.connect("../data/weather_2012.sqlite")
df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con,
index_col=['id', 'date_time'])
con.execute("DROP TABLE IF EXISTS weather_2012")
weather_df.to_sql("weather_2012", con)