pandas 小技巧

1、筛选数据

complaints[['Complaint Type', 'Borough']][:10]

is_noise = complaints['Complaint Type'] == "Noise - Street/Sidewalk"

in_brooklyn = complaints['Borough'] == "BROOKLYN"

complaints[is_noise & in_brooklyn][:5]

2、计数

complaint_counts = complaints['Complaint Type'].value_counts()

complaint_counts[:10]

3、分类计数

weekday_counts = berri_bikes.groupby('weekday').aggregate(sum)

4、合并

weather_2012 = pd.concat([f(x) for x in l])

5、字符串操作

 weather_2012['Weather'].str.contains('Snow')

6、修改采样频率(可以获得更宏观的图形)

weather_2012['Temp (C)'].resample('M').apply(np.median)

median:中位数 mean:均值

7、清理数据

http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/master/cookbook/Chapter%207%20-%20Cleaning%20up%20messy%20data.ipynb

rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)

long_zip_codes = requests['Incident Zip'].str.len() > 5

requests['Incident Zip'][long_zip_codes].unique()

zips = zips.str.slice(0, 5)

zero_zips = zips == '00000'

zips[zero_zips] = np.nan

zips = requests['Incident Zip']

is_close = zips.str.startswith('0') | zips.str.startswith('1')

is_far = ~(is_close) & zips.notnull()

8、时间转换

popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s')

popcon = popcon[popcon['atime'] > '1970-01-01']

nonlibraries = popcon[~popcon['package-name'].str.contains('lib')]

9、与数据库交互

con = sqlite3.connect("../data/weather_2012.sqlite")

df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con, 

                 index_col=['id', 'date_time'])

con.execute("DROP TABLE IF EXISTS weather_2012")

weather_df.to_sql("weather_2012", con)

相关推荐