python学习-数据清洗
1、Handling Missing Data
# Handling missing data with pandas: detecting, dropping and filling NA values.
# Expressions whose result is not assigned are kept as REPL-style demonstrations.
import numpy as np
import pandas as pd
from numpy import nan as NA

# --- Detecting missing values ---
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isnull()
string_data[0] = None  # None is treated as missing as well
# Main NA-handling methods: dropna, fillna, isnull, notnull
string_data[string_data.notnull()]

# --- Dropping missing values ---
data = pd.DataFrame([[1., 6.5, 3.],
                     [1., NA, NA],
                     [NA, NA, NA],
                     [NA, 6.5, 3.]])
data.dropna(how='all')          # drop only rows that are entirely NA
data.dropna(axis=1, how='all')  # same rule, applied to columns

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df.dropna()
df.dropna(thresh=2)  # keep rows holding at least 2 non-NA values

# --- Filling missing values ---
df.fillna(0)
df.fillna({1: 0.5, 2: 0})         # a different fill value per column
_ = df.fillna(0, inplace=True)    # modifies the existing object

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
# ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
df.ffill()
df.ffill(limit=2)  # forward-fill at most 2 consecutive NA values
2、数据转换
# Data transformation: duplicates, mapping, replacing, renaming, binning, outliers.
# Expressions whose result is not assigned are kept as REPL-style demonstrations.

# --- Removing duplicates ---
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data.duplicated()
data.drop_duplicates()
data['v1'] = range(7)
data.drop_duplicates(['k1'])
data.drop_duplicates(['k1', 'k2'], keep='last')  # keep the last occurrence

# --- Transforming data with a function or mapping ---
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
# The mapping keys are lower-case while some foods are capitalised,
# so normalise the case before the lookup.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data['food'].map(lambda x: meat_to_animal[x.lower()])

# --- Replacing values ---
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

# --- Renaming axis indexes ---
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index.map(transform)
data.index = data.index.map(transform)
data.rename(index=str.title, columns=str.upper)
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

# --- Discretization and binning ---
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats.codes
cats.categories
# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(cats).value_counts()
pd.cut(ages, [18, 26, 36, 61, 100], right=False)  # left-closed intervals
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # 4 equal-width bins
data = np.random.randn(1000)
cats = pd.qcut(data, 4)       # 4 quantile-based bins

# --- Detecting and filtering outliers ---
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
col = data[2]
col[np.abs(col) > 3]
# `axis` must be passed by keyword: positional any(1) fails on pandas >= 2.0.
data[(np.abs(data) > 3).any(axis=1)]
data[np.abs(data) > 3] = np.sign(data) * 3  # cap values to [-3, 3]
data.describe()
np.sign(data).head()
# Random permutation / sampling and indicator (dummy) variables.
# Expressions whose result is not assigned are kept as REPL-style demonstrations.

# --- Permutation and random sampling ---
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
df.take(sampler)
df.sample(n=3)  # pick 3 rows at random
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)

# --- Computing indicator / dummy variables ---
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
pd.get_dummies(df['key'])
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)

np.random.seed(12345)
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
3、String Object Methods
# Built-in str methods for splitting, joining, searching and replacing.
# Expressions whose result is not assigned are kept as REPL-style demonstrations.
val = 'a,b, guido'
val.split(',')
pieces = [x.strip() for x in val.split(',')]
first, second, third = pieces
first + '::' + second + '::' + third  # equivalent to the join below
'::'.join(pieces)
'guido' in val
val.index(',')  # raises ValueError if the substring is absent
val.find(':')   # returns -1 if the substring is absent
val.count(',')
val.replace(',', '::')
val.replace(',', '')
# Other str methods worth knowing: endswith, startswith, rfind, strip, rstrip,
# lstrip, lower, upper, casefold, ljust, rjust
4、正则表达式
# Regular expressions with the `re` module: split, findall, search, match, sub.
# Expressions whose result is not assigned are kept as REPL-style demonstrations.
import re

text = "foo bar\t baz \tqux"
re.split(r'\s+', text)  # raw string: '\s' is an invalid escape in a plain literal
# Compile first, then call split on the compiled pattern; same result as above.
regex = re.compile(r'\s+')
regex.split(text)
regex.findall(text)

# NOTE(review): the e-mail addresses in this sample (and the '+@' in the first
# pattern) were missing from the extracted text, although the capture-group
# pattern below still contains '@'. Sample values restored - confirm against
# the original notes.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
m = regex.search(text)     # first match anywhere in the text
text[m.start():m.end()]
print(regex.match(text))   # match() only succeeds at the start of the string
print(regex.sub('REDACTED', text))  # new string with every match replaced

# Parentheses split each address into (username, domain, suffix) groups.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
regex.findall(text)
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
5、Vectorized String Functions in pandas
# Vectorized string methods on a pandas Series (the `.str` accessor skips NA).
# Expressions whose result is not assigned are kept as REPL-style demonstrations.

# NOTE(review): the e-mail values were empty strings in the extracted text;
# sample addresses restored so the 'gmail' and pattern checks below have
# something to match - confirm against the original notes.
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data.isnull()
data.str.contains('gmail')

pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)
data.str.match(pattern, flags=re.IGNORECASE)  # boolean: does each value match?
# str.match yields booleans, which have no .str accessor, so take the first
# findall hit (a tuple of groups) per row to index into the captured parts.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches.str.get(1)  # second captured group, i.e. the domain
相关推荐
茄肥猫的窝 2020-10-29
kkbb 2020-10-27
jinhao 2020-09-07
lanmantech 2020-07-26
一次次尝试 2020-06-25
zhangxiaojiakele 2020-05-26
winmeanyoung 2020-04-26
囧芝麻 2020-03-27
muhongdi 2020-02-03
Norsaa 2019-12-12
learnpy 2019-12-02
大白配小猪 2019-11-15
你情我愿 2018-05-29
morexyoung 2019-11-01
duanlove技术路途 2019-10-25
chenhui 2019-08-16
HappinessSourceL 2019-06-26
dongnaosenlu 2019-06-13