数据准备2 数据清洗
数据准备2 数据清洗
?
import platform from pandas import DataFrame print("当前所使用的python版本是:", platform.python_version()) data = {‘Chinese‘: [66, 95, 93, 90, 80], ‘English‘: [65, 85, 92, 88, 90], ‘Math‘: [30, 98, 96, 77, 90]} df = DataFrame(data, index=[‘Zhangfei‘, ‘Guanyu‘, ‘Zhaoyun‘, ‘huangzhong‘, ‘Machao‘])
1.删除不必要的行
# 删除列 print("删除列:") df1 = df.drop(columns=[‘Chinese‘]) print(df1) print("---------------") # 删除行 print("删除行:") df1_1 = df.drop(index=[‘Machao‘]) print(df1_1) print("---------------")
2.重命名列
print("重命名列") df2 = df.rename(columns={‘Chinese‘: ‘语文‘, ‘English‘: ‘英语‘, ‘math‘: ‘数学‘}) print(df2) print("---------------")
3.清除重复行
df_temp = DataFrame({‘A‘: [1, 1, 2, 2], ‘B‘: [‘a‘, ‘b‘, ‘a‘, ‘b‘]}) df3 = df_temp.drop_duplicates(‘B‘, ‘first‘, inplace=False) print(df3) # 这里drop_duplicate的参数是 # subset,指定对特定列进行清除重复项 # keep,‘first‘,删除重复项且保留第一次出现的,另外还有‘last‘和‘False‘ # inplace:默认False---保留副本,True为在原数据上修改 print("---------------") ‘‘‘ def drop_duplicates( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, ) -> Optional["DataFrame"] ‘‘‘
4.更改字段数据格式
print(‘更改字段数据格式‘) df1_1[‘English‘].astype(‘str‘)
5.清除数据间的空格 --- str.strip
# 清除两侧空格 # df1_1[‘English‘]=df1_1[‘English‘].map(str.strip()) # 清除左侧空格 # df1_1[‘English‘]=df1_1[‘English‘].map(str.lstrip()) # 清除右侧空格 # df1_1[‘English‘]=df1_1[‘English‘].map(str.rstrip()) # 删除‘English‘字段中的美元符号‘$‘ # df1_1[‘English‘]=df1_1[‘English‘].str.strip()
6.大小写转换、首字母大写
print("大小写转换、首字母大写") df1.columns = df1.columns.str.upper() print(df1) df1.columns = df1.columns.str.lower() print(df1) df1.columns = df1.columns.str.title() print(df1) print("------------------")
7.查找空值
print("查询空值:") df7 = DataFrame({‘A‘: [1, 1, 2, 2], ‘B‘: [‘a‘, ‘b‘, ‘a‘, None]}) # 对全表查询 print(df7.isnull()) # 对列查询 print(df7.isnull().any()) # 注,要设置为None才是空值,‘‘不为空值 print("------------------")
8.apply函数进行数据清洗
# 对某一列数据全部大写 print("apply转换为大写:") df8_1 = DataFrame({‘A‘: [1, 1, 2, 2], ‘B‘: ["a", "b", "c", "d"]}) df8_1[‘B‘] = df8_1[‘B‘].apply(str.upper) print(df8_1) print() # 注,无法对None使用upper 且 需要是String类型,否则会报错:TypeError: descriptor ‘upper‘ requires a ‘str‘ object but received a ‘NoneType‘
# apply调用定义的函数,apply调用时不需要在函数名后传参 # 对单元格的值*2 print("A列值转换为两倍:") def double_val(x): return 2*x df8_1[‘A‘] = df8_1[‘A‘].apply(double_val) print(df8_1) print()
# 通过函数新增列:取英语和数学之和的m倍,取英语和数学之和的n倍 print("新增列:") def newpercentile_columns(df1, m, n): df1[‘percentile_m‘] = (df1[u‘English‘]+df1[u‘Math‘])*m df1[‘percentile_n‘] = (df1[u‘English‘]+df1[u‘Math‘])*n return df1 df1 = df1.apply(newpercentile_columns, axis=1, args=(2, 3)) print(df1)
?
相关推荐
茄肥猫的窝 2020-10-29
kkbb 2020-10-27
jinhao 2020-09-07
一次次尝试 2020-06-25
zhangxiaojiakele 2020-05-26
柠檬班 2020-05-16
winmeanyoung 2020-04-26
囧芝麻 2020-03-27
muhongdi 2020-02-03
Norsaa 2019-12-12
learnpy 2019-12-02
大白配小猪 2019-11-15
你情我愿 2018-05-29
morexyoung 2019-11-01
duanlove技术路途 2019-10-25
chenhui 2019-08-16
HappinessSourceL 2019-06-26
dongnaosenlu 2019-06-13