压缩pandas中dataframe内存
下面这段源码来自 Kaggle,可以大幅降低 DataFrame 的内存占用,缓解内存不足的问题。原文链接:https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/code
# @from: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/code
# @license: Apache 2.0
# @author: weijian


def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype covering [lo, hi], or None.

    Unsigned types are tried when ``lo`` is non-negative, signed types
    otherwise. ``None`` is returned when no 64-bit-or-smaller integer type
    can hold the range (this includes NaN bounds from an empty column).
    """
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the type's min/max still fits.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_mem_usage(props, na_fill=-999, float_dtype="float16"):
    """Downcast the numeric columns of a DataFrame to smaller dtypes.

    The frame is modified in place and also returned. Progress and the
    before/after memory usage are printed.

    Args:
        props: DataFrame to shrink. Columns whose dtype is not a plain NumPy
            bool/integer/float dtype (object, string, datetime, categorical,
            ...) are left untouched.
        na_fill: Sentinel written in place of NaN, because NumPy integer
            dtypes cannot represent missing values.
        float_dtype: Target dtype for columns that hold non-integer values.
            The default ``"float16"`` is very lossy (about 3 significant
            digits, overflow to inf above 65504); pass ``"float32"`` when
            precision matters.

    Returns:
        A tuple ``(props, NAlist)`` where ``NAlist`` lists the columns that
        contained non-finite values (their NaNs were replaced by ``na_fill``).
    """
    # Memory footprint before any conversion, in MB.
    start_mem_usg = props.memory_usage().sum() / 1024 ** 2
    print("Memory usage of the dataframe is :", start_mem_usg, "MB")

    NAlist = []  # columns in which non-finite values were found
    for col in props.columns:
        series = props[col]
        dtype = series.dtype
        # Only plain NumPy numeric columns can be downcast safely; anything
        # else would fail in the finiteness check or the arithmetic below.
        if not (isinstance(dtype, np.dtype) and dtype.kind in "biuf"):
            continue

        print("**************************")
        print("columns: ", col)
        print("dtype before", dtype)

        # Integer dtypes do not support NA, so missing values are filled.
        # Assign the result back instead of a chained in-place fillna, which
        # does not update the frame under pandas copy-on-write.
        if not np.isfinite(series).all():
            NAlist.append(col)
            series = series.fillna(na_fill)
            props[col] = series

        # Take the bounds AFTER filling so the sentinel is part of the range;
        # otherwise a negative sentinel could be cast to an unsigned type.
        # Converting to Python scalars makes the range comparisons exact.
        lo = np.asarray(series.min()).item()
        hi = np.asarray(series.max()).item()

        if dtype.kind in "biu":
            is_int = True
        else:
            values = series.to_numpy(dtype=np.float64)
            # A column still containing inf cannot be stored as integers.
            # Otherwise treat it as integral when the summed fractional
            # parts are below 0.01; tune this tolerance for your task.
            is_int = bool(
                np.isfinite(values).all()
                and np.fabs(values - np.trunc(values)).sum() < 0.01
            )

        if is_int:
            target = _smallest_int_dtype(lo, hi)
            # When no integer type fits (empty column, out-of-range values)
            # the column is left as it is.
            if target is not None:
                props[col] = series.astype(target)
        else:
            # NOTE: non-integer columns go to float_dtype (float16 by
            # default); change it to suit your data.
            props[col] = series.astype(float_dtype)

        print("dtype after", props[col].dtype)
        print("********************************")

    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist
相关推荐
三石 2020-10-30
roamer 2020-10-29
三石 2020-10-29
wangquannuaa 2020-10-15
wangquannuaa 2020-09-29
jzlixiao 2020-09-15
wangquannuaa 2020-08-30
三石 2020-08-23
逍遥友 2020-08-21
jzlixiao 2020-08-18
wangquannuaa 2020-08-17
QianYanDai 2020-08-16
cjsyrwt 2020-08-14
jzlixiao 2020-07-29
xirongxudlut 2020-07-20
mmmjyjy 2020-07-16
QianYanDai 2020-07-05
QianYanDai 2020-07-05
june0 2020-07-04