【数据分析&数据挖掘】数据合并和拼接案例
import pandas as pd import numpy as np # 加载数据——detail detail_1 = pd.read_excel("./meal_order_detail.xlsx", sheetname=0) detail_2 = pd.read_excel("./meal_order_detail.xlsx", sheetname=1) detail_3 = pd.read_excel("./meal_order_detail.xlsx", sheetname=2) print("detail_1 的形状: \n", detail_1.shape) print("detail_1 的列索引: \n", detail_1.columns) print("detail_2 的形状: \n", detail_2.shape) print("detail_2 的列索引: \n", detail_2.columns) print("detail_3 的形状: \n", detail_3.shape) print("detail_3 的列索引: \n", detail_3.columns) print("~"*60) # 将detail_2, detail_3直接追加到detaiL_1下面 detail = pd.concat((detail_1, detail_2, detail_3), axis=0, join="inner") print("detail的形状;\n", detail.shape) # 加载info info = pd.read_csv("./meal_order_info.csv", encoding="ansi") print("info: \n", info.shape) # info与detail进行主键拼接 res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="inner") res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="left") print("info与detail主键拼接的结果为: \n", res.shape) print("res的列名: \n", res.columns) # 加载users users = pd.read_excel("./users.xlsx") # info与detail进行主键拼接的结果与users进行主键拼接 res = pd.merge(left=res, right=users, left_on="name", right_on="ACCOUNT", how="inner") print("最终进行主键拼接的结果: \n", res) print("最终res的列名称: \n", res.columns) print("name与ACCOUNT对比相同", np.all(res.loc[:, "name"] == res.loc[:, "ACCOUNT"])) print("order_id与info_id对比相同", np.all(res.loc[:, "order_id"] == res.loc[:, "info_id"])) print("emp_id_x与emp_id_y对比相同", np.all(res.loc[:, "emp_id_x"] == res.loc[:, "emp_id_y"])) res.drop(labels=["ACCOUNT", "info_id", "emp_id_y"], axis=1, inplace=True) print("删除3列之后的结果: \n", res.shape) print("删除3列之后的结果: \n", res.columns) drop_list = [] for column in res.columns: # 统计每一列的非空数据的数量 res_count = res.loc[:, column].count() # 如果整列非空数据的数量为0,意味着整列都是空的 if res_count == 0: drop_list.append(column) # 删除整列为空的列 res.drop(labels=drop_list, axis=1, inplace=True) print("去除整列为空的数据之后的结果: \n", res.shape) print("去除整列为空的数据之后的结果: \n", res.columns) drop_dup_list = [] # 如果整列数据完全相同——该列, 该属性对于区分各列没有意义 for column in res.columns: res_ = res.drop_duplicates(subset=column, inplace=False) if res_.shape[0] == 1: print("res_.shape[0]: \n", res_.shape[0]) drop_dup_list.append(column) # 删除全部一样的列 res.drop(labels=drop_dup_list, axis=1, inplace=True) print("最终的结果: \n", res.shape) print("最终的结果: \n", res.columns)
相关推荐
sifeimeng 2020-06-21
阿斌Elements 2020-05-05
songerxing 2020-05-03
yixiaoqi00 2020-04-22
要啥自行车一把梭 2020-04-17
随心而作 2020-04-07
JayFighting 2020-04-07
80327065 2020-03-09
lhxxhl 2020-03-06
woxmh 2020-03-04
hgzhang 2020-02-18
时光如瑾雨微凉 2020-02-02
winmeanyoung 2020-01-29
wyqwilliam 2020-01-17
typhoonpython 2020-01-12
sofast 2020-01-08
snowpage 2014-07-10