使用 NVIDIA 开源 Python 库 RAPIDS 进行数据分析
# Benchmark script: builds a synthetic three-feature, two-class dataset and
# times two XGBoost training runs on it (the native ``xgb.train`` API and the
# scikit-learn style ``XGBClassifier``), printing wall-clock time and accuracy.
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from xgboost import XGBClassifier
import cudf
import xgboost as xgb
from sklearn.metrics import accuracy_score

dataset_len = 8000000
dlen = int(dataset_len/2)

# Each feature is the concatenation of two normal samples of length ``dlen``:
# the first half belongs to class 0, the second half to class 1.
X_11 = pd.Series(np.random.normal(2, 2, dlen))
X_12 = pd.Series(np.random.normal(9, 2, dlen))
X_1 = pd.concat([X_11, X_12]).reset_index(drop=True)

X_21 = pd.Series(np.random.normal(1, 3, dlen))
X_22 = pd.Series(np.random.normal(7, 3, dlen))
X_2 = pd.concat([X_21, X_22]).reset_index(drop=True)

X_31 = pd.Series(np.random.normal(3, 1, dlen))
X_32 = pd.Series(np.random.normal(3, 4, dlen))
X_3 = pd.concat([X_31, X_32]).reset_index(drop=True)

# Labels: ``dlen`` zeros followed by ``dlen`` ones (rows are ordered by class).
Y = pd.Series(np.repeat([0, 1], dlen))

df = pd.concat([X_1, X_2, X_3, Y], axis=1)
df.columns = ['X1', 'X2', 'X3', 'Y']
df.head()

X = df.drop(['Y'], axis=1).values
y = df['Y']


def preproces(df, X, y, train_size=0.80):
    """Label-encode ``y`` and split ``X``/``y`` into ordered train/test parts.

    The first ``int(rows * train_size)`` rows become the training set and the
    remaining rows the test set; the shapes and percentages of both parts are
    printed.

    Args:
        df: Frame whose row count (``df.shape[0]``) determines the split index.
        X: 2-D feature array, sliced by row.
        y: Labels; passed through ``LabelEncoder.fit_transform``.
        train_size: Fraction of rows assigned to the training set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.

    NOTE(review): rows are NOT shuffled. With the class-ordered data built in
    this script (all 0 labels first, then all 1 labels) and the default
    ``train_size`` of 0.80, the test slice contains only class 1, so the
    reported accuracy covers a single class. ``train_test_split`` is imported
    above but unused - confirm whether a shuffled split was intended.
    """
    # label_encoder object knows how to understand word labels.
    label_encoder = preprocessing.LabelEncoder()

    # Encode labels
    y = label_encoder.fit_transform(y)

    # identify shape and indices
    num_rows = df.shape[0]
    delim_index = int(num_rows * train_size)

    # Splitting the dataset in training and test sets
    X_train, y_train = X[:delim_index, :], y[:delim_index]
    X_test, y_test = X[delim_index:, :], y[delim_index:]

    # Checking sets dimensions
    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_validation: ', y_test.shape)

    # Checking dimensions in percentages
    total = X_train.shape[0] + X_test.shape[0]
    print('X_train Percentage:', (X_train.shape[0]/total)*100, '%')
    print('X_test Percentage:', (X_test.shape[0]/total)*100, '%')

    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = preproces(df, X, y)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# --- Run 1: native xgboost training API -------------------------------------
# NOTE(review): ``params`` is empty, so nothing here requests a GPU tree
# method, and ``cudf`` is imported but never used. Confirm whether a GPU
# setting was intended for the run labelled "RAPIDS". The two timed runs also
# rely on different library defaults (``xgb.train`` vs ``XGBClassifier``), so
# verify the comparison is like-for-like before drawing conclusions.
time_start = time.time()

# Initial xgb parameters
params = {}

clf = xgb.train(params, dtrain)
time_end = time.time()
times = round(time_end-time_start)
print(f'RAPIDS XGBoost: {times} s')

# Feature Importance plot!
xgb.plot_importance(clf)

# Predictions are rounded to the nearest integer to obtain 0/1 class labels.
rapids_pred = clf.predict(dtest)
rapids_pred = np.round(rapids_pred)
rapids_acc = round(accuracy_score(y_test, rapids_pred), 2)
print("XGB accuracy using RAPIDS:", rapids_acc*100, '%')

# --- Run 2: scikit-learn style wrapper --------------------------------------
time_start = time.time()
model = XGBClassifier()
model.fit(X_train, y_train)
time_end = time.time()
times = round(time_end-time_start)
print(f'SKlearn XGBoost: {times} s')

sk_pred = model.predict(X_test)
sk_pred = np.round(sk_pred)
sk_acc = round(accuracy_score(y_test, sk_pred), 2)
print("XGB accuracy using Sklearn:", sk_acc*100, '%')
运行结果(Rapids_le.out):
X_train dimensions: (6400000, 3) y_train: (6400000,)
X_test dimensions: (1600000, 3) y_validation: (1600000,)
X_train Percentage: 80.0 %
X_test Percentage: 20.0 %
[17:04:50] WARNING: /conda/conda-bld/xgboost_1571337679414/work/src/gbm/gbtree.cc:130: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
RAPIDS XGBoost: 52 s
XGB accuracy using RAPIDS: 98.0 %
[17:05:45] WARNING: /conda/conda-bld/xgboost_1571337679414/work/src/gbm/gbtree.cc:130: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
SKlearn XGBoost: 858 s
XGB accuracy using Sklearn: 98.0 %
相关推荐
flyfor0 2020-11-16
茄肥猫的窝 2020-10-29
kkbb 2020-10-27
gallon00 2020-10-16
aiwozhiai 2020-10-06
HashData0 2020-09-18
GooTal 2020-09-16
qiujiahao 2020-09-15
Dimples 2020-09-15
qiujiahao 2020-09-11
wangquannuaa 2020-08-30
逍遥友 2020-08-21
nxcjh 2020-08-17
CodeAsWind 2020-08-17
BMUranus 2020-08-16
zlfing 2020-08-16
wordmhg 2020-08-16
Cocainebai 2020-08-15
子昊的茶会 2020-08-04