使用NVIDIA开源Python库RAPIDS进行数据分析

Cocainebai

2020-04-18

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from xgboost import XGBClassifier
import cudf
import xgboost as xgb
from sklearn.metrics import accuracy_score
import time

# Build a synthetic two-class dataset with three numeric features.
# For every feature, the first half of the rows is drawn from one normal
# distribution (class 0) and the second half from another (class 1).
dataset_len = 8000000
dlen = dataset_len // 2  # number of rows per class

# X1: N(mean=2, sd=2) for class 0, N(mean=9, sd=2) for class 1
X_11 = pd.Series(np.random.normal(2, 2, dlen))
X_12 = pd.Series(np.random.normal(9, 2, dlen))
X_1 = pd.concat([X_11, X_12]).reset_index(drop=True)

# X2: N(mean=1, sd=3) for class 0, N(mean=7, sd=3) for class 1
X_21 = pd.Series(np.random.normal(1, 3, dlen))
X_22 = pd.Series(np.random.normal(7, 3, dlen))
X_2 = pd.concat([X_21, X_22]).reset_index(drop=True)

# X3: same mean (3) for both classes, only the spread differs (sd=1 vs sd=4)
X_31 = pd.Series(np.random.normal(3, 1, dlen))
X_32 = pd.Series(np.random.normal(3, 4, dlen))
X_3 = pd.concat([X_31, X_32]).reset_index(drop=True)

# Labels: dlen zeros followed by dlen ones, aligned with the halves above.
# NOTE(review): the rows are ordered by class and never shuffled, so any
# positional train/test split takes its test rows only from the tail of the
# data (i.e. only class 1) -- confirm this is intended for the benchmark.
Y = pd.Series(np.repeat([0, 1], dlen))

df = pd.concat([X_1, X_2, X_3, Y], axis=1)
df.columns = ['X1', 'X2', 'X3', 'Y']
df.head()  # notebook-style preview; the result is discarded when run as a script

# Feature matrix as a NumPy array, labels as a pandas Series.
X = df.drop(['Y'], axis=1).values
y = df['Y']

def preproces(df, X, y, train_size = 0.80):
    """Label-encode ``y`` and split ``X``/``y`` positionally into train/test sets.

    The first ``train_size`` fraction of the rows (by position, without any
    shuffling) becomes the training set; the remaining rows become the test
    set. The resulting shapes and percentages are printed.

    Args:
        df: Object whose ``shape[0]`` gives the total number of rows; it is
            used only to compute the split index.
        X: Two-dimensional array of features, sliced as ``X[a:b, :]``.
        y: Labels; they are passed through ``LabelEncoder.fit_transform``
            before being split.
        train_size: Fraction of rows placed in the training set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Encode the labels with scikit-learn's LabelEncoder.
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Row index separating the training rows from the test rows.
    num_rows = df.shape[0]
    delim_index = int(num_rows * train_size)

    # Positional split: no shuffling, so the order of the input rows
    # determines which rows land in each set.
    X_train, y_train = X[:delim_index, :], y[:delim_index]
    X_test, y_test = X[delim_index:, :], y[delim_index:]

    # Report the dimensions of both sets.
    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_validation: ', y_test.shape)

    # Report the relative size of both sets as percentages.
    total = X_train.shape[0] + X_test.shape[0]
    print('X_train Percentage:', (X_train.shape[0]/total)*100, '%')
    print('X_test Percentage:', (X_test.shape[0]/total)*100, '%')
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = preproces(df, X, y)

# Wrap the splits in XGBoost's DMatrix container for the native training API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# --- Run 1: native xgb.train API (reported as "RAPIDS XGBoost") ---
time_start=time.time()
# Initial xgb parameters
# NOTE(review): params is empty, so no GPU tree method is requested here and
# every setting falls back to the library defaults -- confirm that this run
# really exercises the GPU.
params = {}

clf = xgb.train(params, dtrain)
time_end=time.time()
times =  round(time_end-time_start)
print(f'RAPIDS XGBoost: {times} s')
# Feature Importance plot!
xgb.plot_importance(clf)

rapids_pred = clf.predict(dtest)

# Round the raw predictions to the nearest integer so they can be compared
# with the integer class labels.
rapids_pred = np.round(rapids_pred)
rapids_acc = round(accuracy_score(y_test, rapids_pred), 2)
print("XGB accuracy using RAPIDS:", rapids_acc*100, '%')

# --- Run 2: scikit-learn style XGBClassifier wrapper ---
# NOTE(review): both runs rely on their own library defaults, which may differ
# (e.g. the number of boosting rounds), so the two timings are presumably not
# a like-for-like comparison -- verify before drawing conclusions.
time_start=time.time()
model = XGBClassifier()
model.fit(X_train, y_train)
time_end=time.time()
times =  round(time_end-time_start)
print(f'SKlearn XGBoost: {times} s' )

sk_pred = model.predict(X_test)
sk_pred = np.round(sk_pred)
sk_acc = round(accuracy_score(y_test, sk_pred), 2)
print("XGB accuracy using Sklearn:", sk_acc*100, '%')

运行结果(Rapids_le.out):

X_train dimensions:  (6400000, 3) y_train:  (6400000,)
X_test dimensions: (1600000, 3) y_validation:  (1600000,)
X_train Percentage: 80.0 %
X_test Percentage: 20.0 %
[17:04:50] WARNING: /conda/conda-bld/xgboost_1571337679414/work/src/gbm/gbtree.cc:130: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
RAPIDS XGBoost: 52 s
XGB accuracy using RAPIDS: 98.0 %
[17:05:45] WARNING: /conda/conda-bld/xgboost_1571337679414/work/src/gbm/gbtree.cc:130: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
SKlearn XGBoost: 858 s
XGB accuracy using Sklearn: 98.0 %

数据分析 test python