如何在C++程序中调用lightgbm (How to use lightgbm in C++ program)

本文作者为tieying zhang,有任何问题请联系[email protected]

Lightgbm以轻量著称,所以在实际的C++程序中,常常需要使用。但是官方文档并没有介绍如何在C++中调用lightgbm接口,也没有任何例子可供参考,网上的文档也基本没有。这篇文章中我介绍下如何在C++中调用lightgbm。有任何问题请联系[email protected]

具体步骤如下:

  1. 首先需要下载lightgbm的源码包,从官网下载即可。官网也给出了如何编译,但是最后一定要sudo make install(这个官网没有给出)。
  2. C++调用的代码片段如下。首先load已经train好的model(以txt的形式存在磁盘上),之后用该模型进行inference,需要预测的数据可以是文件形式直接指定目录,也可以直接多行数据塞给模型。
  3. 编译C++文件:g++ -g -Wall -std=c++11 test.cpp -l_lightgbm 注意,这里链接的是-l_lightgbm,对应的共享库是lib_lightgbm.so,由上面的make install安装到/usr/local/lib下。如果运行时提示找不到该库,可以用whereis查看它的实际位置,并把相应目录加入动态库搜索路径,例如:export LD_LIBRARY_PATH=/lib:/usr/lib:/usr/local/lib
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include <LightGBM/c_api.h>

std::string predict(std::string data)
{
    std::string pred_result = "";
    int temp;
    int p = 1;
    BoosterHandle handle;  
    
    // load model 
    temp = LGBM_BoosterCreateFromModelfile("test_model1.txt", &p, &handle);    
    std::cout <<"load result value is "<<temp <<std::endl;
    
    // file data
    const char* para = "None";
    int res = LGBM_BoosterPredictForFile(handle, "test_data.csv", 0, C_API_PREDICT_NORMAL, 0, para, "result");
    std::cout << "file predict result is " << res << std::endl;
    
    // row data
    std::vector<int> row(40, 0);
    void* in_p = static_cast<void*>(row.data());
    
    std::vector<double> out(1, 0);
    double* out_result = static_cast<double*>(out.data());
    
    int64_t out_len;
    res = LGBM_BoosterPredictForMat(handle, in_p, C_API_DTYPE_FLOAT32, 1, 40, 1, C_API_PREDICT_NORMAL, 50, "None", &out_len, out_result);
    std::cout << "row predict return is " << res << std::endl;
    std::cout << "row predict result size is " << out.size() << " value is " << out[0] << std::endl;
   
    return pred_result; 
    /*I know the above return statement is completely insignificant. But i wanted to use the loaded model to predict the data points further.*/
}

// Entry point: runs the demo prediction once and reports completion.
int main() {
  // The returned string is intentionally discarded; predict() logs its own progress.
  static_cast<void>(predict("hahaha"));

  std::cout << "Ok complete!";
  std::cout << std::endl;
  return 0;
}

遇到的问题汇总:

  1. lib_lightgbm.so: cannot open shared object file: No such file or directory

export LD_LIBRARY_PATH=/lib:/usr/lib:/usr/local/lib

  2. 代码参照

data_size_t定义在include/LightGBM/meta.h

typedef int32_t data_size_t;
  1. 用C++解析输入file可以借鉴已有code:在application/predictor.hpp中。注意比较重要的是TextReader<data_size_t> predict_data_reader(data_filename, header),其中TextReader定义在include/LightGBM/utils/text_reader.h
  2. 真正的predict函数同样在application/predictor.hpp里:

    /*!
     * \brief Predicts on every line of a data file and writes the results to a result file.
     *
     * Each input line produces one output line: the num_pred_one_row_ predicted values
     * joined by tabs, followed by a newline. Output lines are written in input order.
     *
     * \param data_filename Filename of data
     * \param result_filename Filename of output result
     * \param header When true, the reader's first line is split into column names and
     *        used to map input columns onto the model's feature names
     */
  void Predict(const char* data_filename, const char* result_filename, bool header) {
    // Open the output first so an unusable result file is reported before any parsing.
    auto writer = VirtualFileWriter::Make(result_filename);
    if (!writer->Init()) {
      Log::Fatal("Prediction results file %s cannot be found", result_filename);
    }
    // Parser is configured with the model's feature count (MaxFeatureIdx() + 1) and label index.
    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, boosting_->LabelIdx()));

    if (parser == nullptr) {
      Log::Fatal("Could not recognize the data format of data file %s", data_filename);
    }

    TextReader<data_size_t> predict_data_reader(data_filename, header);
    // Maps a column position in the file header (label column removed) to the index of
    // the model feature with the same name. Only columns with a matching name are present.
    std::unordered_map<int, int> feature_names_map_;
    // Set when at least one mapped column sits at a different index than in the model.
    bool need_adjust = false;
    if (header) {
      std::string first_line = predict_data_reader.first_line();
      // Header fields are separated by tab or comma.
      std::vector<std::string> header_words = Common::Split(first_line.c_str(), "\t,");
      // Drop the entry at the label index -- presumably so the remaining positions line up
      // with parsed feature indices; NOTE(review): no bounds check on LabelIdx() here.
      header_words.erase(header_words.begin() + boosting_->LabelIdx());
      for (int i = 0; i < static_cast<int>(header_words.size()); ++i) {
        for (int j = 0; j < static_cast<int>(boosting_->FeatureNames().size()); ++j) {
          if (header_words[i] == boosting_->FeatureNames()[j]) {
            feature_names_map_[i] = j;
            break;
          }
        }
      }
      for (auto s : feature_names_map_) {
        if (s.first != s.second) {
          need_adjust = true;
          break;
        }
      }
    }
    // Parses one text line into (feature index, value) pairs and, when needed,
    // rewrites the indices to the model's feature order.
    std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
    // Receives the label parsed from each line; it is never read in this function.
    // NOTE(review): captured by reference below, so it is shared by every caller of
    // parser_fun, including the OpenMP loop further down.
    double tmp_label;
    parser_fun = [&]
    (const char* buffer, std::vector<std::pair<int, double>>* feature) {
      parser->ParseOneLine(buffer, feature, &tmp_label);
      if (need_adjust) {
        // In-place partition: [0, i) holds remapped features, [j, size) holds rejected ones.
        int i = 0, j = static_cast<int>(feature->size());
        while (i < j) {
          if (feature_names_map_.find((*feature)[i].first) != feature_names_map_.end()) {
            (*feature)[i].first = feature_names_map_[(*feature)[i].first];
            ++i;
          } else {
            // Column has no matching model feature: swap it to the tail; the element
            // swapped in is examined on the next iteration because i is not advanced.
            std::swap((*feature)[i], (*feature)[--j]);
          }
        }
        // Discard the unmatched features collected at the tail.
        feature->resize(i);
      }
    };

    // Handles one batch of lines handed over by the reader: predicts every line in
    // parallel, then writes the results sequentially. The first argument is unused.
    std::function<void(data_size_t, const std::vector<std::string>&)> process_fun = [&]
    (data_size_t, const std::vector<std::string>& lines) {
      std::vector<std::pair<int, double>> oneline_features;
      // One slot per input line, indexed by line position, so output order matches input order.
      std::vector<std::string> result_to_write(lines.size());
      // OMP_* macros are defined elsewhere; presumably they carry exceptions out of the
      // parallel region so OMP_THROW_EX can rethrow them -- confirm against their definition.
      OMP_INIT_EX();
      // firstprivate gives each thread its own copy of the scratch feature vector.
      #pragma omp parallel for schedule(static) firstprivate(oneline_features)
      for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
        OMP_LOOP_EX_BEGIN();
        oneline_features.clear();
        // Parse line i into feature pairs.
        parser_fun(lines[i].c_str(), &oneline_features);
        // Predict into a buffer of num_pred_one_row_ values and format them tab-separated.
        std::vector<double> result(num_pred_one_row_);
        predict_fun_(oneline_features, result.data());
        auto str_result = Common::Join<double>(result, "\t");
        result_to_write[i] = str_result;
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
      // Write outside the parallel loop, one result per line.
      for (data_size_t i = 0; i < static_cast<data_size_t>(result_to_write.size()); ++i) {
        writer->Write(result_to_write[i].c_str(), result_to_write[i].size());
        writer->Write("\n", 1);
      }
    };
    // Drive the whole file through process_fun.
    predict_data_reader.ReadAllAndProcessParallel(process_fun);
  }