如何在C++程序中调用lightgbm (How to use lightgbm in C++ program)
本文作者为tieying zhang,有任何问题请联系[email protected]
Lightgbm以轻量著称,所以在实际的C++程序中,常常需要使用。但是官方文档并没有介绍如何在C++中调用lightgbm接口,也没有任何例子可供参考,网上的文档也基本没有。这篇文章中我介绍下如何在C++中调用lightgbm。有任何问题请联系[email protected]
具体步骤如下:
- 首先需要下载lightgbm的源码包,从官网下载即可。官网也给出了如何编译,但是最后一定要sudo make install(这个官网没有给出)。
- C++调用的代码片段如下。首先load已经train好的model(以txt的形式存在磁盘上),之后用该模型进行inference。需要预测的数据既可以以文件形式给出(直接指定文件路径),也可以把一行或多行数据直接传给模型。
- 编译C++文件:
g++ -g -Wall -std=c++11 test.cpp -l_lightgbm
注意,这里用到了-l_lightgbm,它链接的是lib_lightgbm.so,这个.so库由上面的make install直接放到了/usr/local/lib下。如果运行时找不到该库,可以用whereis查看它的位置,并把相应目录加入到库搜索路径里,如:export LD_LIBRARY_PATH=/lib:/usr/lib:/usr/local/lib
#include <LightGBM/c_api.h>

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

/*!
 * \brief Demo: load a trained LightGBM model from disk and run two predictions
 *        with it, one over a CSV file and one over a single in-memory row.
 *
 * Return codes of the C API calls are printed to stdout. The file prediction
 * writes its output to the file "result".
 *
 * \param data Unused by this example; kept so the signature stays stable for
 *             callers that already pass a payload.
 * \return Always an empty string; this example only prints its results.
 */
std::string predict(std::string data) {
  (void)data;  // not consumed yet, see the note before the return statement
  std::string pred_result = "";

  // load model
  int num_iterations = 0;  // filled in by the loader
  BoosterHandle handle = nullptr;
  const int load_status =
      LGBM_BoosterCreateFromModelfile("test_model1.txt", &num_iterations, &handle);
  std::cout << "load result value is " << load_status << std::endl;
  if (load_status != 0) {
    // Without a valid booster every later call would operate on a garbage handle.
    std::cerr << "failed to load model: " << LGBM_GetLastError() << std::endl;
    return pred_result;
  }

  // file data
  const char* para = "None";
  int res = LGBM_BoosterPredictForFile(handle, "test_data.csv", 0, C_API_PREDICT_NORMAL,
                                       0, para, "result");
  std::cout << "file predict result is " << res << std::endl;

  // row data
  // The buffer element type must match the dtype flag passed below
  // (C_API_DTYPE_FLOAT32), so the row is stored as float rather than int.
  std::vector<float> row(40, 0.0f);
  std::vector<double> out(1, 0.0);
  int64_t out_len = 0;
  // NOTE(review): argument list follows the C API version this example was
  // written against; confirm it matches the installed c_api.h.
  res = LGBM_BoosterPredictForMat(handle, row.data(), C_API_DTYPE_FLOAT32, 1, 40, 1,
                                  C_API_PREDICT_NORMAL, 50, "None", &out_len, out.data());
  std::cout << "row predict return is " << res << std::endl;
  std::cout << "row predict result size is " << out.size() << " value is " << out[0] << std::endl;

  // Release the booster so the example does not leak the loaded model.
  LGBM_BoosterFree(handle);

  // The returned string carries no information yet; it is a placeholder for
  // passing prediction results back once the loaded model is used further.
  return pred_result;
}

int main() {
  predict("hahaha");
  std::cout << "Ok complete!" << std::endl;
  return 0;
}
遇到的问题汇总:
- lib_lightgbm.so: cannot open shared object file: No such file or directory
export LD_LIBRARY_PATH=/lib:/usr/lib:/usr/local/lib
- 代码参照
data_size_t定义在include/LightGBM/meta.h中:
typedef int32_t data_size_t;
- 用C++解析输入file可以借鉴已有code:在application/predictor.hpp中。注意比较重要的是TextReader<data_size_t> predict_data_reader(data_filename, header),它使用了utils下面的utils/text_reader.h
真正的Predict函数同样在application/predictor.hpp里:
/*!
- brief predicting on data, then saving result to disk
- param data_filename Filename of data
- param result_filename Filename of output result
*/ void Predict(const char* data_filename, const char* result_filename, bool header) { auto writer = VirtualFileWriter::Make(result_filename); if (!writer->Init()) { Log::Fatal("Prediction results file %s cannot be found", result_filename); } auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, boosting_->LabelIdx())); if (parser == nullptr) { Log::Fatal("Could not recognize the data format of data file %s", data_filename); } TextReader<data_size_t> predict_data_reader(data_filename, header); std::unordered_map<int, int> feature_names_map_; bool need_adjust = false; if (header) { std::string first_line = predict_data_reader.first_line(); std::vector<std::string> header_words = Common::Split(first_line.c_str(), "\t,"); header_words.erase(header_words.begin() + boosting_->LabelIdx()); for (int i = 0; i < static_cast<int>(header_words.size()); ++i) { for (int j = 0; j < static_cast<int>(boosting_->FeatureNames().size()); ++j) { if (header_words[i] == boosting_->FeatureNames()[j]) { feature_names_map_[i] = j; break; } } } for (auto s : feature_names_map_) { if (s.first != s.second) { need_adjust = true; break; } } } // function for parse data std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun; double tmp_label; parser_fun = [&] (const char* buffer, std::vector<std::pair<int, double>>* feature) { parser->ParseOneLine(buffer, feature, &tmp_label); if (need_adjust) { int i = 0, j = static_cast<int>(feature->size()); while (i < j) { if (feature_names_map_.find((*feature)[i].first) != feature_names_map_.end()) { (*feature)[i].first = feature_names_map_[(*feature)[i].first]; ++i; } else { //move the non-used features to the end of the feature vector std::swap((*feature)[i], (*feature)[--j]); } } feature->resize(i); } }; std::function<void(data_size_t, const std::vector<std::string>&)> process_fun = [&] (data_size_t, const std::vector<std::string>& lines) { std::vector<std::pair<int, 
double>> oneline_features; std::vector<std::string> result_to_write(lines.size()); OMP_INIT_EX(); #pragma omp parallel for schedule(static) firstprivate(oneline_features) for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) { OMP_LOOP_EX_BEGIN(); oneline_features.clear(); // parser parser_fun(lines[i].c_str(), &oneline_features); // predict std::vector<double> result(num_pred_one_row_); predict_fun_(oneline_features, result.data()); auto str_result = Common::Join<double>(result, "\t"); result_to_write[i] = str_result; OMP_LOOP_EX_END(); } OMP_THROW_EX(); for (data_size_t i = 0; i < static_cast<data_size_t>(result_to_write.size()); ++i) { writer->Write(result_to_write[i].c_str(), result_to_write[i].size()); writer->Write("\n", 1); } }; predict_data_reader.ReadAllAndProcessParallel(process_fun); }