From 0cb436db5cd5d5791404da8fc050d36d8e93d4bc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 5 Dec 2023 14:01:17 +0000 Subject: [PATCH] templates for bins in pairwise ranking dataset --- include/LightGBM/bin.h | 4 +- include/LightGBM/dataset.h | 4 ++ .../LightGBM/pairwise_ranking_feature_group.h | 12 +++--- src/io/bin.cpp | 26 ++++++------- src/io/dataset.cpp | 4 +- src/io/pairwise_lambdarank_bin.hpp | 37 +++++++------------ src/io/pairwise_ranking_feature_group.cpp | 19 +++------- 7 files changed, 46 insertions(+), 60 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index be306acf6928..50adb57714db 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -474,7 +474,7 @@ class Bin { * \return The bin data object */ template typename PAIRWISE_BIN_TYPE> - static Bin* CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + static Bin* CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Create object for bin data of one feature, used for pairwise ranking, for an original sparse bin @@ -484,7 +484,7 @@ class Bin { * \return The bin data object */ template typename PAIRWISE_BIN_TYPE> - static Bin* CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + static Bin* CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); /*! * \brief Deep copy the bin diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 00281367b093..79879bc4ad41 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -269,6 +269,10 @@ class Metadata { } } + inline data_size_t paired_ranking_item_index_map_size() const { + return static_cast(paired_ranking_item_index_map_.size()); + } + /*! * \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, diff --git a/include/LightGBM/pairwise_ranking_feature_group.h b/include/LightGBM/pairwise_ranking_feature_group.h index 2055855b775c..ed0953e97d0f 100644 --- a/include/LightGBM/pairwise_ranking_feature_group.h +++ b/include/LightGBM/pairwise_ranking_feature_group.h @@ -28,8 +28,8 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param is_first_or_second_in_pairing Mark whether features in this group belong to the first or second element in the pairing */ - PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_data, const int is_first_or_second_in_pairing): - FeatureGroup(other, num_data), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} + PairwiseRankingFeatureGroup(const FeatureGroup& other, int num_original_data, const int is_first_or_second_in_pairing, int num_pairs, const std::pair* paired_ranking_item_index_map): + FeatureGroup(other, num_original_data), paired_ranking_item_index_map_(paired_ranking_item_index_map), num_data_(num_pairs), is_first_or_second_in_pairing_(is_first_or_second_in_pairing) {} /*! * \brief Constructor from memory when data is present @@ -62,11 +62,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { * \param memory Pointer of memory * \param group_id Id of group */ - const char* LoadDefinitionFromMemory(const void* memory, int group_id) { + const char* LoadDefinitionFromMemory(const void* /*memory*/, int /*group_id*/) { // TODO(shiyu1994) } - inline BinIterator* SubFeatureIterator(int sub_feature) { + inline BinIterator* SubFeatureIterator(int /*sub_feature*/) { // TODO(shiyu1994) } @@ -79,11 +79,11 @@ class PairwiseRankingFeatureGroup: public FeatureGroup { } private: - template + template typename PAIRWISE_BIN_TYPE> void CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse); void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) override; - + /*! \brief Pairwise data index to original data indices for ranking with pairwise features */ const std::pair* paired_ranking_item_index_map_; /*! \brief Number of pairwise data */ diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 95626d150d96..6deabe562ca9 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -634,36 +634,36 @@ namespace LightGBM { } template typename PAIRWISE_BIN_TYPE> - Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 16) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new DenseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new DenseBin(num_original_data)); } } template typename PAIRWISE_BIN_TYPE> - Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin) { + Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map) { if (num_bin <= 256) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } else if (num_bin <= 65536) { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } else { - return new PAIRWISE_BIN_TYPE>(num_data, paired_ranking_item_index_map, new SparseBin(num_data), min_bin, max_bin, most_freq_bin); + return new PAIRWISE_BIN_TYPE>(num_pairs, paired_ranking_item_index_map, new SparseBin(num_original_data)); } } - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateSparsePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); - template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_data, int num_bin, const std::pair* paired_ranking_item_index_map, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin); + template Bin* Bin::CreateDensePairwiseRankingBin(data_size_t num_original_data, int num_bin, data_size_t num_pairs, const std::pair* paired_ranking_item_index_map); MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate, const std::vector& offsets) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 4dafa4ef5ced..0b4985c5a2ee 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -820,7 +820,7 @@ void Dataset::CreateValid(const Dataset* dataset) { gpu_device_id_ = dataset->gpu_device_id_; } -void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> pair_index_map) { +void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vector> /* TODO(shiyu1994) pair_index_map*/) { metadata_.BuildPairwiseFeatureRanking(dataset->metadata()); feature_groups_.clear(); @@ -859,7 +859,7 @@ void Dataset::CreatePairWiseRankingData(const Dataset* dataset, std::vectorfeature2subfeature_[original_group_feature_start + feature_index_in_group]); cur_feature_index += 1; } - feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing)); + feature_groups_.emplace_back(new PairwiseRankingFeatureGroup(*dataset->feature_groups_[original_group_index].get(), num_data_, is_first_or_second_in_pairing, metadata_.paired_ranking_item_index_map_size(), metadata_.paired_ranking_item_index_map())); num_total_bin += dataset->FeatureGroupNumBin(original_group_index); group_bin_boundaries_.push_back(num_total_bin); group_feature_cnt_[i] = dataset->group_feature_cnt_[original_group_index]; diff --git a/src/io/pairwise_lambdarank_bin.hpp b/src/io/pairwise_lambdarank_bin.hpp index 1e177c694170..c3d1854acda3 100644 --- a/src/io/pairwise_lambdarank_bin.hpp +++ b/src/io/pairwise_lambdarank_bin.hpp @@ -109,44 +109,33 @@ class PairwiseRankingSecondIterator: public BinIterator { data_size_t prev_index_; }; -template -class PairwiseRankingFirstBin: public BIN_TYPE { +template typename ITERATOR_TYPE> +class PairwiseRankingBin: public BIN_TYPE { public: - PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { - paired_ranking_item_index_map_ = paired_ranking_item_index_map; + PairwiseRankingBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): BIN_TYPE(0), paired_ranking_item_index_map_(paired_ranking_item_index_map), unpaired_bin_(unpaired_bin) { num_data_ = num_data; } BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { - return new PairwiseRankingFirstIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); + return new ITERATOR_TYPE(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin, max_bin, most_freq_bin); } - private: + protected: const std::pair* paired_ranking_item_index_map_; const std::shared_ptr unpaired_bin_; - const uint32_t min_bin_; - const uint32_t max_bin_; - const uint32_t most_freq_bin_; + data_size_t num_data_; }; template -class PairwiseRankingSecondBin: public BIN_TYPE { +class PairwiseRankingFirstBin: public PairwiseRankingBin { public: - PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin, const uint32_t min_bin, const uint32_t max_bin, const uint32_t most_freq_bin): BIN_TYPE(0), unpaired_bin_(unpaired_bin), min_bin_(min_bin), max_bin_(max_bin), most_freq_bin_(most_freq_bin) { - paired_ranking_item_index_map_ = paired_ranking_item_index_map; - num_data_ = num_data; - } - - BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override { - return new PairwiseRankingSecondIterator(unpaired_bin_.get(), paired_ranking_item_index_map_, min_bin_, max_bin_, most_freq_bin_); - } + PairwiseRankingFirstBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} +}; - private: - const std::pair* paired_ranking_item_index_map_; - const std::shared_ptr unpaired_bin_; - const uint32_t min_bin_; - const uint32_t max_bin_; - const uint32_t most_freq_bin_; +template +class PairwiseRankingSecondBin: public PairwiseRankingBin { + public: + PairwiseRankingSecondBin(data_size_t num_data, const std::pair* paired_ranking_item_index_map, const BIN_TYPE* unpaired_bin): PairwiseRankingBin(num_data, paired_ranking_item_index_map, unpaired_bin) {} }; } // LightGBM diff --git a/src/io/pairwise_ranking_feature_group.cpp b/src/io/pairwise_ranking_feature_group.cpp index f0b9485fb266..cdde9fbaadc2 100644 --- a/src/io/pairwise_ranking_feature_group.cpp +++ b/src/io/pairwise_ranking_feature_group.cpp @@ -11,24 +11,17 @@ namespace LightGBM { template typename PAIRWISE_BIN_TYPE> void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + CHECK(!is_multi_val); // do not support multi-value bin for now if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { - uint32_t most_freq_bin = bin_mappers_[i]->GetMostFreqBin(); - int addi = most_freq_bin == 0 ? 0 : 1; - if (!is_multi_val) { - uint32_t min_bin = bin_offsets_[i]; - uint32_t max_bin = bin_offsets_[i + 1] - 1; - } else { - uint32_t min_bin = 1; - uint32_t max_bin = bin_mappers_[i]->num_bin() - 1 + addi; - } + int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1; if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) { multi_bin_data_.emplace_back(Bin::CreateSparsePairwiseRankingBin( - num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } else { multi_bin_data_.emplace_back( - Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, paired_ranking_item_index_map_)); + Bin::CreateDensePairwiseRankingBin(num_data, bin_mappers_[i]->num_bin() + addi, num_data_, paired_ranking_item_index_map_)); } } is_multi_val_ = true; @@ -37,10 +30,10 @@ void PairwiseRankingFeatureGroup::CreateBinDataInner(int num_data, bool is_multi (!force_dense && num_feature_ == 1 && bin_mappers_[0]->sparse_rate() >= kSparseThreshold)) { is_sparse_ = true; - bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + bin_data_.reset(Bin::CreateSparsePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } else { is_sparse_ = false; - bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, paired_ranking_item_index_map_)); + bin_data_.reset(Bin::CreateDensePairwiseRankingBin(num_data, num_total_bin_, num_data_, paired_ranking_item_index_map_)); } is_multi_val_ = false; }