OmniSciDB 72c90bc290
OneHotEncoder.cpp
/*
 * Copyright 2023 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __CUDACC__

#include "OneHotEncoder.h"
#include "Shared/ThreadInfo.h"

#include <tbb/parallel_for.h>
#include <tbb/parallel_sort.h>

namespace TableFunctions_Namespace {

namespace OneHotEncoder_Namespace {

NEVER_INLINE HOST std::pair<std::vector<int32_t>, bool> get_top_k_keys(
    const Column<TextEncodingDict>& text_col,
    const int32_t top_k,
    const double min_perc_col_total_per_key) {
  auto timer = DEBUG_TIMER(__func__);
  const auto col_min_max = get_column_min_max(text_col);
  const int32_t col_size = text_col.size();
  const int32_t col_range = col_min_max.second - col_min_max.first + 1;

  // Todo(todd): Calculate the counts in parallel when `col_range` is
  // relatively small, and better still, vary the number of threads
  // based on that range (an illustrative sketch follows this function).
  // There is a tradeoff between parallelism and the memory pressure and
  // performance overhead of allocating separate count buffers per thread.
  // When `col_range` is relatively small, it will be advantageous to maximize
  // parallelism, but when `col_range` is large, it will likely be better to
  // use a single thread or only a few threads.

  // Note that when trying to parallelize this method with
  // tbb::concurrent_vector, the runtime went up 10X,
  // from roughly 100ms for a 120M-record column to 1000ms.
  std::vector<int32_t> key_counts(col_range, 0);
  for (int32_t idx = 0; idx < col_size; ++idx) {
    if (!text_col.isNull(idx)) {
      const int32_t key_idx = text_col[idx] - col_min_max.first;
      key_counts[key_idx]++;
    }
  }

  std::vector<int32_t> permutation_idxs(col_range);

  tbb::parallel_for(tbb::blocked_range<int32_t>(0, col_range),
                    [&](const tbb::blocked_range<int32_t>& r) {
                      const int32_t r_end = r.end();
                      for (int32_t p = r.begin(); p < r_end; ++p) {
                        permutation_idxs[p] = p;
                      }
                    });

  // Sort key_counts in descending order
  tbb::parallel_sort(
      permutation_idxs.begin(),
      permutation_idxs.begin() + col_range,
      [&](const int32_t& a, const int32_t& b) { return key_counts[a] > key_counts[b]; });
  int32_t actual_top_k = std::min(col_range, top_k);
  std::vector<int32_t> top_k_keys;
  top_k_keys.reserve(actual_top_k);
  const float col_size_fp = static_cast<float>(col_size);
  int32_t k = 0;
  // Todo(todd): Optimize the below code to do a binary search to find
  // the first key (if it exists) where key_count_perc < min_perc_col_total_per_key,
  // and then do a parallel for pass to write to the `top_k_keys` vec
  // (a sketch of this also follows this function).
  for (; k < actual_top_k; ++k) {
    const int32_t key_counts_idx = permutation_idxs[k];
    const int32_t key_count = key_counts[key_counts_idx];
    const float key_count_perc = key_count / col_size_fp;
    if (key_count_perc < min_perc_col_total_per_key) {
      break;
    }
    top_k_keys.emplace_back(key_counts_idx + col_min_max.first);
  }
  const bool has_other_keys = k < col_range && key_counts[permutation_idxs[k]] > 0;
  return std::make_pair(top_k_keys, has_other_keys);
}
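
// A minimal sketch of the parallel counting suggested by the first Todo in
// get_top_k_keys above: each worker counts keys over its own slice of the
// column into a private histogram, and the per-thread histograms are summed
// afterwards, trading `num_threads` extra buffers of size `col_range` for
// contention-free counting. A caller would pick `num_threads` based on
// `col_range` (more threads for small ranges, fewer for large ones), per the
// tradeoff described in that Todo. The helper below is illustrative only (it
// is not called anywhere in this file), its name and signature are
// assumptions, and it expects num_threads >= 1.
NEVER_INLINE HOST std::vector<int32_t> count_keys_parallel_sketch(
    const Column<TextEncodingDict>& text_col,
    const int32_t col_min,
    const int32_t col_range,
    const int64_t num_threads) {
  const int64_t col_size = text_col.size();
  const int64_t chunk_size = (col_size + num_threads - 1) / num_threads;
  std::vector<std::vector<int32_t>> per_thread_counts(
      num_threads, std::vector<int32_t>(col_range, 0));
  std::vector<std::future<void>> workers;
  for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
    workers.emplace_back(std::async(std::launch::async, [&, thread_idx] {
      const int64_t start_idx = thread_idx * chunk_size;
      const int64_t end_idx = std::min(start_idx + chunk_size, col_size);
      auto& local_counts = per_thread_counts[thread_idx];
      for (int64_t idx = start_idx; idx < end_idx; ++idx) {
        if (!text_col.isNull(idx)) {
          const int32_t key_idx = text_col[idx] - col_min;
          local_counts[key_idx]++;
        }
      }
    }));
  }
  for (auto& worker : workers) {
    worker.wait();
  }
  // Reduce the per-thread histograms into a single count vector.
  std::vector<int32_t> key_counts(col_range, 0);
  for (const auto& local_counts : per_thread_counts) {
    for (int32_t key_idx = 0; key_idx < col_range; ++key_idx) {
      key_counts[key_idx] += local_counts[key_idx];
    }
  }
  return key_counts;
}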
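
// A minimal sketch of the binary-search optimization suggested by the second
// Todo in get_top_k_keys: because the counts, viewed through the sorted
// `permutation_idxs`, are non-increasing, std::partition_point can locate the
// first key whose proportion of the column total drops below
// `min_perc_col_total_per_key` in O(log k) predicate evaluations; the keys
// before the cutoff could then be written to `top_k_keys` with a
// tbb::parallel_for pass. Illustrative only; the helper name is an assumption
// and <algorithm> is assumed reachable through the existing includes.
inline int32_t find_top_k_cutoff_sketch(const std::vector<int32_t>& permutation_idxs,
                                        const std::vector<int32_t>& key_counts,
                                        const int32_t actual_top_k,
                                        const float col_size_fp,
                                        const double min_perc_col_total_per_key) {
  const auto cutoff_itr =
      std::partition_point(permutation_idxs.begin(),
                           permutation_idxs.begin() + actual_top_k,
                           [&](const int32_t key_counts_idx) {
                             return key_counts[key_counts_idx] / col_size_fp >=
                                    min_perc_col_total_per_key;
                           });
  return static_cast<int32_t>(cutoff_itr - permutation_idxs.begin());
}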

template <typename F>
NEVER_INLINE HOST std::vector<std::vector<F>> allocate_one_hot_cols(
    const int64_t num_one_hot_cols,
    const int64_t col_size) {
  std::vector<std::vector<F>> one_hot_allocated_buffers(num_one_hot_cols);
  const int64_t target_num_col_allocations_per_thread =
      std::ceil(100000.0 / (col_size + 1));
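  // For example, a 1M-row column yields a target of 1 column allocation per
  // thread, while a 10K-row column yields a target of 10, so each thread is
  // asked to handle more allocations when the individual columns are cheap.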
  const ThreadInfo thread_info(std::thread::hardware_concurrency(),
                               num_one_hot_cols,
                               target_num_col_allocations_per_thread);
  CHECK_GE(thread_info.num_threads, 1L);
  CHECK_GE(thread_info.num_elems_per_thread, 1L);
  std::vector<std::future<void>> allocator_threads;
  for (int64_t col_idx = 0; col_idx < num_one_hot_cols;
       col_idx += thread_info.num_elems_per_thread) {
    allocator_threads.emplace_back(std::async(
        std::launch::async,
        [&one_hot_allocated_buffers, num_one_hot_cols, col_size, &thread_info](
            const int64_t start_col_idx) {
          const int64_t end_col_idx = std::min(
              start_col_idx + thread_info.num_elems_per_thread, num_one_hot_cols);
          for (int64_t alloc_col_idx = start_col_idx; alloc_col_idx < end_col_idx;
               ++alloc_col_idx) {
            one_hot_allocated_buffers[alloc_col_idx].resize(col_size, 0);
          }
        },
        col_idx));
  }
  return one_hot_allocated_buffers;
}

template NEVER_INLINE HOST std::vector<std::vector<float>> allocate_one_hot_cols(
    const int64_t num_one_hot_cols,
    const int64_t col_size);
template NEVER_INLINE HOST std::vector<std::vector<double>> allocate_one_hot_cols(
    const int64_t num_one_hot_cols,
    const int64_t col_size);

std::pair<int32_t, int32_t> get_min_max_keys(const std::vector<int32_t>& top_k_keys) {
  int32_t min_key = std::numeric_limits<int32_t>::max();
  int32_t max_key = std::numeric_limits<int32_t>::lowest();
  for (const auto& key : top_k_keys) {
    if (key == StringDictionary::INVALID_STR_ID) {
      continue;
    }
    if (key < min_key) {
      min_key = key;
    }
    if (key > max_key) {
      max_key = key;
    }
  }
  return std::make_pair(min_key, max_key);
}

constexpr int16_t INVALID_COL_IDX{-1};

struct KeyToOneHotColBytemap {
  KeyToOneHotColBytemap(const std::vector<int32_t>& top_k_keys,
                        const int32_t min_key,
                        const int32_t max_key,
                        const bool has_other_key)
      : min_key_(min_key)
      , max_key_(max_key)
      , has_other_key_(has_other_key)
      , other_key_(top_k_keys.size())
      , bytemap_(init_bytemap(top_k_keys, min_key, max_key, has_other_key)) {}

  static std::vector<int16_t> init_bytemap(const std::vector<int32_t>& top_k_keys,
                                           const int32_t min_key,
                                           const int32_t max_key,
                                           const bool has_other_key) {
    // The bytemap can be quite large if the dictionary-encoded key range is large,
    // so for efficiency we store the offsets as int16_t. Since we use
    // `top_k_keys.size()` as the sentinel for the OTHER key, we check that
    // top_k_keys.size() is smaller than the maximum allowable value for int16_t.
    if (static_cast<int64_t>(top_k_keys.size()) >= std::numeric_limits<int16_t>::max()) {
      std::ostringstream error_oss;
      error_oss << "Error: More than " << std::numeric_limits<int16_t>::max() - 1
                << " top k categorical keys not allowed.";
      throw std::runtime_error(error_oss.str());
    }
    std::vector<int16_t> bytemap(max_key - min_key + 1,
                                 has_other_key ? top_k_keys.size() : INVALID_COL_IDX);
    int16_t offset = 0;
    for (const auto& key : top_k_keys) {
      bytemap[key - min_key] = offset++;
    }
    return bytemap;
  }

  inline int16_t get_col_idx_for_key(const int32_t key) const {
    if (key < min_key_ || key > max_key_) {
      return has_other_key_ ? other_key_ : INVALID_COL_IDX;
    }
    return bytemap_[key - min_key_];
  }

  const int32_t min_key_;
  const int32_t max_key_;
  const bool has_other_key_;
  const int32_t other_key_;
  const std::vector<int16_t> bytemap_;
};
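
// A hedged usage sketch (not part of the encoder; the helper name is
// illustrative): with top-k keys {7, 10, 42}, init_bytemap assigns one-hot
// columns in key order, so key 7 -> column 0, 10 -> column 1, 42 -> column 2,
// while any other key inside the [7, 42] range falls through to the OTHER
// column (index 3) when has_other_key is true, or to INVALID_COL_IDX otherwise.
inline void key_to_one_hot_bytemap_usage_sketch() {
  const std::vector<int32_t> example_top_k_keys{7, 10, 42};
  const auto [example_min_key, example_max_key] = get_min_max_keys(example_top_k_keys);
  const KeyToOneHotColBytemap example_bytemap(
      example_top_k_keys, example_min_key, example_max_key, /*has_other_key=*/true);
  CHECK(example_bytemap.get_col_idx_for_key(10) == 1);
  CHECK(example_bytemap.get_col_idx_for_key(9) == 3);  // maps to the OTHER column
}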

template <typename F>
NEVER_INLINE HOST OneHotEncodedCol<F> one_hot_encode(
    const Column<TextEncodingDict>& text_col,
    const TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo&
        one_hot_encoding_info) {
  auto timer = DEBUG_TIMER(__func__);
  CHECK(one_hot_encoding_info.is_one_hot_encoded);
  OneHotEncodedCol<F> one_hot_encoded_col;
  bool include_others_key = false;
  std::vector<int> top_k_keys;
  if (one_hot_encoding_info.cat_features.empty()) {
    CHECK_GT(one_hot_encoding_info.top_k_attrs, 1);
    CHECK_GE(one_hot_encoding_info.min_attr_proportion, 0);

    const auto [top_k_keys_temp, has_other_keys] =
        get_top_k_keys(text_col,
                       one_hot_encoding_info.top_k_attrs,
                       one_hot_encoding_info.min_attr_proportion);
    top_k_keys = top_k_keys_temp;
    include_others_key = one_hot_encoding_info.include_others_attr && has_other_keys;
    // If the top k keys comprise all keys (i.e. k == n) and no other key is
    // requested, then remove the least common key, as otherwise we overdetermine
    // the degrees of freedom and can get strange regression coefficients/results.
    // Note that we do not remove a key if there is only one key.
    if (!has_other_keys && !one_hot_encoding_info.include_others_attr &&
        top_k_keys.size() > 1) {
      top_k_keys.pop_back();
    }
    for (const auto top_k_key : top_k_keys) {
      one_hot_encoded_col.cat_features.emplace_back(
          text_col.string_dict_proxy_->getString(top_k_key));
    }
  } else {
    one_hot_encoded_col.cat_features = one_hot_encoding_info.cat_features;
    for (const auto& cat_feature : one_hot_encoded_col.cat_features) {
      top_k_keys.emplace_back(text_col.string_dict_proxy_->getIdOfString(cat_feature));
    }
  }

  const int64_t num_one_hot_cols = top_k_keys.size() + (include_others_key ? 1 : 0);
  const int64_t col_size = text_col.size();
  one_hot_encoded_col.encoded_buffers =
      allocate_one_hot_cols<F>(num_one_hot_cols, col_size);
  constexpr int64_t max_bytemap_size = 10000000L;
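  // With int16_t entries, a bytemap at this cap is roughly 20MB (10M * 2 bytes).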

  const auto [min_key, max_key] = get_min_max_keys(top_k_keys);
  const int64_t key_range = max_key - min_key + 1;
  if (key_range > max_bytemap_size) {
    throw std::runtime_error(
        "One-hot vectors currently can only be generated on string columns with less "
        "than " +
        std::to_string(max_bytemap_size) + " unique entries.");
  }
  const KeyToOneHotColBytemap key_to_one_hot_bytemap(
      top_k_keys, min_key, max_key, include_others_key);

  tbb::parallel_for(tbb::blocked_range<int64_t>(0, col_size),
                    [&](const tbb::blocked_range<int64_t>& r) {
                      const int64_t r_end = r.end();
                      for (int64_t row_idx = r.begin(); row_idx < r_end; ++row_idx) {
                        const int32_t key = text_col[row_idx];
                        const auto col_idx =
                            key_to_one_hot_bytemap.get_col_idx_for_key(key);
                        if (col_idx != INVALID_COL_IDX) {
                          one_hot_encoded_col.encoded_buffers[col_idx][row_idx] = 1.;
                        }
                      }
                    });

  return one_hot_encoded_col;
}

template NEVER_INLINE HOST OneHotEncodedCol<float> one_hot_encode(
    const Column<TextEncodingDict>& text_col,
    const TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo&
        one_hot_encoding_info);

template NEVER_INLINE HOST OneHotEncodedCol<double> one_hot_encode(
    const Column<TextEncodingDict>& text_col,
    const TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo&
        one_hot_encoding_info);

template <typename F>
NEVER_INLINE HOST std::vector<OneHotEncodedCol<F>> one_hot_encode(
    const ColumnList<TextEncodingDict>& text_cols,
    const std::vector<
        TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>&
        one_hot_encoding_infos) {
  const int64_t num_input_cols = text_cols.numCols();
  // std::vector<std::vector<std::vector<F>>> one_hot_buffers;
  std::vector<OneHotEncodedCol<F>> one_hot_encoded_cols;
  one_hot_encoded_cols.reserve(num_input_cols);
  for (int64_t input_col_idx = 0; input_col_idx < num_input_cols; ++input_col_idx) {
    Column<TextEncodingDict> dummy_text_col(
        reinterpret_cast<TextEncodingDict*>(text_cols.ptrs_[input_col_idx]),
        text_cols.num_rows_,
        text_cols.string_dict_proxies_[input_col_idx]);
    one_hot_encoded_cols.emplace_back(
        one_hot_encode<F>(dummy_text_col, one_hot_encoding_infos[input_col_idx]));
  }
  return one_hot_encoded_cols;
}

template NEVER_INLINE HOST std::vector<OneHotEncodedCol<float>> one_hot_encode(
    const ColumnList<TextEncodingDict>& text_cols,
    const std::vector<
        TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>&
        one_hot_encoding_infos);

template NEVER_INLINE HOST std::vector<OneHotEncodedCol<double>> one_hot_encode(
    const ColumnList<TextEncodingDict>& text_cols,
    const std::vector<
        TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>&
        one_hot_encoding_infos);

}  // namespace OneHotEncoder_Namespace

}  // namespace TableFunctions_Namespace

#endif  // #ifndef __CUDACC__