OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
OneDalFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 #ifdef HAVE_ONEDAL
21 
24 #include "daal.h"
25 
26 using namespace daal::algorithms;
27 using namespace daal::data_management;
28 
29 template <typename T>
30 const NumericTablePtr prepare_data_table(const T* data,
31 
32  const int64_t num_rows) {
33  // Prepare input data as structure of arrays (SOA) as columnar format (zero-copy)
34  const auto data_table = SOANumericTable::create(1 /* num_columns */, num_rows);
35  data_table->setArray<T>(const_cast<T*>(data), 0);
36  return data_table;
37 }
38 
39 template <typename T>
40 // const NumericTablePtr prepare_data_table(const std::vector<std::vector<T>>& data,
41 const NumericTablePtr prepare_data_table(const std::vector<const T*>& data,
42  const int64_t num_rows) {
43  // Data dimensions
44  const size_t num_columns = data.size();
45 
46  // Prepare input data as structure of arrays (SOA) as columnar format (zero-copy)
47  const auto data_table = SOANumericTable::create(num_columns, num_rows);
48  for (size_t i = 0; i < num_columns; ++i) {
49  data_table->setArray<T>(const_cast<T*>(data[i]), i);
50  }
51  return data_table;
52 }
53 
54 template <typename T>
55 const NumericTablePtr prepare_pivoted_data_table(const T* data, const int64_t num_elems) {
56  // Data dimensions
57  // Prepare input data as structure of arrays (SOA) as columnar format (zero-copy)
58  const auto data_table = SOANumericTable::create(num_elems, 1);
59  for (size_t c = 0; c < static_cast<size_t>(num_elems); ++c) {
60  data_table->setArray<T>(const_cast<T*>(data) + c, c);
61  }
62  return data_table;
63 }
64 
65 kmeans::init::Method get_kmeans_init_type(const KMeansInitStrategy init_type) {
66  const static std::map<KMeansInitStrategy, kmeans::init::Method> kmeans_init_type_map = {
67  {KMeansInitStrategy::DEFAULT, kmeans::init::Method::deterministicDense},
68  {KMeansInitStrategy::DETERMINISTIC, kmeans::init::Method::deterministicDense},
69  {KMeansInitStrategy::RANDOM, kmeans::init::Method::randomDense},
70  {KMeansInitStrategy::PLUS_PLUS, kmeans::init::Method::parallelPlusDense}};
71 
72  const auto itr = kmeans_init_type_map.find(init_type);
73  if (itr == kmeans_init_type_map.end()) {
74  std::ostringstream oss;
75  oss << "Invalid Kmeans cluster centroid initialization type. "
76  << "Was expecting one of DETERMINISTIC, RANDOM, or PLUS_PLUS.";
77  throw std::runtime_error(oss.str());
78  }
79  return itr->second;
80 }
81 
82 template <typename T, kmeans::init::Method M>
83 const NumericTablePtr init_centroids_for_type(const NumericTablePtr& input_features_table,
84  const int32_t num_clusters) {
85  kmeans::init::Batch<T, M> init(num_clusters);
86  init.input.set(kmeans::init::data, input_features_table);
87  init.compute();
88  return init.getResult()->get(kmeans::init::centroids);
89 }
90 
91 template <typename T>
92 const NumericTablePtr init_centroids(const NumericTablePtr& input_features_table,
93  const kmeans::init::Method& init_type,
94  const int32_t num_clusters) {
95  switch (init_type) {
96  case kmeans::init::Method::deterministicDense:
97  return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
98  input_features_table, num_clusters);
99  case kmeans::init::Method::randomDense:
100  return init_centroids_for_type<T, kmeans::init::Method::randomDense>(
101  input_features_table, num_clusters);
102  case kmeans::init::Method::plusPlusDense:
103  return init_centroids_for_type<T, kmeans::init::Method::plusPlusDense>(
104  input_features_table, num_clusters);
105  case kmeans::init::Method::parallelPlusDense:
106  return init_centroids_for_type<T, kmeans::init::Method::parallelPlusDense>(
107  input_features_table, num_clusters);
108  default: {
109  UNREACHABLE();
110  return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
111  input_features_table, num_clusters);
112  }
113  }
114 }
115 
116 template <typename T>
117 NEVER_INLINE HOST int32_t onedal_kmeans_impl(const std::vector<const T*>& input_features,
118  int32_t* output_clusters,
119  const int64_t num_rows,
120  const int num_clusters,
121  const int num_iterations,
122  const KMeansInitStrategy kmeans_init_type) {
123  try {
124  const auto features_table = prepare_data_table(input_features, num_rows);
125  const auto onedal_kmeans_init_type = get_kmeans_init_type(kmeans_init_type);
126  const auto centroids =
127  init_centroids<T>(features_table, onedal_kmeans_init_type, num_clusters);
128  const auto assignments_table =
129  HomogenNumericTable<int32_t>::create(output_clusters, 1, num_rows);
130  const kmeans::ResultPtr result(new kmeans::Result);
131  result->set(kmeans::assignments, assignments_table);
132  result->set(kmeans::objectiveFunction,
133  HomogenNumericTable<T>::create(1, 1, NumericTable::doAllocate));
134  result->set(kmeans::nIterations,
135  HomogenNumericTable<int>::create(1, 1, NumericTable::doAllocate));
136  kmeans::Batch<> algorithm(num_clusters, num_iterations);
137  algorithm.input.set(kmeans::data, features_table);
138  algorithm.input.set(kmeans::inputCentroids, centroids);
139  algorithm.parameter().resultsToEvaluate = kmeans::computeAssignments;
140  algorithm.setResult(result);
141  algorithm.compute();
142  } catch (std::exception& e) {
143  throw std::runtime_error(e.what());
144  }
145  return num_rows;
146 }
147 
148 template <typename T>
149 NEVER_INLINE HOST int32_t onedal_dbscan_impl(const std::vector<const T*>& input_features,
150  int32_t* output_clusters,
151  const int64_t num_rows,
152  const double epsilon,
153  const int32_t min_observations) {
154  try {
155  const auto features_table = prepare_data_table(input_features, num_rows);
156  const auto assignments_table =
157  HomogenNumericTable<int32_t>::create(output_clusters, 1, num_rows);
158  const dbscan::ResultPtr result(new dbscan::Result);
159  result->set(dbscan::assignments, assignments_table);
160  result->set(dbscan::nClusters,
161  HomogenNumericTable<int>::create(1, 1, NumericTable::doAllocate));
162  dbscan::Batch<> algorithm(epsilon, min_observations);
163  algorithm.input.set(dbscan::data, features_table);
164  algorithm.parameter().resultsToCompute = dbscan::assignments;
165  algorithm.setResult(result);
166  algorithm.compute();
167  } catch (std::exception& e) {
168  throw std::runtime_error(e.what());
169  }
170  return num_rows;
171 }
172 
173 template <typename T>
174 int32_t extract_model_coefs(const NumericTablePtr& coefs_table,
175  int32_t* coef_idxs,
176  T* coefs) {
177  const int64_t num_coefs = coefs_table->getNumberOfColumns();
178  for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
179  coef_idxs[coef_idx] = coef_idx;
180  coefs[coef_idx] =
181  coefs_table->NumericTable::getValue<T>(coef_idx, static_cast<size_t>(0));
182  }
183  return num_coefs;
184 }
185 
186 template <typename T>
187 NEVER_INLINE HOST int32_t
188 onedal_linear_reg_fit_impl(const T* input_labels,
189  const std::vector<const T*>& input_features,
190  int32_t* output_coef_idxs,
191  T* output_coefs,
192  const int64_t num_rows) {
193  try {
194  const auto labels_table = prepare_data_table(input_labels, num_rows);
195  const auto features_table = prepare_data_table(input_features, num_rows);
196 
197  linear_regression::training::Batch<T, linear_regression::training::Method::qrDense>
198  algorithm;
199 
200  algorithm.input.set(linear_regression::training::data, features_table);
201  algorithm.input.set(linear_regression::training::dependentVariables, labels_table);
202 
203  algorithm.compute();
204  const auto training_result = algorithm.getResult();
205  const auto coefs_table =
206  training_result->get(linear_regression::training::model)->getBeta();
207  return extract_model_coefs<T>(coefs_table, output_coef_idxs, output_coefs);
208  } catch (std::exception& e) {
209  throw std::runtime_error(e.what());
210  }
211 }
212 
213 template <typename T>
214 NEVER_INLINE HOST linear_regression::ModelPtr build_linear_reg_model(
215  const T* model_coefs,
216  const int64_t num_coefs) {
217  // See comment at end of onedal_lin_reg_fit_impl
218  // We need to unpivot the model data back to the native
219  // format oneDal expects, with 1 column per beta
220  const auto betas_table = prepare_pivoted_data_table(model_coefs, num_coefs);
221  CHECK_EQ(betas_table->getNumberOfColumns(), num_coefs);
222 
223  // Create model builder with true intercept flag
224  linear_regression::ModelBuilder<T> model_builder(num_coefs - 1,
225  1 /* num_dependent_variables */);
226 
227  // Retrive pointer to the begining of betas_table
228  BlockDescriptor<T> block_result;
229 
230  // Use generic code for getting start and end iterators for betas table, even though we
231  // currently only support case of one dependent variable (i.e. 1 row in the betas table)
232  betas_table->getBlockOfRows(0, betas_table->getNumberOfRows(), readOnly, block_result);
233  size_t num_betas =
234  (betas_table->getNumberOfRows()) * (betas_table->getNumberOfColumns());
235 
236  // Initialize iterators for beta array with itrecepts
237  T* first_itr = block_result.getBlockPtr();
238  T* last_itr = first_itr + num_betas;
239  model_builder.setBeta(first_itr, last_itr);
240  betas_table->releaseBlockOfRows(block_result);
241 
242  return model_builder.getModel();
243 }
244 
245 template <typename T>
246 NEVER_INLINE HOST int32_t
247 onedal_linear_reg_predict_impl(const std::vector<const T*>& input_features,
248  T* output_predictions,
249  const int64_t num_rows,
250  const T* coefs) {
251  try {
252  const auto features_table = prepare_data_table(input_features, num_rows);
253  const auto model_ptr = build_linear_reg_model(coefs, input_features.size() + 1);
254 
255  linear_regression::prediction::Batch<> algorithm;
256  algorithm.input.set(linear_regression::prediction::data, features_table);
257  algorithm.input.set(linear_regression::prediction::model, model_ptr);
258 
259  const auto predictions_table =
260  HomogenNumericTable<T>::create(output_predictions, 1, num_rows);
261 
262  const linear_regression::prediction::ResultPtr result(
263  new linear_regression::prediction::Result);
264  result->set(linear_regression::prediction::prediction, predictions_table);
265  algorithm.setResult(result);
266  algorithm.compute();
267  return num_rows;
268  } catch (std::exception& e) {
269  throw std::runtime_error(e.what());
270  }
271 }
272 
273 #endif // #ifdef HAVE_ONEDAL
274 #endif // #ifndef __CUDACC__
#define CHECK_EQ(x, y)
Definition: Logger.h:301
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
#define UNREACHABLE()
Definition: Logger.h:337
std::pair< FILE *, std::string > create(const std::string &basePath, const int fileId, const size_t pageSize, const size_t numPages)
Definition: File.cpp:57
KMeansInitStrategy
#define HOST
void init(LogOptions const &log_opts)
Definition: Logger.cpp:360
#define NEVER_INLINE