OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLPackFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc., Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 #ifdef HAVE_MLPACK
21 #ifdef HAVE_TBB
22 
23 #include <tbb/parallel_for.h>
24 #include <mlpack/methods/dbscan/dbscan.hpp>
25 #include <mlpack/methods/kmeans/kmeans.hpp>
28 
29 using MatrixT = arma::Mat<double>;
30 
31 arma::Mat<double> create_input_matrix(const std::vector<const float*>& input_features,
32  const int64_t num_rows) {
33  const int64_t num_features = input_features.size();
34  arma::Mat<double> features_matrix(num_rows, num_features);
35  for (int64_t c = 0; c < num_features; ++c) {
36  double* matrix_col_ptr = features_matrix.colptr(c);
37  const float* input_features_col = input_features[c];
38  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
39  [&](const tbb::blocked_range<int64_t>& r) {
40  const int64_t start_idx = r.begin();
41  const int64_t end_idx = r.end();
42  for (int64_t r = start_idx; r < end_idx; ++r) {
43  matrix_col_ptr[r] = input_features_col[r];
44  }
45  });
46  }
47  return features_matrix;
48 }
49 
50 arma::Mat<double> create_input_matrix(const std::vector<const double*>& input_features,
51  const int64_t num_rows) {
52  const int64_t num_features = input_features.size();
53  arma::Mat<double> features_matrix(num_rows, num_features);
54  for (int64_t c = 0; c < num_features; ++c) {
55  memcpy(features_matrix.colptr(c), input_features[c], sizeof(double) * num_rows);
56  }
57  return features_matrix;
58 }
59 
60 void rewrite_cluster_id_nulls(const arma::Row<size_t>& cluster_assignments,
61  int32_t* output_clusters,
62  const int64_t num_rows) {
63  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
64  [&](const tbb::blocked_range<int64_t>& r) {
65  const int64_t start_idx = r.begin();
66  const int64_t end_idx = r.end();
67  for (int64_t r = start_idx; r < end_idx; ++r) {
68  output_clusters[r] = cluster_assignments[r] == SIZE_MAX
69  ? -1
70  : cluster_assignments[r];
71  }
72  });
73 }
74 
75 template <typename InitStrategy>
76 void run_kmeans(const MatrixT& input_features_matrix_transposed,
77  arma::Row<size_t>& cluster_assignments,
78  const int32_t num_clusters) {
79  mlpack::kmeans::KMeans<mlpack::metric::EuclideanDistance,
80  InitStrategy,
81  mlpack::kmeans::MaxVarianceNewCluster,
82  mlpack::kmeans::NaiveKMeans,
83  MatrixT>
84  kmeans;
85  kmeans.Cluster(input_features_matrix_transposed,
86  static_cast<size_t>(num_clusters),
87  cluster_assignments);
88 }
89 
90 template <typename T>
91 NEVER_INLINE HOST int32_t mlpack_kmeans_impl(const std::vector<const T*>& input_features,
92  int32_t* output_clusters,
93  const int64_t num_rows,
94  const int32_t num_clusters,
95  const int32_t num_iterations,
96  const KMeansInitStrategy init_type) {
97  try {
98  const auto input_features_matrix = create_input_matrix(input_features, num_rows);
99  const MatrixT input_features_matrix_transposed = input_features_matrix.t();
100  arma::Row<size_t> cluster_assignments;
101  switch (init_type) {
104  run_kmeans<mlpack::kmeans::SampleInitialization>(
105  input_features_matrix_transposed, cluster_assignments, num_clusters);
106  break;
107  }
108  // Note that PlusPlus strategy is unimplemented for version of MLPack pulled via apt
109  // in Ubuntu 20.04, but landed at the beginning of 2021 so will be available if we
110  // package it
111  // case KMeansInitStrategy::PLUS_PLUS: {
112  // //run_kmeans<mlpack::kmeans::KMeansPlusPlusInitialization>(input_features_matrix_transposed,
113  // cluster_assignments, num_clusters);
114  // //break;
115  //}
116  default: {
117  throw std::runtime_error(
118  "MLPack KMeans initialization not implemented for given strategy.");
119  break;
120  }
121  }
122 
123  rewrite_cluster_id_nulls(cluster_assignments, output_clusters, num_rows);
124  } catch (std::exception& e) {
125  throw std::runtime_error(e.what());
126  }
127  return num_rows;
128 }
129 
130 template <typename T>
131 NEVER_INLINE HOST int32_t mlpack_dbscan_impl(const std::vector<const T*>& input_features,
132  int32_t* output_clusters,
133  const int64_t num_rows,
134  const double epsilon,
135  const int32_t min_observations) {
136  try {
137  const auto input_features_matrix = create_input_matrix(input_features, num_rows);
138  const MatrixT input_features_matrix_transposed = input_features_matrix.t();
139 
140  mlpack::dbscan::DBSCAN<> dbscan(epsilon, min_observations);
141  arma::Row<size_t> cluster_assignments;
142  dbscan.Cluster(input_features_matrix_transposed, cluster_assignments);
143 
144  rewrite_cluster_id_nulls(cluster_assignments, output_clusters, num_rows);
145  } catch (std::exception& e) {
146  throw std::runtime_error(e.what());
147  }
148  return num_rows;
149 }
150 
151 template <typename T>
152 NEVER_INLINE HOST int32_t
153 mlpack_linear_reg_fit_impl(const T* input_labels,
154  const std::vector<const T*>& input_features,
155  int32_t* output_coef_idxs,
156  T* output_coefs,
157  const int64_t num_rows) {
158  try {
159  // Implement simple linear regression entirely in Armadillo
160  // to avoid overhead of mlpack copies and extra matrix inversion
161  arma::Mat<T> X(num_rows, input_features.size() + 1);
162  // Intercept
163  X.unsafe_col(0).fill(1);
164  // Now copy feature column pointers
165  const int64_t num_features = input_features.size();
166  for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
167  memcpy(
168  X.colptr(feature_idx + 1), input_features[feature_idx], sizeof(T) * num_rows);
169  }
170  const arma::Mat<T> Xt = X.t();
171  const arma::Mat<T> XtX = Xt * X; // XtX aka "Gram Matrix"
172  const arma::Col<T> Y(input_labels, num_rows);
173  const arma::Col<T> B_est = arma::solve(XtX, Xt * Y);
174  for (int64_t coef_idx = 0; coef_idx < num_features + 1; ++coef_idx) {
175  output_coef_idxs[coef_idx] = coef_idx;
176  output_coefs[coef_idx] = B_est[coef_idx];
177  }
178  return num_features + 1;
179  } catch (std::exception& e) {
180  throw std::runtime_error(e.what());
181  }
182 }
183 
184 template <typename T>
185 NEVER_INLINE HOST int32_t
186 mlpack_linear_reg_predict_impl(const std::vector<const T*>& input_features,
187  T* output_predictions,
188  const int64_t num_rows,
189  const T* coefs) {
190  try {
191  // Implement simple linear regression entirely in Armadillo
192  // to avoid overhead of mlpack copies and extra matrix inversion
193  const int64_t num_features = input_features.size();
194  const int64_t num_coefs = num_features + 1;
195  arma::Mat<T> X(num_rows, num_coefs);
196  X.unsafe_col(0).fill(1);
197  for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
198  memcpy(
199  X.colptr(feature_idx + 1), input_features[feature_idx], sizeof(T) * num_rows);
200  }
201  const arma::Col<T> B(coefs, num_coefs);
202  const arma::Col<T> Y_hat = X * B;
203  memcpy(output_predictions, Y_hat.colptr(0), sizeof(T) * num_rows);
204  return num_rows;
205  } catch (std::exception& e) {
206  throw std::runtime_error(e.what());
207  }
208 }
209 
210 #endif // #ifdef HAVE_TBB
211 #endif // #ifdef HAVE_MLPACK
212 #endif // #ifdef __CUDACC__
KMeansInitStrategy
#define HOST
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define NEVER_INLINE