OmniSciDB  6686921089
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifdef HAVE_MLPACK
20 
21 #include "../../QueryEngine/OmniSciTypes.h"
22 
23 #include <mlpack/methods/dbscan/dbscan.hpp>
24 #include <mlpack/methods/kmeans/kmeans.hpp>
25 
26 // clang-format off
27 /*
28  UDTF: dbscan__cpu_(Cursor<Column<int>, ColumnList<double>>, float, int, RowMultiplier) -> Column<int>, Column<int>
29 */
30 // clang-format on
31 EXTENSION_NOINLINE int32_t dbscan__cpu_(const Column<int>& input_ids,
32  const ColumnList<double>& cluster_features,
33  const float epsilon,
34  const int min_num_points,
35  const int output_size_multiplier,
36  Column<int>& output_ids,
37  Column<int>& output_clusters) {
38  if (epsilon <= 0.0) {
39  throw std::runtime_error("DBSCAN: epsilon must be positive");
40  }
41  if (min_num_points < 1) {
42  throw std::runtime_error("DBSCAN: min_num_points must be >= 1");
43  }
44  const int64_t num_rows = input_ids.getSize();
45 #ifndef __CUDACC__
46  const int64_t num_cluster_features = cluster_features.getCols();
47  arma::Mat<double> cluster_features_matrix(num_rows, num_cluster_features);
48  for (int64_t c = 0; c < num_cluster_features; ++c) {
49  memcpy(cluster_features_matrix.colptr(c),
50  cluster_features(c).ptr,
51  sizeof(double) * num_rows);
52  }
53  arma::Mat<double> cluster_features_matrix_transposed = cluster_features_matrix.t();
54  mlpack::dbscan::DBSCAN<> dbscan(epsilon, min_num_points);
55  arma::Row<size_t> cluster_assignments;
56  dbscan.Cluster(cluster_features_matrix_transposed, cluster_assignments);
57 
58  for (int64_t r = 0; r < num_rows; ++r) {
59  output_ids[r] = input_ids[r];
60  output_clusters[r] = cluster_assignments[r] == SIZE_MAX ? -1 : cluster_assignments[r];
61  }
62 #endif
63  return num_rows;
64 }
65 
66 // clang-format off
67 /*
68  UDTF: kmeans__cpu_(Cursor<Column<int>, ColumnList<double>>, int, RowMultiplier) -> Column<int>, Column<int>
69 */
70 // clang-format on
71 EXTENSION_NOINLINE int32_t kmeans__cpu_(const Column<int>& input_ids,
72  const ColumnList<double>& cluster_features,
73  const int num_clusters,
74  const int output_size_multiplier,
75  Column<int>& output_ids,
76  Column<int>& output_clusters) {
77  if (num_clusters <= 0) {
78  throw std::runtime_error("KMEANS: num_clusters must be positive integer");
79  }
80  const int64_t num_rows = input_ids.getSize();
81 #ifndef __CUDACC__
82  const int64_t num_cluster_features = cluster_features.getCols();
83  arma::Mat<double> cluster_features_matrix(num_rows, num_cluster_features);
84  for (int64_t c = 0; c < num_cluster_features; ++c) {
85  memcpy(cluster_features_matrix.colptr(c),
86  cluster_features(c).ptr,
87  sizeof(double) * num_rows);
88  }
89  arma::Mat<double> cluster_features_matrix_transposed = cluster_features_matrix.t();
90  mlpack::kmeans::KMeans<> kmeans;
91  arma::Row<size_t> cluster_assignments;
92  kmeans.Cluster(cluster_features_matrix_transposed,
93  static_cast<size_t>(num_clusters),
94  cluster_assignments);
95 
96  for (int64_t r = 0; r < num_rows; ++r) {
97  output_ids[r] = input_ids[r];
98  output_clusters[r] = cluster_assignments[r] == SIZE_MAX ? -1 : cluster_assignments[r];
99  }
100 #endif
101  return num_rows;
102 }
103 
104 #endif
#define SIZE_MAX
#define EXTENSION_NOINLINE
Definition: OmniSciTypes.h:28