OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SimilarityTableFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 #ifdef HAVE_TBB
21 
23 
24 using namespace TableFunctions_Namespace;
25 
26 template <typename U, typename K, typename S>
27 int64_t write_cos_sim(const std::vector<S>& similarity_vector,
28  const std::vector<U>& key_map,
29  const ColumnMetadata& primary_key_metadata,
30  Column<K>& output_key_col,
31  Column<S>& output_similarity) {
32  const uint64_t num_rows = key_map.size();
33  set_output_row_size(num_rows);
34 
35  for (U c = 0; c != key_map.size(); ++c) {
36  output_key_col[c] = primary_key_metadata.map_to_uncompressed_range(key_map[c]);
37  output_similarity[c] = similarity_vector[c];
38  }
39  return num_rows;
40 }
41 
42 template <typename U, typename K, typename S>
43 int64_t write_cos_sim(const DenseMatrix<S>& similarity_matrix,
44  const std::vector<U>& key_map,
45  const ColumnMetadata& primary_key_metadata,
46  Column<K>& output_key_col_1,
47  Column<K>& output_key_col_2,
48  Column<S>& output_similarity) {
49  const uint64_t num_rows =
50  similarity_matrix.num_cols * (similarity_matrix.num_cols + 1) / 2; // rows of a
51 
52  set_output_row_size(num_rows);
53 
54  uint64_t output_idx = 0;
55  for (U c = 0; c != similarity_matrix.num_cols; ++c) {
56  const U uncompressed_col_key =
57  primary_key_metadata.map_to_uncompressed_range(key_map[c]);
58  const U max_row = c + 1;
59  for (U r = 0; r != max_row; ++r) {
60  const U uncompressed_row_key =
61  primary_key_metadata.map_to_uncompressed_range(key_map[r]);
62  output_key_col_1[output_idx] = uncompressed_row_key;
63  output_key_col_2[output_idx] = uncompressed_col_key;
64  output_similarity[output_idx] = similarity_matrix.get(r, c);
65  output_idx++;
66  }
67  }
68  return num_rows;
69 }
70 
71 template <typename K, typename F, typename M, typename U, typename S>
72 int64_t similarity_vector_impl(const Column<K>& matrix_primary_key,
73  const ColumnList<F>& matrix_pivot_features,
74  const Column<M>& metric,
75  const CompositeKeyMetadata& matrix_primary_key_metadata,
76  const CompositeKeyMetadata& matrix_pivot_features_metadata,
77  const ColumnList<F>& vector_pivot_features,
78  const Column<M>& vector_metric,
79  const CompositeKeyMetadata& vector_pivot_features_metadata,
80  Column<K>& output_primary_key,
81  Column<S>& output_similarity,
82  const bool normalize_by_idf) {
83  CompositeKeyMetadata unioned_pivot_features_metadata = unionCompositeKeyMetadata(
84  matrix_pivot_features_metadata, vector_pivot_features_metadata);
85 
86  // Need to override unioned metadata with the null sentinel for each the matrix and
87  // vector pivot columns, as those are input dependant
88  copyCompositeKeyMetadataNulls(unioned_pivot_features_metadata,
89  matrix_pivot_features_metadata);
90 
91  SparseMatrixCsc<U, S> sparse_matrix_csc =
92  pivot_table_to_sparse_csc_matrix<K, F, M, U, S>(matrix_primary_key,
93  matrix_pivot_features,
94  metric,
95  matrix_primary_key_metadata,
96  unioned_pivot_features_metadata);
97  copyCompositeKeyMetadataNulls(unioned_pivot_features_metadata,
98  vector_pivot_features_metadata);
99 
100  SparseVector<U, S> sparse_vector = pivot_table_to_sparse_vector<F, M, U, S>(
101  vector_pivot_features, vector_metric, unioned_pivot_features_metadata);
102 
103  if (normalize_by_idf) {
104  const std::vector<double> idf_vec = idf_normalize(
105  sparse_matrix_csc, static_cast<U>(unioned_pivot_features_metadata.num_keys));
106  const size_t sparse_vec_size = sparse_vector.data.size();
107  for (size_t r = 0; r < sparse_vec_size; ++r) {
108  sparse_vector.data[r] *= idf_vec[sparse_vector.row_indices[r]];
109  }
110  }
111 
112  const std::vector<S> similarity_vector =
113  multiply_matrix_by_vector(sparse_matrix_csc, sparse_vector, true);
114 
115  const int64_t num_rows =
116  write_cos_sim(similarity_vector,
117  sparse_matrix_csc.col_values,
118  matrix_primary_key_metadata.keys_metadata[0].column_metadata,
119  output_primary_key,
120  output_similarity);
121 
122  return num_rows;
123 }
124 
125 template <typename K, typename F, typename M, typename U, typename S>
126 int64_t similarity_impl(const Column<K>& primary_key,
127  const ColumnList<F>& pivot_features,
128  const Column<M>& metric,
129  const CompositeKeyMetadata& primary_key_metadata,
130  const CompositeKeyMetadata& pivot_features_metadata,
131  Column<K>& output_primary_key_1,
132  Column<K>& output_primary_key_2,
133  Column<S>& output_similarity,
134  const bool normalize_by_idf) {
135  SparseMatrixCsc<U, S> sparse_matrix_csc =
136  pivot_table_to_sparse_csc_matrix<K, F, M, U, S>(primary_key,
137  pivot_features,
138  metric,
139  primary_key_metadata,
140  pivot_features_metadata);
141 
142  if (normalize_by_idf) {
143  idf_normalize(sparse_matrix_csc, static_cast<U>(pivot_features_metadata.num_keys));
144  }
145 
146  const DenseMatrix<S> similarity_matrix =
147  multiply_matrix_by_transpose(sparse_matrix_csc, true);
148 
149  const int64_t num_rows =
150  write_cos_sim(similarity_matrix,
151  sparse_matrix_csc.col_values,
152  primary_key_metadata.keys_metadata[0].column_metadata,
153  output_primary_key_1,
154  output_primary_key_2,
155  output_similarity);
156 
157  return num_rows;
158 }
159 
160 // clang-format off
161 /*
162  UDTF: tf_feature_similarity__cpu_template(Cursor<Column<K> primary_key,
163  ColumnList<F> pivot_features, Column<M> metric> primary_features,
164  Cursor<ColumnList<F> comparison_pivot_features, Column<M> comparison_metric> comparison_features,
165  bool use_tf_idf | default=false) -> Column<K> class | input_id=args<0>, Column<float> similarity_score, K=[int64_t, TextEncodingDict], F=[int64_t], M=[int64_t, double]
166 */
167 // clang-format on
168 
169 template <typename K, typename F, typename M>
170 int64_t tf_feature_similarity__cpu_template(
171  const Column<K>& primary_key,
172  const ColumnList<F>& pivot_features,
173  const Column<M>& metric,
174  const ColumnList<F>& comparison_pivot_features,
175  const Column<M>& comparison_metric,
176  const bool use_tf_idf,
177  Column<K>& output_primary_key,
178  Column<float>& output_similarity) {
179  if (pivot_features.numCols() != comparison_pivot_features.numCols()) {
180  std::cout << "Error: Pivot features must have the same number of keys." << std::endl;
182  return 0;
183  }
184 
185  const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
186  const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
187  const auto comparison_pivot_features_metadata =
188  getCompositeKeyMetadata(comparison_pivot_features);
189 
190  // todo: should extend by comparison_pivot_features
191  const uint64_t max_dimension_range =
192  std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
193 
194  if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
195  return similarity_vector_impl<K, F, M, uint64_t, float>(
196  primary_key,
197  pivot_features,
198  metric,
199  primary_key_metadata,
200  pivot_features_metadata,
201  comparison_pivot_features,
202  comparison_metric,
203  comparison_pivot_features_metadata,
204  output_primary_key,
205  output_similarity,
206  use_tf_idf);
207 
208  } else {
209  return similarity_vector_impl<K, F, M, uint32_t, float>(
210  primary_key,
211  pivot_features,
212  metric,
213  primary_key_metadata,
214  pivot_features_metadata,
215  comparison_pivot_features,
216  comparison_metric,
217  comparison_pivot_features_metadata,
218  output_primary_key,
219  output_similarity,
220  use_tf_idf);
221  }
222 }
223 
224 // clang-format off
225 /*
226  UDTF: tf_feature_similarity__cpu_template(Cursor<Column<K> primary_key, ColumnList<TextEncodingDict> pivot_features,
227  Column<M> metric> primary_features, Cursor<ColumnList<TextEncodingDict> comparison_pivot_features,
228  Column<M> comparison_metric> comparison_features, bool use_tf_idf | default=false) ->
229  Column<K> class | input_id=args<0>, Column<float> similarity_score, K=[int64_t, TextEncodingDict], M=[int64_t, double]
230 */
231 // clang-format on
232 
233 template <typename K, typename M>
234 int64_t tf_feature_similarity__cpu_template(
235  const Column<K>& primary_key,
236  const ColumnList<TextEncodingDict>& pivot_features,
237  const Column<M>& metric,
238  const ColumnList<TextEncodingDict>& comparison_pivot_features,
239  const Column<M>& comparison_metric,
240  const bool use_tf_idf,
241  Column<K>& output_primary_key,
242  Column<float>& output_similarity) {
243  if (pivot_features.numCols() != comparison_pivot_features.numCols()) {
244  std::cout << "Error: Pivot features must have the same number of keys." << std::endl;
246  return 0;
247  }
248 
249  const int64_t num_feature_cols = pivot_features.numCols();
250  const int64_t num_comparison_rows = comparison_pivot_features.size();
251  std::vector<int8_t*> new_col_ptrs;
252  std::vector<StringDictionaryProxy*> new_sdp_ptrs;
253  std::vector<std::vector<int32_t>> translated_col_ids(num_feature_cols);
254  for (int64_t col_idx = 0; col_idx < num_feature_cols; ++col_idx) {
255  const auto primary_sdp = pivot_features.string_dict_proxies_[col_idx];
256  const auto& primary_sdp_string_dict_id = primary_sdp->getDictKey();
257  const auto comparison_sdp = comparison_pivot_features.string_dict_proxies_[col_idx];
258  const auto& comparison_string_dict_id = comparison_sdp->getDictKey();
259  if (primary_sdp_string_dict_id != comparison_string_dict_id) {
260  const auto translation_map =
261  comparison_sdp->buildIntersectionTranslationMapToOtherProxy(primary_sdp, {});
262  translated_col_ids[col_idx].resize(num_comparison_rows);
263  int32_t* translated_ids = translated_col_ids[col_idx].data();
264  const auto source_col_ptr =
265  reinterpret_cast<const int32_t*>(comparison_pivot_features.ptrs_[col_idx]);
266  for (int64_t row_idx = 0; row_idx < num_comparison_rows; ++row_idx) {
267  const auto source_id = source_col_ptr[row_idx];
268  const auto translated_id =
269  source_id != inline_null_value<int32_t>() ? translation_map[source_id] : -1;
270  translated_ids[row_idx] =
271  translated_id == -1 ? inline_null_value<int32_t>() : translated_id;
272  }
273  new_col_ptrs.emplace_back(reinterpret_cast<int8_t*>(translated_ids));
274  new_sdp_ptrs.emplace_back(primary_sdp);
275  } else {
276  new_col_ptrs.emplace_back(comparison_pivot_features.ptrs_[col_idx]);
277  new_sdp_ptrs.emplace_back(comparison_sdp);
278  }
279  }
280  ColumnList<TextEncodingDict> translated_comparison_pivot_features(
281  new_col_ptrs.data(), num_feature_cols, num_comparison_rows, new_sdp_ptrs.data());
282 
283  const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
284  const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
285  const auto comparison_pivot_features_metadata =
286  getCompositeKeyMetadata(translated_comparison_pivot_features);
287 
288  // todo: should extend by comparison_pivot_features
289  const uint64_t max_dimension_range =
290  std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
291 
292  if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
293  return similarity_vector_impl<K, TextEncodingDict, M, uint64_t, float>(
294  primary_key,
295  pivot_features,
296  metric,
297  primary_key_metadata,
298  pivot_features_metadata,
299  translated_comparison_pivot_features,
300  comparison_metric,
301  comparison_pivot_features_metadata,
302  output_primary_key,
303  output_similarity,
304  use_tf_idf);
305 
306  } else {
307  return similarity_vector_impl<K, TextEncodingDict, M, uint32_t, float>(
308  primary_key,
309  pivot_features,
310  metric,
311  primary_key_metadata,
312  pivot_features_metadata,
313  translated_comparison_pivot_features,
314  comparison_metric,
315  comparison_pivot_features_metadata,
316  output_primary_key,
317  output_similarity,
318  use_tf_idf);
319  }
320 }
321 
322 // clang-format off
323 /*
324  UDTF: tf_feature_self_similarity__cpu_template(Cursor<Column<K> primary_key, ColumnList<F> pivot_features,
325  Column<M> metric> primary_features, bool use_tf_idf | default=false) -> Column<K> class1 | input_id=args<0>,
326  Column<K> class2 | input_id=args<0>, Column<float> similarity_score,
327  K=[int64_t, TextEncodingDict], F=[int64_t, TextEncodingDict], M=[int64_t, double]
328 */
329 // clang-format on
330 
331 template <typename K, typename F, typename M>
332 int64_t tf_feature_self_similarity__cpu_template(const Column<K>& primary_key,
333  const ColumnList<F>& pivot_features,
334  const Column<M>& metric,
335  const bool use_tf_idf,
336  Column<K>& output_primary_key_1,
337  Column<K>& output_primary_key_2,
338  Column<float>& output_similarity) {
339  const auto primary_key_metadata = getCompositeKeyMetadata(primary_key);
340  const auto pivot_features_metadata = getCompositeKeyMetadata(pivot_features);
341 
342  const uint64_t max_dimension_range =
343  std::max(primary_key_metadata.num_keys, pivot_features_metadata.num_keys);
344  if (max_dimension_range > std::numeric_limits<uint32_t>::max()) {
345  return similarity_impl<K, F, M, uint64_t, float>(primary_key,
346  pivot_features,
347  metric,
348  primary_key_metadata,
349  pivot_features_metadata,
350  output_primary_key_1,
351  output_primary_key_2,
352  output_similarity,
353  use_tf_idf);
354 
355  } else {
356  return similarity_impl<K, F, M, uint32_t, float>(primary_key,
357  pivot_features,
358  metric,
359  primary_key_metadata,
360  pivot_features_metadata,
361  output_primary_key_1,
362  output_primary_key_2,
363  output_similarity,
364  use_tf_idf);
365  }
366 }
367 
368 #endif // #ifdef HAVE_TBB
369 #endif // #ifndef __CUDACC__
EXTENSION_NOINLINE_HOST void set_output_row_size(int64_t num_rows)
DEVICE int64_t numCols() const
DEVICE int64_t size() const
StringDictionaryProxy ** string_dict_proxies_
DEVICE int64_t numCols() const
const shared::StringDictKey & getDictKey() const noexcept