OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsCommon.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 
21 #include <filesystem>
22 #include <mutex>
23 #include <shared_mutex>
24 #include <string>
25 #include <tuple>
26 #include <unordered_map>
27 #include <vector>
28 
30 
31 #ifdef HAVE_SYSTEM_TFS
32 
33 template <typename T>
34 NEVER_INLINE HOST std::pair<T, T> get_column_min_max(const Column<T>& col);
35 
36 NEVER_INLINE HOST std::pair<int32_t, int32_t> get_column_min_max(
37  const Column<TextEncodingDict>& col);
38 
39 #else // Define stubs to allow build to complete when ENABLE_SYSTEM_TFS=off.
40 
41 template <typename T>
42 std::pair<T, T> get_column_min_max(Column<T> const&) {
43  throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
44 }
45 
46 std::pair<int32_t, int32_t> get_column_min_max(Column<TextEncodingDict> const&) {
47  throw std::runtime_error("Table function called but built with ENABLE_SYSTEM_TFS=off.");
48 }
49 
50 #endif
51 
52 template <typename T>
53 NEVER_INLINE HOST double get_column_mean(const T* data, const int64_t num_rows);
54 
55 template <typename T>
56 NEVER_INLINE HOST double get_column_mean(const Column<T>& col);
57 
58 template <typename T>
59 NEVER_INLINE HOST double get_column_std_dev(const Column<T>& col, const double mean);
60 
61 template <typename T>
62 NEVER_INLINE HOST double get_column_std_dev(const T* data,
63  const int64_t num_rows,
64  const double mean);
65 
66 // Assumes nulls have been removed
67 template <typename T>
68 void z_std_normalize_col(const T* input_data,
69  T* output_data,
70  const int64_t num_rows,
71  const double mean,
72  const double std_dev);
73 
74 // Assumes nulls have been removed
75 template <typename T>
76 std::vector<std::vector<T>> z_std_normalize_data(const std::vector<T*>& input_data,
77  const int64_t num_rows);
78 
// Value bundle holding a set of normalized columns together with one mean and
// one standard deviation vector.
//
// The constructor copies each argument into the member of the same name; the
// members are public and can be read directly afterwards.
//
// NOTE(review): presumably means[i] and std_devs[i] describe
// normalized_data[i] — confirm against the code that fills this struct.
template <typename T>
struct ZStdNormalizationSummaryStats {
  // @param normalized_data  outer vector of per-column value vectors
  // @param means            mean values to store
  // @param std_devs         standard deviation values to store
  ZStdNormalizationSummaryStats(const std::vector<std::vector<T>>& normalized_data,
                                const std::vector<T>& means,
                                const std::vector<T>& std_devs)
      : normalized_data(normalized_data), means(means), std_devs(std_devs) {}

  std::vector<std::vector<T>> normalized_data;
  std::vector<T> means;
  std::vector<T> std_devs;
};
90 
91 template <typename T>
93  const std::vector<T*>& input_data,
94  const int64_t num_rows);
95 
96 template <typename T>
97 NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata(const Column<T>& col);
98 
99 NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata(
100  const Column<TextEncodingDict>& col);
101 
102 template <typename T1, typename T2>
104 distance_in_meters(const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat);
105 
// Flattens a two-dimensional bin coordinate into a single bin index.
//
// Each y_bin step advances the index by num_x_bins and each x_bin step by one,
// so the result is y_bin * num_x_bins + x_bin. No range checking is done.
//
// @param x_bin       bin position along x
// @param y_bin       bin position along y
// @param num_x_bins  number of bins along x
// @return the flattened bin index
inline int64_t x_y_bin_to_bin_index(const int64_t x_bin,
                                    const int64_t y_bin,
                                    const int64_t num_x_bins) {
  const int64_t row_start = y_bin * num_x_bins;
  return row_start + x_bin;
}
111 
// Splits a flattened bin index into its (x_bin, y_bin) pair.
//
// The x component is bin % num_x_bins and the y component is
// bin / num_x_bins (integer division). num_x_bins is the divisor of both
// operations, so it must not be zero.
//
// @param bin         flattened bin index
// @param num_x_bins  number of bins along x
// @return pair with .first = x bin and .second = y bin
inline std::pair<int64_t, int64_t> bin_to_x_y_bin_indexes(const int64_t bin,
                                                          const int64_t num_x_bins) {
  const int64_t x_bin = bin % num_x_bins;
  const int64_t y_bin = bin / num_x_bins;
  return {x_bin, y_bin};
}
116 
117 namespace FileUtilities {
118 std::vector<std::filesystem::path> get_fs_paths(const std::string& file_or_directory);
119 }
120 
// Tag passed to is_valid_tf_input together with a bounds value.
// Unscoped enum: Min and Max are visible in the enclosing scope and take the
// default enumerator values 0 and 1.
enum BoundsType {
  Min,
  Max,
};
122 
124 
125 template <typename T>
126 NEVER_INLINE HOST bool is_valid_tf_input(const T input,
127  const T bounds_val,
128  const BoundsType bounds_type,
129  const IntervalType interval_type);
130 
131 #endif //__CUDACC__
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
std::vector< std::filesystem::path > get_fs_paths(const std::string &file_or_directory)
std::vector< std::vector< T > > normalized_data
#define HOST
ZStdNormalizationSummaryStats(const std::vector< std::vector< T >> &normalized_data, const std::vector< T > &means, const std::vector< T > &std_devs)
int64_t x_y_bin_to_bin_index(const int64_t x_bin, const int64_t y_bin, const int64_t num_x_bins)
std::pair< int64_t, int64_t > bin_to_x_y_bin_indexes(const int64_t bin, const int64_t num_x_bins)
EXTENSION_NOINLINE double distance_in_meters(const double fromlon, const double fromlat, const double tolon, const double tolat)
Computes the distance, in meters, between two WGS-84 positions.
#define NEVER_INLINE
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata(const Column< T > &col)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST bool is_valid_tf_input(const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)