OmniSciDB  ca0c39ec8f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsCommon.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 
21 #include <filesystem>
22 #include <mutex>
23 #include <shared_mutex>
24 #include <string>
25 #include <tuple>
26 #include <unordered_map>
27 #include <vector>
28 
30 
// Declaration only; the definition is not part of this listing.
// Returns a std::pair<T, T> for a Column<T>. Presumably (min, max) of the
// column's values -- TODO confirm against the definition, including how null
// entries are treated.
31 template <typename T>
32 NEVER_INLINE HOST std::pair<T, T> get_column_min_max(const Column<T>& col);
33 
// Non-template overload for Column<TextEncodingDict>; the result is a pair of
// int32_t rather than a pair of the column's element type.
34 NEVER_INLINE HOST std::pair<int32_t, int32_t> get_column_min_max(
35  const Column<TextEncodingDict>& col);
36 
// Declaration only. Takes a raw pointer plus a row count and returns a double.
// Presumably the arithmetic mean of data[0 .. num_rows) -- TODO confirm against
// the definition (null handling and behavior for num_rows == 0 are not visible
// here).
37 template <typename T>
38 NEVER_INLINE HOST double get_column_mean(const T* data, const int64_t num_rows);
39 
// Column<T> overload of the same name; presumably forwards the column's data to
// the pointer overload -- TODO confirm.
40 template <typename T>
41 NEVER_INLINE HOST double get_column_mean(const Column<T>& col);
42 
// Declaration only. Takes a column and a caller-supplied mean and returns a
// double. Presumably the standard deviation of the column around `mean` --
// TODO confirm whether it is the population or sample form.
43 template <typename T>
44 NEVER_INLINE HOST double get_column_std_dev(const Column<T>& col, const double mean);
45 
// Raw-pointer overload: same name, operating on data[0 .. num_rows) with a
// caller-supplied mean. Definition not visible in this listing.
46 template <typename T>
47 NEVER_INLINE HOST double get_column_std_dev(const T* data,
48  const int64_t num_rows,
49  const double mean);
50 
// Declaration only. Reads num_rows values from input_data and writes through
// output_data, using a caller-supplied mean and std_dev.
// Presumably writes (x - mean) / std_dev per element -- TODO confirm against
// the definition, including what happens when std_dev is 0.
51 // Assumes nulls have been removed
52 template <typename T>
53 void z_std_normalize_col(const T* input_data,
54  T* output_data,
55  const int64_t num_rows,
56  const double mean,
57  const double std_dev);
58 
// Declaration only. Takes one pointer per input column (each presumably
// addressing num_rows values -- TODO confirm) and returns one std::vector<T>
// per column. Presumably each output column is the z-score normalization of
// the matching input column; verify against the definition.
59 // Assumes nulls have been removed
60 template <typename T>
61 std::vector<std::vector<T>> z_std_normalize_data(const std::vector<T*>& input_data,
62  const int64_t num_rows);
63 
// Declaration only. Returns a std::tuple<T, T, bool> for a Column<T>.
// The meaning of the three fields is not shown in this listing; presumably
// (min, max, has_nulls) -- TODO confirm against the definition.
64 template <typename T>
65 NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata(const Column<T>& col);
66 
// Non-template overload for Column<TextEncodingDict>; the first two tuple
// fields are int32_t instead of the column's element type.
67 NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata(
68  const Column<TextEncodingDict>& col);
69 
// NOTE(review): listing line 71 is missing between the template header and the
// function name, so the return type and any attribute macros of this
// declaration are not visible here -- restore them from the real header.
//
// Takes a from-position (fromlon, fromlat) of type T1 and a to-position
// (tolon, tolat) of type T2. The symbol index at the end of this page
// describes a non-template distance_in_meters overload on doubles as computing
// the distance, in meters, between two WGS-84 positions; presumably this
// template does the same -- TODO confirm.
70 template <typename T1, typename T2>
72 distance_in_meters(const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat);
73 
// Flattens a 2-D bin coordinate into a single bin index.
//
// Bins are laid out row by row: every y row holds num_x_bins consecutive
// entries, so the flat index is y_bin * num_x_bins + x_bin.
//
// x_bin      - column of the bin within its row.
// y_bin      - row of the bin.
// num_x_bins - number of bins per row.
//
// No range checking is performed; the arithmetic is plain int64_t, so callers
// must keep y_bin * num_x_bins + x_bin within the int64_t range.
inline int64_t x_y_bin_to_bin_index(const int64_t x_bin,
                                    const int64_t y_bin,
                                    const int64_t num_x_bins) {
  return y_bin * num_x_bins + x_bin;
}
79 
// Splits a flat bin index into its (x_bin, y_bin) coordinate.
//
// The x component is bin % num_x_bins and the y component is
// bin / num_x_bins, i.e. rows of num_x_bins entries laid end to end.
//
// bin        - flat bin index.
// num_x_bins - number of bins per row; must be non-zero, since it is used as
//              the divisor of both % and / (division by zero is undefined).
//
// Returns {x_bin, y_bin} as a std::pair<int64_t, int64_t>.
inline std::pair<int64_t, int64_t> bin_to_x_y_bin_indexes(const int64_t bin,
                                                          const int64_t num_x_bins) {
  return std::make_pair(bin % num_x_bins, bin / num_x_bins);
}
84 
// Owning, fixed-size byte buffer.
//
// The constructor heap-allocates num_bytes bytes and the destructor releases
// them. The bytes are not initialized on construction.
struct CacheDataTf {
  int8_t* data_buffer;  // Owned array of num_bytes bytes; freed in the destructor.
  size_t num_bytes;     // Size of data_buffer in bytes.

  // Allocates an uninitialized buffer of num_bytes bytes.
  // Inside the initializer expressions `num_bytes` names the parameter.
  CacheDataTf(const size_t num_bytes)
      : data_buffer(new int8_t[num_bytes]), num_bytes(num_bytes) {}

  // The buffer is owned through a raw pointer, so a member-wise copy would
  // leave two objects calling delete[] on the same array. Copying is therefore
  // disabled (which also suppresses the implicit move operations); share an
  // instance through a smart pointer instead.
  CacheDataTf(const CacheDataTf&) = delete;
  CacheDataTf& operator=(const CacheDataTf&) = delete;

  ~CacheDataTf() { delete[] data_buffer; }
};
95 
// NOTE(review): listing line 96, which should open this class, is missing from
// this rendering, and so is listing line 123 near the end of the private
// section. The symbol index at the bottom of the page lists a
// `std::shared_mutex cache_mutex_` member, which is presumably what line 123
// declares -- TODO confirm against the real header.
//
// The members below describe a string-keyed cache whose entries are
// std::shared_ptr<CacheDataTf> byte buffers (see data_cache_). Only
// declarations are visible; the behavior notes are assumptions to verify
// against the definitions.
97  public:
 // Presumably true when `key` has an entry in data_cache_ -- TODO confirm.
98  bool isKeyCached(const std::string& key) const;
99 
 // Presumably true when `key` is cached and its stored buffer is exactly
 // num_bytes long -- TODO confirm.
100  bool isKeyCachedAndSameLength(const std::string& key, const size_t num_bytes) const;
101 
 // Presumably copies the cached bytes for `key` into dest_buffer -- TODO
 // confirm; behavior for a missing key is not visible here.
102  // Assumes dest_buffer is already appropriately sized
103  template <typename T>
104  void getDataForKey(const std::string& key, T* dest_buffer) const;
105 
 // Returns a const reference typed as T for `key`. Presumably it refers to
 // the cached bytes themselves, so its lifetime would be tied to the cache
 // entry -- TODO confirm.
106  template <typename T>
107  const T& getDataRefForKey(const std::string& key) const;
108 
 // Returns a const T* for `key`; presumably a pointer into the cached buffer
 // rather than a copy -- TODO confirm.
109  template <typename T>
110  const T* getDataPtrForKey(const std::string& key) const;
111 
 // Stores data for `key`. The size is given in elements of T, not bytes;
 // presumably num_elements * sizeof(T) bytes are copied into a new
 // CacheDataTf -- TODO confirm.
112  template <typename T>
113  void putDataForKey(const std::string& key,
114  T* const data_buffer,
115  const size_t num_elements);
116 
117  private:
 // 1 << 20 == 1,048,576. By its name, the byte count from which copyData
 // switches to a parallel copy -- TODO confirm.
118  const size_t parallel_copy_min_bytes{1 << 20};
119 
 // Presumably copies num_bytes bytes from source to dest -- TODO confirm.
120  void copyData(int8_t* dest, const int8_t* source, const size_t num_bytes) const;
121 
 // Cache storage: key -> shared ownership of a CacheDataTf byte buffer.
122  std::unordered_map<std::string, std::shared_ptr<CacheDataTf>> data_cache_;
124 };
125 
// String-keyed cache of std::shared_ptr<T> values (see data_cache_).
//
// NOTE(review): listing line 137 is missing from this rendering, between
// data_cache_ and the closing brace. The symbol index at the bottom of the
// page lists a `std::shared_mutex cache_mutex_` member, which is presumably
// what that line declares -- TODO confirm against the real header.
// Only declarations are visible here; method behavior is an assumption.
126 template <typename T>
127 class DataCache {
128  public:
 // Presumably true when `key` has an entry in data_cache_ -- TODO confirm.
129  bool isKeyCached(const std::string& key) const;
130 
 // Returns a shared_ptr<T> for `key`; what is returned for a missing key is
 // not visible in this listing.
131  std::shared_ptr<T> getDataForKey(const std::string& key) const;
132 
 // Presumably stores `data` under `key` -- TODO confirm whether an existing
 // entry is replaced.
133  void putDataForKey(const std::string& key, std::shared_ptr<T> const data);
134 
135  private:
 // Cache storage: key -> shared ownership of a T.
136  std::unordered_map<std::string, std::shared_ptr<T>> data_cache_;
138 };
139 
// Declaration only. Takes a path string and returns a vector of
// std::filesystem::path. By the parameter name, the argument may name either a
// single file or a directory; presumably a directory is expanded to the files
// it contains -- TODO confirm (recursion and glob handling are not visible
// here).
140 namespace FileUtilities {
141 std::vector<std::filesystem::path> get_fs_paths(const std::string& file_or_directory);
142 }
143 
// Unscoped enum used as the bounds_type parameter of is_valid_tf_input.
// Presumably Min marks bounds_val as a lower bound and Max as an upper bound
// -- TODO confirm against the definition of is_valid_tf_input.
144 enum BoundsType { Min, Max };
145 
147 
// NOTE(review): IntervalType is not declared anywhere in this listing; listing
// line 146, just above, is missing and presumably holds its declaration --
// TODO confirm against the real header.
//
// Declaration only. Presumably checks `input` against `bounds_val`, treating
// it as a lower or upper bound per bounds_type and as inclusive or exclusive
// per interval_type -- TODO confirm against the definition.
148 template <typename T>
149 NEVER_INLINE HOST bool is_valid_tf_input(const T input,
150  const T bounds_val,
151  const BoundsType bounds_type,
152  const IntervalType interval_type);
153 
154 #endif //__CUDACC__
bool isKeyCachedAndSameLength(const std::string &key, const size_t num_bytes) const
void copyData(int8_t *dest, const int8_t *source, const size_t num_bytes) const
bool isKeyCached(const std::string &key) const
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
std::vector< std::filesystem::path > get_fs_paths(const std::string &file_or_directory)
std::unordered_map< std::string, std::shared_ptr< T > > data_cache_
const T * getDataPtrForKey(const std::string &key) const
void putDataForKey(const std::string &key, T *const data_buffer, const size_t num_elements)
#define HOST
std::shared_ptr< T > getDataForKey(const std::string &key) const
bool isKeyCached(const std::string &key) const
int64_t x_y_bin_to_bin_index(const int64_t x_bin, const int64_t y_bin, const int64_t num_x_bins)
void putDataForKey(const std::string &key, std::shared_ptr< T > const data)
void getDataForKey(const std::string &key, T *dest_buffer) const
std::pair< int64_t, int64_t > bin_to_x_y_bin_indexes(const int64_t bin, const int64_t num_x_bins)
EXTENSION_NOINLINE double distance_in_meters(const double fromlon, const double fromlat, const double tolon, const double tolat)
Computes the distance, in meters, between two WGS-84 positions.
const size_t parallel_copy_min_bytes
#define NEVER_INLINE
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata(const Column< T > &col)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
std::shared_mutex cache_mutex_
CacheDataTf(const size_t num_bytes)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)
std::shared_timed_mutex shared_mutex
const T & getDataRefForKey(const std::string &key) const
std::shared_mutex cache_mutex_
NEVER_INLINE HOST bool is_valid_tf_input(const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
std::unordered_map< std::string, std::shared_ptr< CacheDataTf > > data_cache_
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)