OmniSciDB  6686921089
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Utilities.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef HAVE_SYSTEM_TFS
18 #ifndef __CUDACC__
19 
20 #include <cstring> // std::memcpy
21 #include <filesystem>
22 #include <memory>
23 #include <mutex>
24 #include <regex>
25 #include <shared_mutex>
26 #include <string>
27 #include <unordered_map>
28 
29 #include <tbb/parallel_for.h>
30 
31 #include "Utilities.h"
32 
33 #define NANOSECONDS_PER_SECOND 1000000000
34 
35 template <typename T>
36 TEMPLATE_NOINLINE std::pair<T, T> get_column_min_max(const Column<T>& col) {
37  T col_min = std::numeric_limits<T>::max();
38  T col_max = std::numeric_limits<T>::lowest();
39  const int64_t num_rows = col.size();
40  for (int64_t r = 0; r < num_rows; ++r) {
41  if (col[r] < col_min) {
42  col_min = col[r];
43  }
44  if (col[r] > col_max) {
45  col_max = col[r];
46  }
47  }
48  return std::make_pair(col_min, col_max);
49 }
50 
51 template <typename T1, typename T2>
53 distance_in_meters(const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat) {
54  T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
55  T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
56  T1 latitudeH = sin(latitudeArc * 0.5);
57  latitudeH *= latitudeH;
58  T1 lontitudeH = sin(longitudeArc * 0.5);
59  lontitudeH *= lontitudeH;
60  T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
61  cos(tolat * 0.017453292519943295769236907684886);
62  return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
63 }
64 
65 bool DataBufferCache::isKeyCached(const std::string& key) const {
66  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
67  return data_cache_.count(key) > 0;
68 }
69 
70 bool DataBufferCache::isKeyCachedAndSameLength(const std::string& key,
71  const size_t num_bytes) const {
72  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
73  const auto& cached_data_itr = data_cache_.find(key);
74  if (cached_data_itr == data_cache_.end()) {
75  return false;
76  }
77  return num_bytes == cached_data_itr->second->num_bytes;
78 }
79 
80 template <typename T>
81 void DataBufferCache::getDataForKey(const std::string& key, T* dest_buffer) const {
82  auto timer = DEBUG_TIMER(__func__);
83  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
84  const auto& cached_data_itr = data_cache_.find(key);
85  if (cached_data_itr == data_cache_.end()) {
86  const std::string error_msg = "Data for key " + key + " not found in cache.";
87  throw std::runtime_error(error_msg);
88  }
89  copyData(reinterpret_cast<int8_t*>(dest_buffer),
90  cached_data_itr->second->data_buffer,
91  cached_data_itr->second->num_bytes);
92 }
93 
94 template <typename T>
95 const T& DataBufferCache::getDataRefForKey(const std::string& key) const {
96  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
97  const auto& cached_data_itr = data_cache_.find(key);
98  if (cached_data_itr == data_cache_.end()) {
99  const std::string error_msg{"Data for key " + key + " not found in cache."};
100  throw std::runtime_error(error_msg);
101  }
102  return *reinterpret_cast<const T*>(cached_data_itr->second->data_buffer);
103 }
104 
105 template <typename T>
106 const T* DataBufferCache::getDataPtrForKey(const std::string& key) const {
107  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
108  const auto& cached_data_itr = data_cache_.find(key);
109  if (cached_data_itr == data_cache_.end()) {
110  return nullptr;
111  }
112  return reinterpret_cast<const T* const>(cached_data_itr->second->data_buffer);
113 }
114 
115 template <typename T>
116 void DataBufferCache::putDataForKey(const std::string& key,
117  T* const data_buffer,
118  const size_t num_elements) {
119  auto timer = DEBUG_TIMER(__func__);
120  const size_t num_bytes(num_elements * sizeof(T));
121  auto cache_data = std::make_shared<CacheData>(num_bytes);
122  copyData(cache_data->data_buffer, reinterpret_cast<int8_t*>(data_buffer), num_bytes);
123  std::unique_lock<std::shared_mutex> write_lock(cache_mutex_);
124  const auto& cached_data_itr = data_cache_.find(key);
125  if (data_cache_.find(key) != data_cache_.end()) {
126  const std::string warning_msg =
127  "Data for key " + key + " already exists in cache. Replacing.";
128  std::cout << warning_msg << std::endl;
129  cached_data_itr->second.reset();
130  cached_data_itr->second = cache_data;
131  return;
132  }
133  data_cache_.insert(std::make_pair(key, cache_data));
134 }
135 
136 void DataBufferCache::copyData(int8_t* dest,
137  const int8_t* source,
138  const size_t num_bytes) const {
139  if (num_bytes < parallel_copy_min_bytes) {
140  std::memcpy(dest, source, num_bytes);
141  return;
142  }
143  const size_t max_bytes_per_thread = parallel_copy_min_bytes;
144  const size_t num_threads =
145  (num_bytes + max_bytes_per_thread - 1) / max_bytes_per_thread;
147  tbb::blocked_range<size_t>(0, num_threads, 1),
148  [&](const tbb::blocked_range<size_t>& r) {
149  const size_t end_chunk_idx = r.end();
150  for (size_t chunk_idx = r.begin(); chunk_idx != end_chunk_idx; ++chunk_idx) {
151  const size_t start_byte = chunk_idx * max_bytes_per_thread;
152  const size_t length =
153  std::min(start_byte + max_bytes_per_thread, num_bytes) - start_byte;
154  std::memcpy(dest + start_byte, source + start_byte, length);
155  }
156  });
157 }
158 
159 /* Definitions for DataCache */
160 
161 template <typename T>
162 bool DataCache<T>::isKeyCached(const std::string& key) const {
163  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
164  return data_cache_.count(key) > 0;
165 }
166 
167 template <typename T>
168 std::shared_ptr<T> DataCache<T>::getDataForKey(const std::string& key) const {
169  std::shared_lock<std::shared_mutex> read_lock(cache_mutex_);
170  const auto& cached_data_itr = data_cache_.find(key);
171  if (cached_data_itr == data_cache_.end()) {
172  const std::string error_msg{"Data for key " + key + " not found in cache."};
173  throw std::runtime_error(error_msg);
174  }
175  return cached_data_itr->second;
176 }
177 
178 template <typename T>
179 void DataCache<T>::putDataForKey(const std::string& key, std::shared_ptr<T> const data) {
180  std::unique_lock<std::shared_mutex> write_lock(cache_mutex_);
181  const auto& cached_data_itr = data_cache_.find(key);
182  if (cached_data_itr != data_cache_.end()) {
183  const std::string warning_msg =
184  "Data for key " + key + " already exists in cache. Replacing.";
185  std::cout << warning_msg << std::endl;
186  cached_data_itr->second.reset();
187  cached_data_itr->second = data;
188  }
189  data_cache_.insert(std::make_pair(key, data));
190 }
191 
192 namespace FileUtilities {
193 
// Adapted from https://stackoverflow.com/a/65851545

// Converts a filename glob into a std::regex.
//
// glob:           pattern in which '?' matches any single character and '*'
//                 matches any run of characters; every other character is
//                 matched literally (bracket expressions are NOT supported,
//                 '[' and ']' are escaped like any other special character).
// case_sensitive: when false (the default) the regex is built with `icase`.
//
// The pattern is translated in one pass over the glob: regex metacharacters
// are backslash-escaped and the two wildcards are rewritten, so no character
// is ever escaped twice and both '{' and '}' are handled.
std::regex glob_to_regex(const std::string& glob, bool case_sensitive = false) {
  // Note It is possible to automate checking if filesystem is case sensitive or not (e.g.
  // by performing a test first time this function is ran)
  std::string regex_string;
  regex_string.reserve(glob.size() * 2);  // worst case: every char gets escaped
  for (const char c : glob) {
    switch (c) {
      // Wildcards -> regex equivalents.
      case '?':
        regex_string += '.';
        break;
      case '*':
        regex_string += ".*";
        break;
      // Regex special characters that must match literally.
      case '\\':
      case '^':
      case '.':
      case '$':
      case '|':
      case '(':
      case ')':
      case '{':
      case '}':
      case '[':
      case ']':
      case '+':
      case '/':
        regex_string += '\\';
        regex_string += c;
        break;
      default:
        regex_string += c;
        break;
    }
  }

  return std::regex(regex_string,
                    case_sensitive ? std::regex_constants::ECMAScript
                                   : (std::regex_constants::ECMAScript |
                                      std::regex_constants::icase));
}
222 
223 std::vector<std::filesystem::path> get_fs_paths(const std::string& file_or_directory) {
224  const std::filesystem::path file_or_directory_path(file_or_directory);
225  const auto file_status = std::filesystem::status(file_or_directory_path);
226 
227  std::vector<std::filesystem::path> fs_paths;
228  if (std::filesystem::is_regular_file(file_status)) {
229  fs_paths.emplace_back(file_or_directory_path);
230  return fs_paths;
231  } else if (std::filesystem::is_directory(file_status)) {
232  for (std::filesystem::directory_entry const& entry :
233  std::filesystem::directory_iterator(file_or_directory_path)) {
234  if (std::filesystem::is_regular_file(std::filesystem::status(entry))) {
235  fs_paths.emplace_back(entry.path());
236  }
237  }
238  return fs_paths;
239  } else {
240  const auto parent_path = file_or_directory_path.parent_path();
241  const auto parent_status = std::filesystem::status(parent_path);
242  if (std::filesystem::is_directory(parent_status)) {
243  const auto file_glob = file_or_directory_path.filename();
244  const std::regex glob_regex{glob_to_regex(file_glob.string(), false)};
245 
246  for (std::filesystem::directory_entry const& entry :
247  std::filesystem::directory_iterator(parent_path)) {
248  if (std::filesystem::is_regular_file(std::filesystem::status(entry))) {
249  const auto entry_filename = entry.path().filename().string();
250  if (std::regex_match(entry_filename, glob_regex)) {
251  fs_paths.emplace_back(entry.path());
252  }
253  }
254  }
255  return fs_paths;
256  }
257  }
258  return fs_paths;
259 }
260 
261 } // namespace FileUtilities
262 
263 template <typename T>
264 bool is_valid_tf_input(const T input,
265  const T bounds_val,
266  const BoundsType bounds_type,
267  const IntervalType interval_type) {
268  switch (bounds_type) {
269  case BoundsType::Min:
270  switch (interval_type) {
271  case IntervalType::Inclusive:
272  return input >= bounds_val;
273  case IntervalType::Exclusive:
274  return input > bounds_val;
275  default:
276  UNREACHABLE();
277  }
278  case BoundsType::Max:
279  switch (interval_type) {
280  case IntervalType::Inclusive:
281  return input <= bounds_val;
282  case IntervalType::Exclusive:
283  return input < bounds_val;
284  default:
285  UNREACHABLE();
286  }
287  break;
288  default:
289  UNREACHABLE();
290  }
291  UNREACHABLE();
292  return false; // To address compiler warning
293 }
294 
295 #endif // __CUDACC__
296 #endif // HAVE_SYSTEM_TFS
#define const
DEVICE int64_t size() const
Definition: OmniSciTypes.h:218
#define TEMPLATE_NOINLINE
Definition: OmniSciTypes.h:30
#define UNREACHABLE()
Definition: Logger.h:253
std::vector< std::string > glob(const std::string &pattern)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
EXTENSION_NOINLINE double distance_in_meters(const double fromlon, const double fromlat, const double tolon, const double tolat)
Computes the distance, in meters, between two WGS-84 positions.
mapd_shared_lock< mapd_shared_mutex > read_lock
#define DEBUG_TIMER(name)
Definition: Logger.h:352
mapd_unique_lock< mapd_shared_mutex > write_lock