OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsCommon.cpp File Reference
#include "TableFunctionsCommon.h"
#include <cstring>
#include <filesystem>
#include <memory>
#include <mutex>
#include <regex>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>
+ Include dependency graph for TableFunctionsCommon.cpp:

Go to the source code of this file.

Namespaces

 FileUtilities
 

Macros

#define NANOSECONDS_PER_SECOND   1000000000
 

Functions

template<typename T >
NEVER_INLINE HOST std::pair< T, T > get_column_min_max (const Column< T > &col)
 
template NEVER_INLINE HOST
std::pair< int8_t, int8_t > 
get_column_min_max (const Column< int8_t > &col)
 
template NEVER_INLINE HOST
std::pair< int16_t, int16_t > 
get_column_min_max (const Column< int16_t > &col)
 
template NEVER_INLINE HOST
std::pair< int32_t, int32_t > 
get_column_min_max (const Column< int32_t > &col)
 
template NEVER_INLINE HOST
std::pair< int64_t, int64_t > 
get_column_min_max (const Column< int64_t > &col)
 
template NEVER_INLINE HOST
std::pair< float, float > 
get_column_min_max (const Column< float > &col)
 
template NEVER_INLINE HOST
std::pair< double, double > 
get_column_min_max (const Column< double > &col)
 
std::pair< int32_t, int32_t > get_column_min_max (const Column< TextEncodingDict > &col)
 
template<typename T >
NEVER_INLINE HOST double get_column_mean (const Column< T > &col)
 
template NEVER_INLINE HOST double get_column_mean (const Column< int32_t > &col)
 
template NEVER_INLINE HOST double get_column_mean (const Column< int64_t > &col)
 
template NEVER_INLINE HOST double get_column_mean (const Column< float > &col)
 
template NEVER_INLINE HOST double get_column_mean (const Column< double > &col)
 
template<typename T >
NEVER_INLINE HOST double get_column_mean (const T *data, const int64_t num_rows)
 
template NEVER_INLINE HOST double get_column_mean (const int32_t *data, const int64_t num_rows)
 
template NEVER_INLINE HOST double get_column_mean (const int64_t *data, const int64_t num_rows)
 
template NEVER_INLINE HOST double get_column_mean (const float *data, const int64_t num_rows)
 
template NEVER_INLINE HOST double get_column_mean (const double *data, const int64_t num_rows)
 
template<typename T >
NEVER_INLINE HOST double get_column_std_dev (const Column< T > &col, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const Column< int32_t > &col, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const Column< int64_t > &col, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const Column< float > &col, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const Column< double > &col, const double mean)
 
template<typename T >
NEVER_INLINE HOST double get_column_std_dev (const T *data, const int64_t num_rows, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const int32_t *data, const int64_t num_rows, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const int64_t *data, const int64_t num_rows, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const float *data, const int64_t num_rows, const double mean)
 
template NEVER_INLINE HOST double get_column_std_dev (const double *data, const int64_t num_rows, const double mean)
 
template<typename T >
NEVER_INLINE HOST std::tuple
< T, T, bool > 
get_column_metadata (const Column< T > &col)
 
template NEVER_INLINE HOST
std::tuple< int8_t, int8_t,
bool > 
get_column_metadata (const Column< int8_t > &col)
 
template NEVER_INLINE HOST
std::tuple< int16_t, int16_t,
bool > 
get_column_metadata (const Column< int16_t > &col)
 
template NEVER_INLINE HOST
std::tuple< int32_t, int32_t,
bool > 
get_column_metadata (const Column< int32_t > &col)
 
template NEVER_INLINE HOST
std::tuple< int64_t, int64_t,
bool > 
get_column_metadata (const Column< int64_t > &col)
 
template NEVER_INLINE HOST
std::tuple< float, float, bool > 
get_column_metadata (const Column< float > &col)
 
template NEVER_INLINE HOST
std::tuple< double, double,
bool > 
get_column_metadata (const Column< double > &col)
 
std::tuple< int32_t, int32_t,
bool > 
get_column_metadata (const Column< TextEncodingDict > &col)
 
template<typename T >
void z_std_normalize_col (const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
 
template void z_std_normalize_col (const float *input_data, float *output_data, const int64_t num_rows, const double mean, const double std_dev)
 
template void z_std_normalize_col (const double *input_data, double *output_data, const int64_t num_rows, const double mean, const double std_dev)
 
template<typename T >
std::vector< std::vector< T > > z_std_normalize_data (const std::vector< T * > &input_data, const int64_t num_rows)
 
template std::vector
< std::vector< float > > 
z_std_normalize_data (const std::vector< float * > &input_data, const int64_t num_rows)
 
template std::vector
< std::vector< double > > 
z_std_normalize_data (const std::vector< double * > &input_data, const int64_t num_rows)
 
template<typename T1 , typename T2 >
NEVER_INLINE HOST T1 distance_in_meters (const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat)
 
template NEVER_INLINE HOST float distance_in_meters (const float fromlon, const float fromlat, const float tolon, const float tolat)
 
template NEVER_INLINE HOST float distance_in_meters (const float fromlon, const float fromlat, const double tolon, const double tolat)
 
template NEVER_INLINE HOST double distance_in_meters (const double fromlon, const double fromlat, const float tolon, const float tolat)
 
template NEVER_INLINE HOST double distance_in_meters (const double fromlon, const double fromlat, const double tolon, const double tolat)
 Computes the distance, in meters, between two WGS-84 positions. More...
 
std::regex FileUtilities::glob_to_regex (const std::string &glob, bool case_sensitive=false)
 
std::vector
< std::filesystem::path > 
FileUtilities::get_fs_paths (const std::string &file_or_directory)
 
template<typename T >
NEVER_INLINE HOST bool is_valid_tf_input (const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 
template NEVER_INLINE HOST bool is_valid_tf_input (const int32_t input, const int32_t bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 
template NEVER_INLINE HOST bool is_valid_tf_input (const int64_t input, const int64_t bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 
template NEVER_INLINE HOST bool is_valid_tf_input (const float input, const float bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 
template NEVER_INLINE HOST bool is_valid_tf_input (const double input, const double bounds_val, const BoundsType bounds_type, const IntervalType interval_type)
 

Macro Definition Documentation

#define NANOSECONDS_PER_SECOND   1000000000

Definition at line 33 of file TableFunctionsCommon.cpp.

Function Documentation

template<typename T1 , typename T2 >
NEVER_INLINE HOST T1 distance_in_meters ( const T1  fromlon,
const T1  fromlat,
const T2  tolon,
const T2  tolat 
)

Definition at line 397 of file TableFunctionsCommon.cpp.

// Haversine distance between (fromlon, fromlat) and (tolon, tolat).
// All intermediates are stored as T1, so the float instantiations compute in float precision.
397  {
398  T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;  // constant is pi / 180: scales a degree difference to radians
399  T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
400  T1 latitudeH = sin(latitudeArc * 0.5);
401  latitudeH *= latitudeH;  // sin^2(dlat / 2)
402  T1 lontitudeH = sin(longitudeArc * 0.5);
403  lontitudeH *= lontitudeH;  // sin^2(dlon / 2)
404  T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
405  cos(tolat * 0.017453292519943295769236907684886);  // cos(lat1) * cos(lat2)
406  return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));  // radius constant times the central angle 2*asin(sqrt(h))
407 }
template NEVER_INLINE HOST float distance_in_meters ( const float  fromlon,
const float  fromlat,
const float  tolon,
const float  tolat 
)
template NEVER_INLINE HOST float distance_in_meters ( const float  fromlon,
const float  fromlat,
const double  tolon,
const double  tolat 
)
template NEVER_INLINE HOST double distance_in_meters ( const double  fromlon,
const double  fromlat,
const float  tolon,
const float  tolat 
)
template NEVER_INLINE HOST double distance_in_meters ( const double  fromlon,
const double  fromlat,
const double  tolon,
const double  tolat 
)

Computes the distance, in meters, between two WGS-84 positions.

The result is equal to EARTH_RADIUS_IN_METERS*ArcInRadians(from,to)

ArcInRadians is equal to Distance(from,to)/EARTH_RADIUS_IN_METERS = 2*asin(sqrt(h(d/EARTH_RADIUS_IN_METERS )))

where:

  • d is the distance in meters between 'from' and 'to' positions.
  • h is the haversine function: h(x) = sin²(x/2)

code attribution: http://blog.julien.cayzac.name/2008/10/arc-and-distance-between-two-points-on.html

The haversine formula gives: h(d/R) = h(from.lat-to.lat) + h(from.lon-to.lon)*cos(from.lat)*cos(to.lat)

See Also
http://en.wikipedia.org/wiki/Law_of_haversines

Definition at line 423 of file ExtensionFunctions.hpp.

Referenced by GeoRaster< T, Z >::calculate_bins_and_scales(), length_linestring(), ST_Distance_Point_LineString_Geodesic(), and ST_Distance_Point_Point_Geodesic().

// Haversine distance between (fromlon, fromlat) and (tolon, tolat); this listing computes in double.
426  {
427  double latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;  // constant is pi / 180: scales a degree difference to radians
428  double longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
429  double latitudeH = sin(latitudeArc * 0.5);
430  latitudeH *= latitudeH;  // sin^2(dlat / 2)
431  double lontitudeH = sin(longitudeArc * 0.5);
432  lontitudeH *= lontitudeH;  // sin^2(dlon / 2)
433  double tmp = cos(fromlat * 0.017453292519943295769236907684886) *
434  cos(tolat * 0.017453292519943295769236907684886);  // cos(lat1) * cos(lat2)
435  return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));  // radius constant times the central angle 2*asin(sqrt(h))
436 }

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST double get_column_mean ( const Column< T > &  col)

Definition at line 109 of file TableFunctionsCommon.cpp.

References get_column_mean(), Column< T >::ptr_, and Column< T >::size_.

Referenced by get_column_mean(), and z_std_normalize_data().

// Column overload: forwards the column's raw buffer and row count to the pointer-based overload.
109  {
110  return get_column_mean(col.ptr_, col.size_);
111 }
T * ptr_
Definition: heavydbTypes.h:454
NEVER_INLINE HOST double get_column_mean(const Column< T > &col)
int64_t size_
Definition: heavydbTypes.h:455

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template NEVER_INLINE HOST double get_column_mean ( const Column< int32_t > &  col)
template NEVER_INLINE HOST double get_column_mean ( const Column< int64_t > &  col)
template NEVER_INLINE HOST double get_column_mean ( const Column< float > &  col)
template NEVER_INLINE HOST double get_column_mean ( const Column< double > &  col)
template<typename T >
NEVER_INLINE HOST double get_column_mean ( const T *  data,
const int64_t  num_rows 
)

Definition at line 119 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

// Mean of the non-null entries of data[0, num_rows). Partial sums and counts are gathered
// per thread slot inside a size-limited TBB arena and then reduced serially.
// Returns 0 when no non-null value is found (including num_rows == 0).
119  {
120  // const int64_t num_rows = col.size();
121  const size_t max_thread_count = std::thread::hardware_concurrency();
122  const size_t max_inputs_per_thread = 200000;
123  const size_t num_threads = std::min(
124  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));  // ceil(num_rows / max_inputs_per_thread), capped at the hardware thread count
125 
// NOTE(review): hardware_concurrency() is permitted to return 0; num_threads and the
// per-thread vectors below would then be empty while the loop body still indexes them — confirm.
126  std::vector<double> local_col_sums(num_threads, 0.);
127  std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
128  tbb::task_arena limited_arena(num_threads);
129 
130  limited_arena.execute([&] {
131  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
132  [&](const tbb::blocked_range<int64_t>& r) {
133  const int64_t start_idx = r.begin();
134  const int64_t end_idx = r.end();
135  double local_col_sum = 0.;
136  int64_t local_col_non_null_count = 0;
137  for (int64_t r = start_idx; r < end_idx; ++r) {  // loop index `r` shadows the range parameter `r`
138  const T val = data[r];
139  if (val == inline_null_value<T>()) {  // skip entries equal to the inline null value for T
140  continue;
141  }
142  local_col_sum += data[r];
143  local_col_non_null_count++;
144  }
145  size_t thread_idx = tbb::this_task_arena::current_thread_index();  // slot of the calling thread in the arena
146  local_col_sums[thread_idx] += local_col_sum;  // += : the body can run for several sub-ranges on one slot
147  local_col_non_null_counts[thread_idx] += local_col_non_null_count;
148  });
149  });
150 
151  double col_sum = 0.0;
152  int64_t col_non_null_count = 0L;
153 
154  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {  // serial reduction of the per-slot partials
155  col_sum += local_col_sums[thread_idx];
156  col_non_null_count += local_col_non_null_counts[thread_idx];
157  }
158 
159  return col_non_null_count == 0 ? 0 : col_sum / col_non_null_count;  // avoid dividing by zero for empty / all-null input
160 }
const size_t max_inputs_per_thread
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

template NEVER_INLINE HOST double get_column_mean ( const int32_t *  data,
const int64_t  num_rows 
)
template NEVER_INLINE HOST double get_column_mean ( const int64_t *  data,
const int64_t  num_rows 
)
template NEVER_INLINE HOST double get_column_mean ( const float *  data,
const int64_t  num_rows 
)
template NEVER_INLINE HOST double get_column_mean ( const double *  data,
const int64_t  num_rows 
)
template<typename T >
NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata ( const Column< T > &  col)

Definition at line 254 of file TableFunctionsCommon.cpp.

References Column< T >::isNull(), max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by get_column_metadata().

// Returns {min, max, has_nulls} over the non-null rows of `col`.
// Per-thread-slot partial results are gathered inside a size-limited TBB arena and reduced serially.
// If no non-null row is seen (empty or all-null column) min stays numeric_limits<T>::max() and
// max stays numeric_limits<T>::lowest().
254  {
255  T col_min = std::numeric_limits<T>::max();
256  T col_max = std::numeric_limits<T>::lowest();
257  bool has_nulls = false;
258  const int64_t num_rows = col.size();
259  const size_t max_thread_count = std::thread::hardware_concurrency();
260  const size_t max_inputs_per_thread = 200000;
261  const size_t num_threads = std::min(
262  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));  // ceil(num_rows / max_inputs_per_thread), capped at the hardware thread count
263 
// NOTE(review): hardware_concurrency() is permitted to return 0; num_threads and the
// per-thread vectors below would then be empty while the loop body still indexes them — confirm.
264  std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
265  std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
266  std::vector<char> local_col_has_nulls(num_threads, false);  // char, not bool: vector<bool> packs bits, so concurrent writes to different indices would race
267  tbb::task_arena limited_arena(num_threads);
268 
269  limited_arena.execute([&] {
270  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
271  [&](const tbb::blocked_range<int64_t>& r) {
272  const int64_t start_idx = r.begin();
273  const int64_t end_idx = r.end();
274  T local_col_min = std::numeric_limits<T>::max();
275  T local_col_max = std::numeric_limits<T>::lowest();
276  bool local_has_nulls = false;
277  for (int64_t r = start_idx; r < end_idx; ++r) {  // loop index `r` shadows the range parameter `r`
278  if (col.isNull(r)) {  // nulls only set the flag; they never contribute to min/max
279  local_has_nulls = true;
280  continue;
281  }
282  if (col[r] < local_col_min) {
283  local_col_min = col[r];
284  }
285  if (col[r] > local_col_max) {
286  local_col_max = col[r];
287  }
288  }
289  const size_t thread_idx =
290  tbb::this_task_arena::current_thread_index();  // slot of the calling thread in the arena
291  if (local_has_nulls) {
292  local_col_has_nulls[thread_idx] = true;
293  }
294  if (local_col_min < local_col_mins[thread_idx]) {  // merge rather than overwrite: a slot can process several sub-ranges
295  local_col_mins[thread_idx] = local_col_min;
296  }
297  if (local_col_max > local_col_maxes[thread_idx]) {
298  local_col_maxes[thread_idx] = local_col_max;
299  }
300  });
301  });
302 
303  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {  // serial reduction of the per-slot partials
304  if (local_col_has_nulls[thread_idx]) {
305  has_nulls = true;
306  }
307  if (local_col_mins[thread_idx] < col_min) {
308  col_min = local_col_mins[thread_idx];
309  }
310  if (local_col_maxes[thread_idx] > col_max) {
311  col_max = local_col_maxes[thread_idx];
312  }
313  }
314  return {col_min, col_max, has_nulls};
315 }
DEVICE int64_t size() const
Definition: heavydbTypes.h:469
const size_t max_inputs_per_thread
DEVICE bool isNull(int64_t index) const
Definition: heavydbTypes.h:471
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template NEVER_INLINE HOST std::tuple<int8_t, int8_t, bool> get_column_metadata ( const Column< int8_t > &  col)
template NEVER_INLINE HOST std::tuple<int16_t, int16_t, bool> get_column_metadata ( const Column< int16_t > &  col)
template NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< int32_t > &  col)
template NEVER_INLINE HOST std::tuple<int64_t, int64_t, bool> get_column_metadata ( const Column< int64_t > &  col)
template NEVER_INLINE HOST std::tuple<float, float, bool> get_column_metadata ( const Column< float > &  col)
template NEVER_INLINE HOST std::tuple<double, double, bool> get_column_metadata ( const Column< double > &  col)
std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< TextEncodingDict > &  col)

Definition at line 330 of file TableFunctionsCommon.cpp.

References get_column_metadata(), Column< T >::ptr_, Column< TextEncodingDict >::ptr_, Column< T >::size_, and Column< TextEncodingDict >::size_.

// TextEncodingDict overload: views the column's buffer as int32_t values and reuses the
// templated implementation. The alias column only borrows ptr_ and size_ from `col`.
331  {
332  Column<int32_t> int_alias_col;
333  int_alias_col.ptr_ = reinterpret_cast<int32_t*>(col.ptr_);  // reinterpret the dictionary-encoded entries as int32_t
334  int_alias_col.size_ = col.size_;
335  return get_column_metadata(int_alias_col);
336 }
T * ptr_
Definition: heavydbTypes.h:454
TextEncodingDict * ptr_
Definition: heavydbTypes.h:502
int64_t size_
Definition: heavydbTypes.h:455
NEVER_INLINE HOST std::tuple< T, T, bool > get_column_metadata(const Column< T > &col)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST std::pair<T, T> get_column_min_max ( const Column< T > &  col)

Definition at line 36 of file TableFunctionsCommon.cpp.

References Column< T >::isNull(), max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by ct_union_pushdown_stats__cpu_template(), GeoRaster< T, Z >::GeoRaster(), get_column_min_max(), get_min_or_max(), and get_min_or_max_union().

// Returns {min, max} over the non-null rows of `col`.
// Per-thread-slot partial results are gathered inside a size-limited TBB arena and reduced serially.
// If no non-null row is seen (empty or all-null column) the result is
// {numeric_limits<T>::max(), numeric_limits<T>::lowest()}.
36  {
37  T col_min = std::numeric_limits<T>::max();
38  T col_max = std::numeric_limits<T>::lowest();
39  const int64_t num_rows = col.size();
40  const size_t max_thread_count = std::thread::hardware_concurrency();
41  const size_t max_inputs_per_thread = 200000;
42  const size_t num_threads = std::min(
43  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));  // ceil(num_rows / max_inputs_per_thread), capped at the hardware thread count
44 
// NOTE(review): hardware_concurrency() is permitted to return 0; num_threads and the
// per-thread vectors below would then be empty while the loop body still indexes them — confirm.
45  std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
46  std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
47  tbb::task_arena limited_arena(num_threads);
48 
49  limited_arena.execute([&] {
50  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
51  [&](const tbb::blocked_range<int64_t>& r) {
52  const int64_t start_idx = r.begin();
53  const int64_t end_idx = r.end();
54  T local_col_min = std::numeric_limits<T>::max();
55  T local_col_max = std::numeric_limits<T>::lowest();
56  for (int64_t r = start_idx; r < end_idx; ++r) {  // loop index `r` shadows the range parameter `r`
57  if (col.isNull(r)) {  // null rows do not contribute to min/max
58  continue;
59  }
60  if (col[r] < local_col_min) {
61  local_col_min = col[r];
62  }
63  if (col[r] > local_col_max) {
64  local_col_max = col[r];
65  }
66  }
67  size_t thread_idx = tbb::this_task_arena::current_thread_index();  // slot of the calling thread in the arena
68  if (local_col_min < local_col_mins[thread_idx]) {  // merge rather than overwrite: a slot can process several sub-ranges
69  local_col_mins[thread_idx] = local_col_min;
70  }
71  if (local_col_max > local_col_maxes[thread_idx]) {
72  local_col_maxes[thread_idx] = local_col_max;
73  }
74  });
75  });
76 
77  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {  // serial reduction of the per-slot partials
78  if (local_col_mins[thread_idx] < col_min) {
79  col_min = local_col_mins[thread_idx];
80  }
81  if (local_col_maxes[thread_idx] > col_max) {
82  col_max = local_col_maxes[thread_idx];
83  }
84  }
85  return std::make_pair(col_min, col_max);
86 }
DEVICE int64_t size() const
Definition: heavydbTypes.h:469
const size_t max_inputs_per_thread
DEVICE bool isNull(int64_t index) const
Definition: heavydbTypes.h:471
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template NEVER_INLINE HOST std::pair<int8_t, int8_t> get_column_min_max ( const Column< int8_t > &  col)
template NEVER_INLINE HOST std::pair<int16_t, int16_t> get_column_min_max ( const Column< int16_t > &  col)
template NEVER_INLINE HOST std::pair<int32_t, int32_t> get_column_min_max ( const Column< int32_t > &  col)
template NEVER_INLINE HOST std::pair<int64_t, int64_t> get_column_min_max ( const Column< int64_t > &  col)
template NEVER_INLINE HOST std::pair<float, float> get_column_min_max ( const Column< float > &  col)
template NEVER_INLINE HOST std::pair<double, double> get_column_min_max ( const Column< double > &  col)
std::pair<int32_t, int32_t> get_column_min_max ( const Column< TextEncodingDict > &  col)

Definition at line 101 of file TableFunctionsCommon.cpp.

References get_column_min_max(), Column< T >::ptr_, Column< TextEncodingDict >::ptr_, Column< T >::size_, and Column< TextEncodingDict >::size_.

// TextEncodingDict overload: views the column's buffer as int32_t values and reuses the
// templated implementation. The alias column only borrows ptr_ and size_ from `col`.
101  {
102  Column<int32_t> int_alias_col;
103  int_alias_col.ptr_ = reinterpret_cast<int32_t*>(col.ptr_);  // reinterpret the dictionary-encoded entries as int32_t
104  int_alias_col.size_ = col.size_;
105  return get_column_min_max(int_alias_col);
106 }
NEVER_INLINE HOST std::pair< T, T > get_column_min_max(const Column< T > &col)
T * ptr_
Definition: heavydbTypes.h:454
TextEncodingDict * ptr_
Definition: heavydbTypes.h:502
int64_t size_
Definition: heavydbTypes.h:455

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST double get_column_std_dev ( const Column< T > &  col,
const double  mean 
)

Definition at line 178 of file TableFunctionsCommon.cpp.

References get_column_std_dev(), Column< T >::ptr_, and Column< T >::size_.

Referenced by get_column_std_dev(), and z_std_normalize_data().

// Column overload: forwards the column's raw buffer, row count and the caller-supplied mean
// to the pointer-based overload.
178  {
179  return get_column_std_dev(col.ptr_, col.size_, mean);
180 }
T * ptr_
Definition: heavydbTypes.h:454
int64_t size_
Definition: heavydbTypes.h:455
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template NEVER_INLINE HOST double get_column_std_dev ( const Column< int32_t > &  col,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const Column< int64_t > &  col,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const Column< float > &  col,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const Column< double > &  col,
const double  mean 
)
template<typename T >
NEVER_INLINE HOST double get_column_std_dev ( const T *  data,
const int64_t  num_rows,
const double  mean 
)

Definition at line 192 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

// Standard deviation of the non-null entries of data[0, num_rows) around the supplied `mean`.
// The sum of squared residuals is divided by the non-null count (not count - 1).
// Returns 0 when no non-null value is found (including num_rows == 0).
194  {
195  // const int64_t num_rows = col.size();
196  const size_t max_thread_count = std::thread::hardware_concurrency();
197  const size_t max_inputs_per_thread = 200000;
198  const size_t num_threads = std::min(
199  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));  // ceil(num_rows / max_inputs_per_thread), capped at the hardware thread count
200 
// NOTE(review): hardware_concurrency() is permitted to return 0; num_threads and the
// per-thread vectors below would then be empty while the loop body still indexes them — confirm.
201  std::vector<double> local_col_squared_residuals(num_threads, 0.);
202  std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
203  tbb::task_arena limited_arena(num_threads);
204 
205  limited_arena.execute([&] {
206  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
207  [&](const tbb::blocked_range<int64_t>& r) {
208  const int64_t start_idx = r.begin();
209  const int64_t end_idx = r.end();
210  double local_col_squared_residual = 0.;
211  int64_t local_col_non_null_count = 0;
212  for (int64_t r = start_idx; r < end_idx; ++r) {  // loop index `r` shadows the range parameter `r`
213  const T val = data[r];
214  if (val == inline_null_value<T>()) {  // skip entries equal to the inline null value for T
215  continue;
216  }
217  const double residual = val - mean;
218  local_col_squared_residual += (residual * residual);
219  local_col_non_null_count++;
220  }
221  size_t thread_idx = tbb::this_task_arena::current_thread_index();  // slot of the calling thread in the arena
222  local_col_squared_residuals[thread_idx] +=
223  local_col_squared_residual;  // += : the body can run for several sub-ranges on one slot
224  local_col_non_null_counts[thread_idx] += local_col_non_null_count;
225  });
226  });
227 
228  double col_sum_squared_residual = 0.0;
229  int64_t col_non_null_count = 0;
230 
231  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {  // serial reduction of the per-slot partials
232  col_sum_squared_residual += local_col_squared_residuals[thread_idx];
233  col_non_null_count += local_col_non_null_counts[thread_idx];
234  }
235 
236  return col_non_null_count == 0 ? 0
237  : sqrt(col_sum_squared_residual / col_non_null_count);  // guard against dividing by zero for empty / all-null input
238 }
const size_t max_inputs_per_thread
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

template NEVER_INLINE HOST double get_column_std_dev ( const int32_t *  data,
const int64_t  num_rows,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const int64_t *  data,
const int64_t  num_rows,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const float *  data,
const int64_t  num_rows,
const double  mean 
)
template NEVER_INLINE HOST double get_column_std_dev ( const double *  data,
const int64_t  num_rows,
const double  mean 
)
template<typename T >
NEVER_INLINE HOST bool is_valid_tf_input ( const T  input,
const T  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)

Definition at line 628 of file TableFunctionsCommon.cpp.

References Exclusive, Inclusive, Max, Min, and UNREACHABLE.

// Checks `input` against a single bound.
//   BoundsType::Min : input must be >= bounds_val (Inclusive) or >  bounds_val (Exclusive).
//   BoundsType::Max : input must be <= bounds_val (Inclusive) or <  bounds_val (Exclusive).
// Any other enum value reaches UNREACHABLE().
631  {
632  switch (bounds_type) {
633  case BoundsType::Min:
634  switch (interval_type) {
635  case IntervalType::Inclusive:
636  return input >= bounds_val;
637  case IntervalType::Exclusive:
638  return input > bounds_val;
639  default:
640  UNREACHABLE();
641  }  // NOTE(review): no break here, unlike the Max branch; falls through only if UNREACHABLE() returns — confirm it does not
642  case BoundsType::Max:
643  switch (interval_type) {
644  case IntervalType::Inclusive:
645  return input <= bounds_val;
646  case IntervalType::Exclusive:
647  return input < bounds_val;
648  default:
649  UNREACHABLE();
650  }
651  break;
652  default:
653  UNREACHABLE();
654  }
655  UNREACHABLE();
656  return false; // To address compiler warning
657 }
#define UNREACHABLE()
Definition: Logger.h:266
template NEVER_INLINE HOST bool is_valid_tf_input ( const int32_t  input,
const int32_t  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)
template NEVER_INLINE HOST bool is_valid_tf_input ( const int64_t  input,
const int64_t  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)
template NEVER_INLINE HOST bool is_valid_tf_input ( const float  input,
const float  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)
template NEVER_INLINE HOST bool is_valid_tf_input ( const double  input,
const double  bounds_val,
const BoundsType  bounds_type,
const IntervalType  interval_type 
)
template<typename T >
void z_std_normalize_col ( const T *  input_data,
T *  output_data,
const int64_t  num_rows,
const double  mean,
const double  std_dev 
)

Definition at line 339 of file TableFunctionsCommon.cpp.

References threading_serial::parallel_for().

Referenced by z_std_normalize_data().

// Writes the z-score (input - mean) / std_dev of every row of input_data into output_data.
// Throws std::runtime_error when std_dev <= 0.
// NOTE(review): unlike the mean / std-dev helpers, the loop has no null check, so every row
// is transformed as-is — confirm callers pass null-free data.
343  {
344  if (std_dev <= 0.0) {
345  throw std::runtime_error("Standard deviation cannot be <= 0");
346  }
347  const double inv_std_dev = 1.0 / std_dev;  // computed once so the loop multiplies instead of divides
348 
349  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
350  [&](const tbb::blocked_range<int64_t>& r) {
351  const int64_t start_idx = r.begin();
352  const int64_t end_idx = r.end();
353  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
354  output_data[row_idx] = (input_data[row_idx] - mean) * inv_std_dev;  // computed in double, then converted to T on store
355  }
356  });
357 }
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template void z_std_normalize_col ( const float *  input_data,
float *  output_data,
const int64_t  num_rows,
const double  mean,
const double  std_dev 
)
template void z_std_normalize_col ( const double *  input_data,
double *  output_data,
const int64_t  num_rows,
const double  mean,
const double  std_dev 
)
template<typename T >
std::vector<std::vector<T> > z_std_normalize_data ( const std::vector< T * > &  input_data,
const int64_t  num_rows 
)

Definition at line 371 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by dbscan__cpu_template(), and kmeans__cpu_template().

// Z-score normalizes each feature column independently and returns the results as
// one std::vector<T> of num_rows entries per input pointer.
// z_std_normalize_col throws std::runtime_error when a column's std_dev is <= 0, so a column
// whose computed standard deviation is 0 makes this function throw.
372  {
373  const int64_t num_features = input_data.size();
374  std::vector<std::vector<T>> normalized_data(num_features);
375  for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
376  const auto mean = get_column_mean(input_data[feature_idx], num_rows);
377  const auto std_dev = get_column_std_dev(input_data[feature_idx], num_rows, mean);
378  normalized_data[feature_idx].resize(num_rows);  // allocate the output buffer before it is written through data()
379  z_std_normalize_col(input_data[feature_idx],
380  normalized_data[feature_idx].data(),
381  num_rows,
382  mean,
383  std_dev);
384  }
385  return normalized_data;
386 }
void z_std_normalize_col(const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)
NEVER_INLINE HOST double get_column_mean(const Column< T > &col)
NEVER_INLINE HOST double get_column_std_dev(const Column< T > &col, const double mean)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template std::vector<std::vector<float> > z_std_normalize_data ( const std::vector< float * > &  input_data,
const int64_t  num_rows 
)
template std::vector<std::vector<double> > z_std_normalize_data ( const std::vector< double * > &  input_data,
const int64_t  num_rows 
)