#include "TableFunctionsCommon.hpp"
#include <filesystem>
#include <memory>
#include <regex>
#include <string>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

Include dependency graph for TableFunctionsCommon.cpp:

Namespaces
	FileUtilities

Macros
#define	NANOSECONDS_PER_SECOND 1000000000

Functions
template<typename T >
NEVER_INLINE HOST std::pair< T, T >	get_column_min_max (const Column< T > &col)

template NEVER_INLINE HOST std::pair< int8_t, int8_t >	get_column_min_max (const Column< int8_t > &col)

template NEVER_INLINE HOST std::pair< int16_t, int16_t >	get_column_min_max (const Column< int16_t > &col)

template NEVER_INLINE HOST std::pair< int32_t, int32_t >	get_column_min_max (const Column< int32_t > &col)

template NEVER_INLINE HOST std::pair< int64_t, int64_t >	get_column_min_max (const Column< int64_t > &col)

template NEVER_INLINE HOST std::pair< float, float >	get_column_min_max (const Column< float > &col)

template NEVER_INLINE HOST std::pair< double, double >	get_column_min_max (const Column< double > &col)

std::pair< int32_t, int32_t >	get_column_min_max (const Column< TextEncodingDict > &col)

template<typename T >
NEVER_INLINE HOST double	get_column_mean (const T *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const int8_t *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const int16_t *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const int32_t *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const int64_t *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const float *data, const int64_t num_rows)

template NEVER_INLINE HOST double	get_column_mean (const double *data, const int64_t num_rows)

template<typename T >
NEVER_INLINE HOST double	get_column_mean (const Column< T > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< int8_t > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< int16_t > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< int32_t > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< int64_t > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< float > &col)

template NEVER_INLINE HOST double	get_column_mean (const Column< double > &col)

template<typename T >
NEVER_INLINE HOST double	get_column_std_dev (const Column< T > &col, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const Column< int32_t > &col, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const Column< int64_t > &col, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const Column< float > &col, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const Column< double > &col, const double mean)

template<typename T >
NEVER_INLINE HOST double	get_column_std_dev (const T *data, const int64_t num_rows, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const int32_t *data, const int64_t num_rows, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const int64_t *data, const int64_t num_rows, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const float *data, const int64_t num_rows, const double mean)

template NEVER_INLINE HOST double	get_column_std_dev (const double *data, const int64_t num_rows, const double mean)

template<typename T >
NEVER_INLINE HOST std::tuple < T, T, bool >	get_column_metadata (const Column< T > &col)

template NEVER_INLINE HOST std::tuple< int8_t, int8_t, bool >	get_column_metadata (const Column< int8_t > &col)

template NEVER_INLINE HOST std::tuple< int16_t, int16_t, bool >	get_column_metadata (const Column< int16_t > &col)

template NEVER_INLINE HOST std::tuple< int32_t, int32_t, bool >	get_column_metadata (const Column< int32_t > &col)

template NEVER_INLINE HOST std::tuple< int64_t, int64_t, bool >	get_column_metadata (const Column< int64_t > &col)

template NEVER_INLINE HOST std::tuple< float, float, bool >	get_column_metadata (const Column< float > &col)

template NEVER_INLINE HOST std::tuple< double, double, bool >	get_column_metadata (const Column< double > &col)

std::tuple< int32_t, int32_t, bool >	get_column_metadata (const Column< TextEncodingDict > &col)

template<typename T >
void	z_std_normalize_col (const T *input_data, T *output_data, const int64_t num_rows, const double mean, const double std_dev)

template void	z_std_normalize_col (const float *input_data, float *output_data, const int64_t num_rows, const double mean, const double std_dev)

template void	z_std_normalize_col (const double *input_data, double *output_data, const int64_t num_rows, const double mean, const double std_dev)

template<typename T >
std::vector< std::vector< T > >	z_std_normalize_data (const std::vector< T * > &input_data, const int64_t num_rows)

template std::vector < std::vector< float > >	z_std_normalize_data (const std::vector< float * > &input_data, const int64_t num_rows)

template std::vector < std::vector< double > >	z_std_normalize_data (const std::vector< double * > &input_data, const int64_t num_rows)

template<typename T >
ZStdNormalizationSummaryStats< T >	z_std_normalize_data_with_summary_stats (const std::vector< T * > &input_data, const int64_t num_rows)

template ZStdNormalizationSummaryStats < float >	z_std_normalize_data_with_summary_stats (const std::vector< float * > &input_data, const int64_t num_rows)

template ZStdNormalizationSummaryStats < double >	z_std_normalize_data_with_summary_stats (const std::vector< double * > &input_data, const int64_t num_rows)

template<typename T1 , typename T2 >
NEVER_INLINE HOST T1	distance_in_meters (const T1 fromlon, const T1 fromlat, const T2 tolon, const T2 tolat)

template NEVER_INLINE HOST float	distance_in_meters (const float fromlon, const float fromlat, const float tolon, const float tolat)

template NEVER_INLINE HOST float	distance_in_meters (const float fromlon, const float fromlat, const double tolon, const double tolat)

template NEVER_INLINE HOST double	distance_in_meters (const double fromlon, const double fromlat, const float tolon, const float tolat)

template NEVER_INLINE HOST double	distance_in_meters (const double fromlon, const double fromlat, const double tolon, const double tolat)
	Computes the distance, in meters, between two WGS-84 positions. More...

std::regex	FileUtilities::glob_to_regex (const std::string &glob, bool case_sensitive=false)

std::vector < std::filesystem::path >	FileUtilities::get_fs_paths (const std::string &file_or_directory)

template<typename T >
NEVER_INLINE HOST bool	is_valid_tf_input (const T input, const T bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

template NEVER_INLINE HOST bool	is_valid_tf_input (const int32_t input, const int32_t bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

template NEVER_INLINE HOST bool	is_valid_tf_input (const int64_t input, const int64_t bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

template NEVER_INLINE HOST bool	is_valid_tf_input (const float input, const float bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

template NEVER_INLINE HOST bool	is_valid_tf_input (const double input, const double bounds_val, const BoundsType bounds_type, const IntervalType interval_type)

Macro Definition Documentation

#define NANOSECONDS_PER_SECOND 1000000000

Definition at line 29 of file TableFunctionsCommon.cpp.

Function Documentation

template<typename T1 , typename T2 >

NEVER_INLINE HOST T1 distance_in_meters	(	const T1	fromlon,
		const T1	fromlat,
		const T2	tolon,
		const T2	tolat
	)

Definition at line 452 of file TableFunctionsCommon.cpp.

                                                                                        {
   T1 latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
   T1 longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
   T1 latitudeH = sin(latitudeArc * 0.5);
   latitudeH *= latitudeH;
   T1 lontitudeH = sin(longitudeArc * 0.5);
   lontitudeH *= lontitudeH;
   T1 tmp = cos(fromlat * 0.017453292519943295769236907684886) *
            cos(tolat * 0.017453292519943295769236907684886);
   return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
 }

template NEVER_INLINE HOST float distance_in_meters	(	const float	fromlon,
		const float	fromlat,
		const float	tolon,
		const float	tolat
	)

template NEVER_INLINE HOST float distance_in_meters	(	const float	fromlon,
		const float	fromlat,
		const double	tolon,
		const double	tolat
	)

template NEVER_INLINE HOST double distance_in_meters	(	const double	fromlon,
		const double	fromlat,
		const float	tolon,
		const float	tolat
	)

template NEVER_INLINE HOST double distance_in_meters	(	const double	fromlon,
		const double	fromlat,
		const double	tolon,
		const double	tolat
	)

Computes the distance, in meters, between two WGS-84 positions.

The result is equal to EARTH_RADIUS_IN_METERS*ArcInRadians(from,to)

ArcInRadians is equal to Distance(from,to)/EARTH_RADIUS_IN_METERS = 2*asin(sqrt(h(d/EARTH_RADIUS_IN_METERS )))

where:

d is the distance in meters between 'from' and 'to' positions.
h is the haversine function: h(x)=sin²(x/2)

code attribution: http://blog.julien.cayzac.name/2008/10/arc-and-distance-between-two-points-on.html

The haversine formula gives: h(d/R) = h(from.lat-to.lat) + h(from.lon-to.lon)*cos(from.lat)*cos(to.lat)

See Also: http://en.wikipedia.org/wiki/Law_of_haversines

Definition at line 433 of file ExtensionFunctions.hpp.

Referenced by GeoRaster< T, Z >::calculate_bins_and_scales(), length_linestring(), ST_Distance_Point_LineString_Geodesic(), and ST_Distance_Point_Point_Geodesic().

                                               {
   double latitudeArc = (fromlat - tolat) * 0.017453292519943295769236907684886;
   double longitudeArc = (fromlon - tolon) * 0.017453292519943295769236907684886;
   double latitudeH = sin(latitudeArc * 0.5);
   latitudeH *= latitudeH;
   double lontitudeH = sin(longitudeArc * 0.5);
   lontitudeH *= lontitudeH;
   double tmp = cos(fromlat * 0.017453292519943295769236907684886) *
                cos(tolat * 0.017453292519943295769236907684886);
   return 6372797.560856 * (2.0 * asin(sqrt(latitudeH + tmp * lontitudeH)));
 }

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST double get_column_mean	(	const T *	data,
		const int64_t	num_rows
	)

Definition at line 116 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

Referenced by get_column_mean(), r2_score_impl(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                                                 {
   // const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 20000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<double> local_col_sums(num_threads, 0.);
   std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
   tbb::task_arena limited_arena(num_threads);
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           double local_col_sum = 0.;
           int64_t local_col_non_null_count = 0;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = data[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (val == inline_null_value<T>()) {
               continue;
             }
             local_col_sum += data[r];
             local_col_non_null_count++;
           }
           size_t thread_idx = tbb::this_task_arena::current_thread_index();
           local_col_sums[thread_idx] += local_col_sum;
           local_col_non_null_counts[thread_idx] += local_col_non_null_count;
         });
   });
 
   double col_sum = 0.0;
   int64_t col_non_null_count = 0L;
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     col_sum += local_col_sums[thread_idx];
     col_non_null_count += local_col_non_null_counts[thread_idx];
   }
 
   return col_non_null_count == 0 ? 0 : col_sum / col_non_null_count;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template NEVER_INLINE HOST double get_column_mean	(	const int8_t *	data,
		const int64_t	num_rows
	)

template NEVER_INLINE HOST double get_column_mean	(	const int16_t *	data,
		const int64_t	num_rows
	)

template NEVER_INLINE HOST double get_column_mean	(	const int32_t *	data,
		const int64_t	num_rows
	)

template NEVER_INLINE HOST double get_column_mean	(	const int64_t *	data,
		const int64_t	num_rows
	)

template NEVER_INLINE HOST double get_column_mean	(	const float *	data,
		const int64_t	num_rows
	)

template NEVER_INLINE HOST double get_column_mean	(	const double *	data,
		const int64_t	num_rows
	)

template<typename T >

NEVER_INLINE HOST double get_column_mean ( const Column< T > & col )

Definition at line 183 of file TableFunctionsCommon.cpp.

References get_column_mean(), Column< T >::getPtr(), and Column< T >::size().

                                                                {
   return get_column_mean(col.getPtr(), col.size());
 }

Here is the call graph for this function:

template NEVER_INLINE HOST double get_column_mean ( const Column< int8_t > & col )

template NEVER_INLINE HOST double get_column_mean ( const Column< int16_t > & col )

template NEVER_INLINE HOST double get_column_mean ( const Column< int32_t > & col )

template NEVER_INLINE HOST double get_column_mean ( const Column< int64_t > & col )

template NEVER_INLINE HOST double get_column_mean ( const Column< float > & col )

template NEVER_INLINE HOST double get_column_mean ( const Column< double > & col )

template<typename T >

NEVER_INLINE HOST std::tuple<T, T, bool> get_column_metadata ( const Column< T > & col )

Definition at line 276 of file TableFunctionsCommon.cpp.

References Column< T >::isNull(), max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by get_column_metadata().

                                                                                  {
   T col_min = std::numeric_limits<T>::max();
   T col_max = std::numeric_limits<T>::lowest();
   bool has_nulls = false;
   const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 200000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
   std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
   std::vector<bool> local_col_has_nulls(num_threads, false);
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           T local_col_min = std::numeric_limits<T>::max();
           T local_col_max = std::numeric_limits<T>::lowest();
           bool local_has_nulls = false;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = col[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (col.isNull(r)) {
               local_has_nulls = true;
               continue;
             }
             if (val < local_col_min) {
               local_col_min = val;
             }
             if (val > local_col_max) {
               local_col_max = val;
             }
           }
           const size_t thread_idx = tbb::this_task_arena::current_thread_index();
           if (local_has_nulls) {
             local_col_has_nulls[thread_idx] = true;
           }
           if (local_col_min < local_col_mins[thread_idx]) {
             local_col_mins[thread_idx] = local_col_min;
           }
           if (local_col_max > local_col_maxes[thread_idx]) {
             local_col_maxes[thread_idx] = local_col_max;
           }
         });
   });
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     if (local_col_has_nulls[thread_idx]) {
       has_nulls = true;
     }
     if (local_col_mins[thread_idx] < col_min) {
       col_min = local_col_mins[thread_idx];
     }
     if (local_col_maxes[thread_idx] > col_max) {
       col_max = local_col_maxes[thread_idx];
     }
   }
   return {col_min, col_max, has_nulls};
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template NEVER_INLINE HOST std::tuple<int8_t, int8_t, bool> get_column_metadata ( const Column< int8_t > & col )

template NEVER_INLINE HOST std::tuple<int16_t, int16_t, bool> get_column_metadata ( const Column< int16_t > & col )

template NEVER_INLINE HOST std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< int32_t > & col )

template NEVER_INLINE HOST std::tuple<int64_t, int64_t, bool> get_column_metadata ( const Column< int64_t > & col )

template NEVER_INLINE HOST std::tuple<float, float, bool> get_column_metadata ( const Column< float > & col )

template NEVER_INLINE HOST std::tuple<double, double, bool> get_column_metadata ( const Column< double > & col )

std::tuple<int32_t, int32_t, bool> get_column_metadata ( const Column< TextEncodingDict > & col )

Definition at line 358 of file TableFunctionsCommon.cpp.

References get_column_metadata(), Column< TextEncodingDict >::getPtr(), and Column< TextEncodingDict >::size().

                                          {
   Column<int32_t> int_alias_col(reinterpret_cast<int32_t*>(col.getPtr()), col.size());
   return get_column_metadata(int_alias_col);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST std::pair<T, T> get_column_min_max ( const Column< T > & col )

Definition at line 32 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by ct_union_pushdown_stats__cpu_template(), RasterFormat_Namespace::format_raster_data(), GeoRaster< T, Z >::GeoRaster(), get_column_min_max(), get_min_or_max(), get_min_or_max_union(), and TableFunctions_Namespace::OneHotEncoder_Namespace::get_top_k_keys().

                                                                          {
   T col_min = std::numeric_limits<T>::max();
   T col_max = std::numeric_limits<T>::lowest();
   const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 20000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
   std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           T local_col_min = std::numeric_limits<T>::max();
           T local_col_max = std::numeric_limits<T>::lowest();
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = col[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (val == inline_null_value<T>()) {
               continue;
             }
             if (val < local_col_min) {
               local_col_min = val;
             }
             if (val > local_col_max) {
               local_col_max = val;
             }
           }
           size_t thread_idx = tbb::this_task_arena::current_thread_index();
           if (local_col_min < local_col_mins[thread_idx]) {
             local_col_mins[thread_idx] = local_col_min;
           }
           if (local_col_max > local_col_maxes[thread_idx]) {
             local_col_maxes[thread_idx] = local_col_max;
           }
         });
   });
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     if (local_col_mins[thread_idx] < col_min) {
       col_min = local_col_mins[thread_idx];
     }
     if (local_col_maxes[thread_idx] > col_max) {
       col_max = local_col_maxes[thread_idx];
     }
   }
   return std::make_pair(col_min, col_max);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template NEVER_INLINE HOST std::pair<int8_t, int8_t> get_column_min_max ( const Column< int8_t > & col )

template NEVER_INLINE HOST std::pair<int16_t, int16_t> get_column_min_max ( const Column< int16_t > & col )

template NEVER_INLINE HOST std::pair<int32_t, int32_t> get_column_min_max ( const Column< int32_t > & col )

template NEVER_INLINE HOST std::pair<int64_t, int64_t> get_column_min_max ( const Column< int64_t > & col )

template NEVER_INLINE HOST std::pair<float, float> get_column_min_max ( const Column< float > & col )

template NEVER_INLINE HOST std::pair<double, double> get_column_min_max ( const Column< double > & col )

std::pair<int32_t, int32_t> get_column_min_max ( const Column< TextEncodingDict > & col )

Definition at line 104 of file TableFunctionsCommon.cpp.

References get_column_min_max(), Column< TextEncodingDict >::getPtr(), and Column< TextEncodingDict >::size().

                                                                                   {
   Column<int32_t> int_alias_col(reinterpret_cast<int32_t*>(col.getPtr()), col.size());
   return get_column_min_max(int_alias_col);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST double get_column_std_dev	(	const Column< T > &	col,
		const double	mean
	)

Definition at line 195 of file TableFunctionsCommon.cpp.

References get_column_std_dev(), Column< T >::getPtr(), and Column< T >::size().

Referenced by get_column_std_dev(), z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                                                      {
   return get_column_std_dev(col.getPtr(), col.size(), mean);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template NEVER_INLINE HOST double get_column_std_dev	(	const Column< int32_t > &	col,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const Column< int64_t > &	col,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const Column< float > &	col,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const Column< double > &	col,
		const double	mean
	)

template<typename T >

NEVER_INLINE HOST double get_column_std_dev	(	const T *	data,
		const int64_t	num_rows,
		const double	mean
	)

Definition at line 209 of file TableFunctionsCommon.cpp.

References max_inputs_per_thread, threading_serial::parallel_for(), and heavydb.dtypes::T.

                                                                {
   // const int64_t num_rows = col.size();
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 200000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<double> local_col_squared_residuals(num_threads, 0.);
   std::vector<int64_t> local_col_non_null_counts(num_threads, 0L);
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           double local_col_squared_residual = 0.;
           int64_t local_col_non_null_count = 0;
           for (int64_t r = start_idx; r < end_idx; ++r) {
             const T val = data[r];
             if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
               if (std::isnan(val) || std::isinf(val)) {
                 continue;
               }
             }
             if (val == inline_null_value<T>()) {
               continue;
             }
             const double residual = val - mean;
             local_col_squared_residual += (residual * residual);
             local_col_non_null_count++;
           }
           size_t thread_idx = tbb::this_task_arena::current_thread_index();
           local_col_squared_residuals[thread_idx] += local_col_squared_residual;
           local_col_non_null_counts[thread_idx] += local_col_non_null_count;
         });
   });
 
   double col_sum_squared_residual = 0.0;
   int64_t col_non_null_count = 0;
 
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     col_sum_squared_residual += local_col_squared_residuals[thread_idx];
     col_non_null_count += local_col_non_null_counts[thread_idx];
   }
 
   return col_non_null_count == 0 ? 0
                                  : sqrt(col_sum_squared_residual / col_non_null_count);
 }

Here is the call graph for this function:

template NEVER_INLINE HOST double get_column_std_dev	(	const int32_t *	data,
		const int64_t	num_rows,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const int64_t *	data,
		const int64_t	num_rows,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const float *	data,
		const int64_t	num_rows,
		const double	mean
	)

template NEVER_INLINE HOST double get_column_std_dev	(	const double *	data,
		const int64_t	num_rows,
		const double	mean
	)

template<typename T >

NEVER_INLINE HOST bool is_valid_tf_input	(	const T	input,
		const T	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

Definition at line 556 of file TableFunctionsCommon.cpp.

References Exclusive, Inclusive, Max, Min, and UNREACHABLE.

                                                                            {
   switch (bounds_type) {
     case BoundsType::Min:
       switch (interval_type) {
         case IntervalType::Inclusive:
           return input >= bounds_val;
         case IntervalType::Exclusive:
           return input > bounds_val;
         default:
           UNREACHABLE();
       }
     case BoundsType::Max:
       switch (interval_type) {
         case IntervalType::Inclusive:
           return input <= bounds_val;
         case IntervalType::Exclusive:
           return input < bounds_val;
         default:
           UNREACHABLE();
       }
       break;
     default:
       UNREACHABLE();
   }
   UNREACHABLE();
   return false;  // To address compiler warning
 }

template NEVER_INLINE HOST bool is_valid_tf_input	(	const int32_t	input,
		const int32_t	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

template NEVER_INLINE HOST bool is_valid_tf_input	(	const int64_t	input,
		const int64_t	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

template NEVER_INLINE HOST bool is_valid_tf_input	(	const float	input,
		const float	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

template NEVER_INLINE HOST bool is_valid_tf_input	(	const double	input,
		const double	bounds_val,
		const BoundsType	bounds_type,
		const IntervalType	interval_type
	)

template<typename T >

void z_std_normalize_col	(	const T *	input_data,
		T *	output_data,
		const int64_t	num_rows,
		const double	mean,
		const double	std_dev
	)

Definition at line 365 of file TableFunctionsCommon.cpp.

References threading_serial::parallel_for().

Referenced by z_std_normalize_data(), and z_std_normalize_data_with_summary_stats().

                                                {
   if (std_dev <= 0.0) {
     throw std::runtime_error("Standard deviation cannot be <= 0");
   }
   const double inv_std_dev = 1.0 / std_dev;
 
   tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
                     [&](const tbb::blocked_range<int64_t>& r) {
                       const int64_t start_idx = r.begin();
                       const int64_t end_idx = r.end();
                       for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
                         output_data[row_idx] = (input_data[row_idx] - mean) * inv_std_dev;
                       }
                     });
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template void z_std_normalize_col	(	const float *	input_data,
		float *	output_data,
		const int64_t	num_rows,
		const double	mean,
		const double	std_dev
	)

template void z_std_normalize_col	(	const double *	input_data,
		double *	output_data,
		const int64_t	num_rows,
		const double	mean,
		const double	std_dev
	)

template<typename T >

std::vector<std::vector<T> > z_std_normalize_data	(	const std::vector< T * > &	input_data,
		const int64_t	num_rows
	)

Definition at line 397 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by dbscan__cpu_template(), and kmeans__cpu_template().

                                                                          {
   const int64_t num_features = input_data.size();
   std::vector<std::vector<T>> normalized_data(num_features);
   for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
     const auto mean = get_column_mean(input_data[feature_idx], num_rows);
     const auto std_dev = get_column_std_dev(input_data[feature_idx], num_rows, mean);
     normalized_data[feature_idx].resize(num_rows);
     z_std_normalize_col(input_data[feature_idx],
                         normalized_data[feature_idx].data(),
                         num_rows,
                         mean,
                         std_dev);
   }
   return normalized_data;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template std::vector<std::vector<float> > z_std_normalize_data	(	const std::vector< float * > &	input_data,
		const int64_t	num_rows
	)

template std::vector<std::vector<double> > z_std_normalize_data	(	const std::vector< double * > &	input_data,
		const int64_t	num_rows
	)

template<typename T >

ZStdNormalizationSummaryStats<T> z_std_normalize_data_with_summary_stats	(	const std::vector< T * > &	input_data,
		const int64_t	num_rows
	)

Definition at line 422 of file TableFunctionsCommon.cpp.

References get_column_mean(), get_column_std_dev(), and z_std_normalize_col().

Referenced by pca_fit_impl().

                             {
   const int64_t num_features = input_data.size();
   std::vector<std::vector<T>> normalized_data(num_features);
   std::vector<T> means(num_features);
   std::vector<T> std_devs(num_features);
   for (int64_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
     means[feature_idx] = get_column_mean(input_data[feature_idx], num_rows);
     std_devs[feature_idx] =
         get_column_std_dev(input_data[feature_idx], num_rows, means[feature_idx]);
     normalized_data[feature_idx].resize(num_rows);
     z_std_normalize_col(input_data[feature_idx],
                         normalized_data[feature_idx].data(),
                         num_rows,
                         means[feature_idx],
                         std_devs[feature_idx]);
   }
   return ZStdNormalizationSummaryStats<T>(normalized_data, means, std_devs);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template ZStdNormalizationSummaryStats<float> z_std_normalize_data_with_summary_stats	(	const std::vector< float * > &	input_data,
		const int64_t	num_rows
	)

template ZStdNormalizationSummaryStats<double> z_std_normalize_data_with_summary_stats	(	const std::vector< double * > &	input_data,
		const int64_t	num_rows
	)

Namespaces

Macros

Functions

Macro Definition Documentation

Function Documentation