OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionsStats.hpp File Reference
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <rapidjson/document.h>
#include "QueryEngine/heavydbTypes.h"
+ Include dependency graph for TableFunctionsStats.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  StatsRequestPredicate
 
struct  ColumnStats< T >
 
struct  StatsRequest
 

Enumerations

enum  StatsRequestPredicateOp { StatsRequestPredicateOp::NONE, StatsRequestPredicateOp::LT, StatsRequestPredicateOp::GT }
 
enum  StatsRequestAggType {
  StatsRequestAggType::COUNT, StatsRequestAggType::MIN, StatsRequestAggType::MAX, StatsRequestAggType::SUM,
  StatsRequestAggType::AVG
}
 

Functions

std::vector< StatsRequest > parse_stats_requests_json (const std::string &stats_requests_json_str, const int64_t num_attrs)
 
template<typename TA >
ColumnStats< TA > get_column_stats (const ColumnList< TA > &attrs, StatsRequest &stats_request, std::unordered_map< std::string, ColumnStats< TA >> &stats_map)
 
template<typename TA >
void compute_stats_requests (const ColumnList< TA > &attrs, std::vector< StatsRequest > &stats_requests)
 
template<typename TA >
void populate_output_stats_cols (Column< TextEncodingDict > &stat_names, Column< TA > &stat_vals, const std::vector< StatsRequest > &stats_requests)
 
template<typename T >
NEVER_INLINE HOST ColumnStats< T > get_column_stats (const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate=StatsRequestPredicate())
 
template<typename T >
NEVER_INLINE HOST ColumnStats< T > get_column_stats (const Column< T > &col, const StatsRequestPredicate &predicate=StatsRequestPredicate())
 

Enumeration Type Documentation

enum StatsRequestAggType
strong
Enumerator
COUNT 
MIN 
MAX 
SUM 
AVG 

Definition at line 104 of file TableFunctionsStats.hpp.

enum StatsRequestPredicateOp
strong
Enumerator
NONE 
LT 
GT 

Definition at line 30 of file TableFunctionsStats.hpp.

Function Documentation

template<typename TA >
void compute_stats_requests ( const ColumnList< TA > &  attrs,
std::vector< StatsRequest > &  stats_requests 
)

Definition at line 143 of file TableFunctionsStats.hpp.

References AVG, COUNT, get_column_stats(), MAX, MIN, and SUM.

144  {
145  std::unordered_map<std::string, ColumnStats<TA>> stats_map;
146 
147  for (auto& stats_request : stats_requests) {
148  const auto column_stats = get_column_stats(attrs, stats_request, stats_map);
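149  switch (stats_request.agg_type) {
150  case StatsRequestAggType::COUNT: {
151  stats_request.result = column_stats.non_null_or_filtered_count;
152  break;
153  }
154  case StatsRequestAggType::MIN: {
155  stats_request.result = column_stats.min;
156  break;
157  }
158  case StatsRequestAggType::MAX: {
159  stats_request.result = column_stats.max;
160  break;
161  }
162  case StatsRequestAggType::SUM: {
163  stats_request.result = column_stats.sum;
164  break;
165  }
166  case StatsRequestAggType::AVG: {
167  stats_request.result = column_stats.mean;
168  break;
169  }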
170  }
171  }
172 }
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)

+ Here is the call graph for this function:

template<typename TA >
ColumnStats<TA> get_column_stats ( const ColumnList< TA > &  attrs,
StatsRequest &  stats_request,
std::unordered_map< std::string, ColumnStats< TA >> &  stats_map 
)

Definition at line 126 of file TableFunctionsStats.hpp.

References StatsRequest::attr_id, StatsRequest::filter_type, StatsRequest::filter_val, get_column_stats(), StatsRequestPredicate::to_string(), and to_string().

129  {
130  StatsRequestPredicate predicate(stats_request.filter_type, stats_request.filter_val);
131  const std::string request_str_key =
132  std::to_string(stats_request.attr_id) + "||" + predicate.to_string();
133  auto stats_map_itr = stats_map.find(request_str_key);
134  if (stats_map_itr != stats_map.end()) {
135  return stats_map_itr->second;
136  }
137  const auto column_stats = get_column_stats(attrs[stats_request.attr_id], predicate);
138  stats_map[request_str_key] = column_stats;
139  return column_stats;
140 }
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)
std::string to_string(char const *&&v)
StatsRequestPredicateOp filter_type

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST ColumnStats<T> get_column_stats ( const T *  data,
const int64_t  num_rows,
const StatsRequestPredicate &  predicate = StatsRequestPredicate() 
)

Definition at line 22 of file TableFunctionsStats.cpp.

References ColumnStats< T >::max, max_inputs_per_thread, ColumnStats< T >::mean, ColumnStats< T >::min, ColumnStats< T >::non_null_or_filtered_count, threading_serial::parallel_for(), ColumnStats< T >::sum, heavydb.dtypes::T, and ColumnStats< T >::total_count.

Referenced by compute_stats_requests(), and get_column_stats().

25  {
26  // const int64_t num_rows = col.size();
27  const size_t max_thread_count = std::thread::hardware_concurrency();
28  const size_t max_inputs_per_thread = 20000;
29  const size_t num_threads = std::min(
30  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
31 
32  std::vector<T> local_col_mins(num_threads, std::numeric_limits<T>::max());
33  std::vector<T> local_col_maxes(num_threads, std::numeric_limits<T>::lowest());
34  std::vector<double> local_col_sums(num_threads, 0.);
35  std::vector<int64_t> local_col_non_null_or_filtered_counts(num_threads, 0L);
36  tbb::task_arena limited_arena(num_threads);
37  limited_arena.execute([&] {
38  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_rows),
39  [&](const tbb::blocked_range<int64_t>& r) {
40  const int64_t start_idx = r.begin();
41  const int64_t end_idx = r.end();
42  T local_col_min = std::numeric_limits<T>::max();
43  T local_col_max = std::numeric_limits<T>::lowest();
44  double local_col_sum = 0.;
45  int64_t local_col_non_null_or_filtered_count = 0;
46  for (int64_t r = start_idx; r < end_idx; ++r) {
47  const T val = data[r];
48  if (val == inline_null_value<T>()) {
49  continue;
50  }
51  if (!predicate(val)) {
52  continue;
53  }
54  if (val < local_col_min) {
55  local_col_min = val;
56  }
57  if (val > local_col_max) {
58  local_col_max = val;
59  }
60  local_col_sum += data[r];
61  local_col_non_null_or_filtered_count++;
62  }
63  size_t thread_idx = tbb::this_task_arena::current_thread_index();
64  if (local_col_min < local_col_mins[thread_idx]) {
65  local_col_mins[thread_idx] = local_col_min;
66  }
67  if (local_col_max > local_col_maxes[thread_idx]) {
68  local_col_maxes[thread_idx] = local_col_max;
69  }
70  local_col_sums[thread_idx] += local_col_sum;
71  local_col_non_null_or_filtered_counts[thread_idx] +=
72  local_col_non_null_or_filtered_count;
73  });
74  });
75 
76  ColumnStats<T> column_stats;
77  // Use separate double col_sum instead of column_stats.sum to avoid fp imprecision if T
78  // is float
79  double col_sum = 0.0;
80  column_stats.total_count = num_rows;
81 
82  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
83  if (local_col_mins[thread_idx] < column_stats.min) {
84  column_stats.min = local_col_mins[thread_idx];
85  }
86  if (local_col_maxes[thread_idx] > column_stats.max) {
87  column_stats.max = local_col_maxes[thread_idx];
88  }
89  col_sum += local_col_sums[thread_idx];
90  column_stats.non_null_or_filtered_count +=
91  local_col_non_null_or_filtered_counts[thread_idx];
92  }
93 
94  if (column_stats.non_null_or_filtered_count > 0) {
95  column_stats.sum = col_sum;
96  column_stats.mean = col_sum / column_stats.non_null_or_filtered_count;
97  }
98  return column_stats;
99 }
const size_t max_inputs_per_thread
int64_t non_null_or_filtered_count
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST ColumnStats<T> get_column_stats ( const Column< T > &  col,
const StatsRequestPredicate &  predicate = StatsRequestPredicate() 
)

Definition at line 127 of file TableFunctionsStats.cpp.

References get_column_stats(), Column< T >::getPtr(), and Column< T >::size().

129  {
130  return get_column_stats(col.getPtr(), col.size(), predicate);
131 }
DEVICE int64_t size() const
Definition: heavydbTypes.h:726
NEVER_INLINE HOST ColumnStats< T > get_column_stats(const T *data, const int64_t num_rows, const StatsRequestPredicate &predicate)
DEVICE T * getPtr() const
Definition: heavydbTypes.h:725

+ Here is the call graph for this function:

std::vector<StatsRequest> parse_stats_requests_json ( const std::string &  stats_requests_json_str,
const int64_t  num_attrs 
)

Definition at line 205 of file TableFunctionsStats.cpp.

References StatsRequest::agg_type, StatsRequest::attr_id, convert_string_to_stats_request_agg_type(), convert_string_to_stats_request_predicate_op(), StatsRequest::filter_type, StatsRequest::filter_val, StatsRequest::name, NONE, replace_substrings(), to_string(), and shared::transform().

207  {
208  std::vector<StatsRequest> stats_requests;
209  rapidjson::Document doc;
210 
211  // remove double double quotes our parser introduces
212  const auto fixed_stats_requests_json_str =
213  replace_substrings(stats_requests_json_str, "\"\"", "\"");
214 
215  if (doc.Parse(fixed_stats_requests_json_str.c_str()).HasParseError()) {
216  // Not valid JSON
217  std::cout << "DEBUG: Failed JSON: " << fixed_stats_requests_json_str << std::endl;
218  throw std::runtime_error("Could not parse Stats Requests JSON.");
219  }
220  // Todo (todd): Enforce Schema
221  if (!doc.IsArray()) {
222  throw std::runtime_error("Stats Request JSON did not contain valid root Array.");
223  }
224  const std::vector<std::string> required_keys = {
225  "name", "attr_id", "agg_type", "filter_type"};
226 
227  for (const auto& stat_request_obj : doc.GetArray()) {
228  for (const auto& required_key : required_keys) {
229  if (!stat_request_obj.HasMember(required_key)) {
230  throw std::runtime_error("Stats Request JSON missing key " + required_key + ".");
231  }
232  if (required_key == "attr_id") {
233  if (!stat_request_obj[required_key].IsUint()) {
234  throw std::runtime_error(required_key + " must be int type");
235  }
236  } else {
237  if (!stat_request_obj[required_key].IsString()) {
238  throw std::runtime_error(required_key + " must be string type");
239  }
240  }
241  }
242  StatsRequest stats_request;
243  stats_request.name = stat_request_obj["name"].GetString();
244  stats_request.attr_id = stat_request_obj["attr_id"].GetInt() - 1;
245  if (stats_request.attr_id < 0 || stats_request.attr_id >= num_attrs) {
246  throw std::runtime_error("Invalid attr_id: " +
247  std::to_string(stats_request.attr_id));
248  }
249 
250  std::string agg_type_str = stat_request_obj["agg_type"].GetString();
251  std::transform(
252  agg_type_str.begin(), agg_type_str.end(), agg_type_str.begin(), ::toupper);
253  stats_request.agg_type = convert_string_to_stats_request_agg_type(agg_type_str);
254 
255  std::string filter_type_str = stat_request_obj["filter_type"].GetString();
256  std::transform(filter_type_str.begin(),
257  filter_type_str.end(),
258  filter_type_str.begin(),
259  ::toupper);
260  stats_request.filter_type =
261  convert_string_to_stats_request_predicate_op(filter_type_str);
262  if (stats_request.filter_type != StatsRequestPredicateOp::NONE) {
263  if (!stat_request_obj.HasMember("filter_val")) {
264  throw std::runtime_error("Stats Request JSON missing expected filter_val");
265  }
266  if (!stat_request_obj["filter_val"].IsNumber()) {
267  throw std::runtime_error("Stats Request JSON filter_val should be numeric.");
268  }
269  stats_request.filter_val = stat_request_obj["filter_val"].GetDouble();
270  }
271  stats_requests.emplace_back(stats_request);
272  }
273  return stats_requests;
274 }
StatsRequestPredicateOp convert_string_to_stats_request_predicate_op(const std::string &str)
std::string to_string(char const *&&v)
std::string replace_substrings(const std::string &str, const std::string &pattern_str, const std::string &replacement_str)
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:320
StatsRequestAggType convert_string_to_stats_request_agg_type(const std::string &str)
StatsRequestPredicateOp filter_type
StatsRequestAggType agg_type

+ Here is the call graph for this function:

template<typename TA >
void populate_output_stats_cols ( Column< TextEncodingDict > &  stat_names,
Column< TA > &  stat_vals,
const std::vector< StatsRequest > &  stats_requests 
)

Definition at line 175 of file TableFunctionsStats.hpp.

References StringDictionaryProxy::getOrAddTransient(), setup::name, logger::request_id(), and Column< TextEncodingDict >::string_dict_proxy_.

177  {
178  const int64_t num_requests = static_cast<int64_t>(stats_requests.size());
179  for (int64_t request_id = 0; request_id < num_requests; ++request_id) {
180  stat_names[request_id] =
181  stat_names.string_dict_proxy_->getOrAddTransient(stats_requests[request_id].name);
182  stat_vals[request_id] = stats_requests[request_id].result;
183  }
184 }
StringDictionaryProxy * string_dict_proxy_
Definition: heavydbTypes.h:885
int32_t getOrAddTransient(const std::string &)
RequestId request_id()
Definition: Logger.cpp:867
string name
Definition: setup.in.py:72

+ Here is the call graph for this function: