OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HyperLogLog.h File Reference

Functions used to work with HyperLogLog records. More...

#include "Descriptors/CountDistinctDescriptor.h"
#include <cmath>
+ Include dependency graph for HyperLogLog.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

double get_alpha (const size_t m)
 
double get_beta (const uint32_t zeros)
 
template<typename T >
double get_harmonic_mean_denominator (T *M, uint32_t m)
 
template<typename T >
double get_beta_adjusted_estimate (const size_t m, const uint32_t z, T *M)
 
template<typename T >
double get_alpha_adjusted_estimate (const size_t m, T *M)
 
template<typename T >
uint32_t count_zeros (T *M, size_t m)
 
template<class T >
size_t hll_size (const T *M, const size_t bitmap_sz_bits)
 
template<class T1 , class T2 >
void hll_unify (T1 *lhs, T2 *rhs, const size_t m)
 
int hll_size_for_rate (const int err_percent)
 

Variables

int g_hll_precision_bits
 

Detailed Description

Functions used to work with HyperLogLog records.

Definition in file HyperLogLog.h.

Function Documentation

template<typename T >
uint32_t count_zeros ( T *  M,
size_t  m 
)
inline

Definition at line 77 of file HyperLogLog.h.

Referenced by hll_size().

77  {
78  uint32_t zeros = 0;
79  for (uint32_t i = 0; i < m; i++) {
80  if (M[i] == 0) {
81  zeros++;
82  }
83  }
84  return zeros;
85 }

+ Here is the caller graph for this function:

double get_alpha ( const size_t  m)
inline

Definition at line 30 of file HyperLogLog.h.

Referenced by get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

30  {
31  switch (m) {
32  case 16:
33  return 0.673;
34  case 32:
35  return 0.697;
36  case 64:
37  return 0.709;
38  default:
39  break;
40  }
41  double alpha = 0.7213 / (1 + 1.079 / m);
42  return alpha;
43 }

+ Here is the caller graph for this function:

template<typename T >
double get_alpha_adjusted_estimate ( const size_t  m,
T *  M 
)
inline

Definition at line 72 of file HyperLogLog.h.

References get_alpha(), and get_harmonic_mean_denominator().

Referenced by hll_size().

72  {
73  return (get_alpha(m) * m * m) * (1 / get_harmonic_mean_denominator(M, m));
74 };
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:56
double get_alpha(const size_t m)
Definition: HyperLogLog.h:30

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

double get_beta ( const uint32_t  zeros)
inline

Definition at line 45 of file HyperLogLog.h.

Referenced by get_beta_adjusted_estimate().

45  {
46  // Using polynomial regression terms found in LogLog-Beta paper and Redis
47  double zl = log(zeros + 1);
48  double beta = -0.370393911 * zeros + 0.070471823 * zl + 0.17393686 * pow(zl, 2) +
49  0.16339839 * pow(zl, 3) + -0.09237745 * pow(zl, 4) +
50  0.03738027 * pow(zl, 5) + -0.005384159 * pow(zl, 6) +
51  0.00042419 * pow(zl, 7);
52  return beta;
53 }

+ Here is the caller graph for this function:

template<typename T >
double get_beta_adjusted_estimate ( const size_t  m,
const uint32_t  z,
T *  M 
)
inline

Definition at line 66 of file HyperLogLog.h.

References get_alpha(), get_beta(), and get_harmonic_mean_denominator().

Referenced by hll_size().

66  {
67  return (get_alpha(m) * m * (m - z) *
68  (1 / (get_beta(z) + get_harmonic_mean_denominator(M, m))));
69 }
double get_beta(const uint32_t zeros)
Definition: HyperLogLog.h:45
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:56
double get_alpha(const size_t m)
Definition: HyperLogLog.h:30

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
double get_harmonic_mean_denominator ( T *  M,
uint32_t  m 
)
inline

Definition at line 56 of file HyperLogLog.h.

Referenced by get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

56  {
57  double accumulator = 0.0;
58 
59  for (unsigned i = 0; i < m; i++) {
60  accumulator += (1.0 / (1ULL << M[i]));
61  }
62  return accumulator;
63 }

+ Here is the caller graph for this function:

template<class T >
size_t hll_size ( const T *  M,
const size_t  bitmap_sz_bits 
)
inline

Definition at line 88 of file HyperLogLog.h.

References count_zeros(), get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

Referenced by RangeJoinHashTable::approximateTupleCount(), OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), and count_distinct_set_size().

88  {
89  size_t m = 1 << bitmap_sz_bits;
90 
91  uint32_t zeros = count_zeros(M, m);
92  double estimate = get_alpha_adjusted_estimate(m, M);
93  if (estimate <= 2.5 * m) {
94  if (zeros != 0) {
95  estimate = m * log(static_cast<double>(m) / zeros);
96  }
97  } else {
98  if (bitmap_sz_bits == 14) { // Apply LogLog-Beta adjustment only when p=14
99  estimate = get_beta_adjusted_estimate(m, zeros, M);
100  }
101  }
102  // No correction for large estimates since we're using 64-bit hashes.
103  return estimate;
104 }
double get_alpha_adjusted_estimate(const size_t m, T *M)
Definition: HyperLogLog.h:72
double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T *M)
Definition: HyperLogLog.h:66
uint32_t count_zeros(T *M, size_t m)
Definition: HyperLogLog.h:77

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int hll_size_for_rate ( const int  err_percent)
inline

Definition at line 113 of file HyperLogLog.h.

References g_hll_precision_bits.

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::decide_approx_count_distinct_implementation(), and anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors().

113  {
114  double err_rate{static_cast<double>(err_percent) / 100.0};
115  double k = ceil(2 * log2(1.04 / err_rate));
116  // On the next line, 4 is the minimum for which we have an alpha adjustment factor in
117  // get_alpha()
118  return std::min(16, std::max(static_cast<int>(k), 4));
119 }

+ Here is the caller graph for this function:

template<class T1 , class T2 >
void hll_unify ( T1 *  lhs,
T2 *  rhs,
const size_t  m 
)
inline

Definition at line 107 of file HyperLogLog.h.

Referenced by RangeJoinHashTable::approximateTupleCount(), OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), and count_distinct_set_union().

107  {
108  for (size_t r = 0; r < m; ++r) {
109  rhs[r] = lhs[r] = std::max(static_cast<int8_t>(lhs[r]), static_cast<int8_t>(rhs[r]));
110  }
111 }

+ Here is the caller graph for this function:

Variable Documentation