OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HyperLogLog.h File Reference

Functions used to work with HyperLogLog records. More...

#include "Descriptors/CountDistinctDescriptor.h"
#include <cmath>
+ Include dependency graph for HyperLogLog.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

double get_alpha (const size_t m)
 
double get_beta (const uint32_t zeros)
 
template<typename T >
double get_harmonic_mean_denominator (T *M, uint32_t m)
 
template<typename T >
double get_beta_adjusted_estimate (const size_t m, const uint32_t z, T *M)
 
template<typename T >
double get_alpha_adjusted_estimate (const size_t m, T *M)
 
template<typename T >
uint32_t count_zeros (T *M, size_t m)
 
template<class T >
size_t hll_size (const T *M, const size_t bitmap_sz_bits)
 
template<class T1 , class T2 >
void hll_unify (T1 *lhs, T2 *rhs, const size_t m)
 
int hll_size_for_rate (const int err_percent)
 

Variables

int g_hll_precision_bits
 

Detailed Description

Functions used to work with HyperLogLog records.

Author
Alex Suhan alex@.nosp@m.mapd.nosp@m..com Copyright (c) 2017 MapD Technologies, Inc. All rights reserved.

Definition in file HyperLogLog.h.

Function Documentation

template<typename T >
uint32_t count_zeros ( T *  M,
size_t  m 
)
inline

Definition at line 79 of file HyperLogLog.h.

Referenced by hll_size().

79  {
80  uint32_t zeros = 0;
81  for (uint32_t i = 0; i < m; i++) {
82  if (M[i] == 0) {
83  zeros++;
84  }
85  }
86  return zeros;
87 }

+ Here is the caller graph for this function:

double get_alpha ( const size_t  m)
inline

Definition at line 32 of file HyperLogLog.h.

Referenced by get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

32  {
33  switch (m) {
34  case 16:
35  return 0.673;
36  case 32:
37  return 0.697;
38  case 64:
39  return 0.709;
40  default:
41  break;
42  }
43  double alpha = 0.7213 / (1 + 1.079 / m);
44  return alpha;
45 }

+ Here is the caller graph for this function:

template<typename T >
double get_alpha_adjusted_estimate ( const size_t  m,
T *  M 
)
inline

Definition at line 74 of file HyperLogLog.h.

References get_alpha(), and get_harmonic_mean_denominator().

Referenced by hll_size().

74  {
75  return (get_alpha(m) * m * m) * (1 / get_harmonic_mean_denominator(M, m));
76 };
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:58
double get_alpha(const size_t m)
Definition: HyperLogLog.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

double get_beta ( const uint32_t  zeros)
inline

Definition at line 47 of file HyperLogLog.h.

Referenced by get_beta_adjusted_estimate().

47  {
48  // Using polynomial regression terms found in LogLog-Beta paper and Redis
49  double zl = log(zeros + 1);
50  double beta = -0.370393911 * zeros + 0.070471823 * zl + 0.17393686 * pow(zl, 2) +
51  0.16339839 * pow(zl, 3) + -0.09237745 * pow(zl, 4) +
52  0.03738027 * pow(zl, 5) + -0.005384159 * pow(zl, 6) +
53  0.00042419 * pow(zl, 7);
54  return beta;
55 }

+ Here is the caller graph for this function:

template<typename T >
double get_beta_adjusted_estimate ( const size_t  m,
const uint32_t  z,
T *  M 
)
inline

Definition at line 68 of file HyperLogLog.h.

References get_alpha(), get_beta(), and get_harmonic_mean_denominator().

Referenced by hll_size().

68  {
69  return (get_alpha(m) * m * (m - z) *
70  (1 / (get_beta(z) + get_harmonic_mean_denominator(M, m))));
71 }
double get_beta(const uint32_t zeros)
Definition: HyperLogLog.h:47
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:58
double get_alpha(const size_t m)
Definition: HyperLogLog.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
double get_harmonic_mean_denominator ( T *  M,
uint32_t  m 
)
inline

Definition at line 58 of file HyperLogLog.h.

Referenced by get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

58  {
59  double accumulator = 0.0;
60 
61  for (unsigned i = 0; i < m; i++) {
62  accumulator += (1.0 / (1ULL << M[i]));
63  }
64  return accumulator;
65 }

+ Here is the caller graph for this function:

template<class T >
size_t hll_size ( const T *  M,
const size_t  bitmap_sz_bits 
)
inline

Definition at line 90 of file HyperLogLog.h.

References count_zeros(), get_alpha_adjusted_estimate(), and get_beta_adjusted_estimate().

Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), and count_distinct_set_size().

90  {
91  size_t m = 1 << bitmap_sz_bits;
92 
93  uint32_t zeros = count_zeros(M, m);
94  double estimate = get_alpha_adjusted_estimate(m, M);
95  if (estimate <= 2.5 * m) {
96  if (zeros != 0) {
97  estimate = m * log(static_cast<double>(m) / zeros);
98  }
99  } else {
100  if (bitmap_sz_bits == 14) { // Apply LogLog-Beta adjustment only when p=14
101  estimate = get_beta_adjusted_estimate(m, zeros, M);
102  }
103  }
104  // No correction for large estimates since we're using 64-bit hashes.
105  return estimate;
106 }
double get_alpha_adjusted_estimate(const size_t m, T *M)
Definition: HyperLogLog.h:74
double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T *M)
Definition: HyperLogLog.h:68
uint32_t count_zeros(T *M, size_t m)
Definition: HyperLogLog.h:79

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int hll_size_for_rate ( const int  err_percent)
inline

Definition at line 115 of file HyperLogLog.h.

References g_hll_precision_bits.

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::decide_approx_count_distinct_implementation(), and GroupByAndAggregate::initCountDistinctDescriptors().

115  {
116  double err_rate{static_cast<double>(err_percent) / 100.0};
117  double k = ceil(2 * log2(1.04 / err_rate));
118  // On the next line, 4 is the minimum for which we have an alpha adjustment factor in
119  // get_alpha()
120  return std::min(16, std::max(static_cast<int>(k), 4));
121 }

+ Here is the caller graph for this function:

template<class T1 , class T2 >
void hll_unify ( T1 *  lhs,
T2 *  rhs,
const size_t  m 
)
inline

Definition at line 109 of file HyperLogLog.h.

Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), and count_distinct_set_union().

109  {
110  for (size_t r = 0; r < m; ++r) {
111  rhs[r] = lhs[r] = std::max(static_cast<int8_t>(lhs[r]), static_cast<int8_t>(rhs[r]));
112  }
113 }

+ Here is the caller graph for this function:

Variable Documentation