OmniSciDB  04ee39c94c
HyperLogLog.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #ifndef QUERYENGINE_HYPERLOGLOG_H
26 #define QUERYENGINE_HYPERLOGLOG_H
27 
29 
30 #include <cmath>
31 
32 inline double get_alpha(const size_t m) {
33  switch (m) {
34  case 16:
35  return 0.673;
36  case 32:
37  return 0.697;
38  case 64:
39  return 0.709;
40  default:
41  break;
42  }
43  double alpha = 0.7213 / (1 + 1.079 / m);
44  return alpha;
45 }
46 
47 inline double get_beta(const uint32_t zeros) {
48  // Using polynomial regression terms found in LogLog-Beta paper and Redis
49  double zl = log(zeros + 1);
50  double beta = -0.370393911 * zeros + 0.070471823 * zl + 0.17393686 * pow(zl, 2) +
51  0.16339839 * pow(zl, 3) + -0.09237745 * pow(zl, 4) +
52  0.03738027 * pow(zl, 5) + -0.005384159 * pow(zl, 6) +
53  0.00042419 * pow(zl, 7);
54  return beta;
55 }
56 
57 template <typename T>
58 inline double get_harmonic_mean_denominator(T* M, uint32_t m) {
59  double accumulator = 0.0;
60 
61  for (unsigned i = 0; i < m; i++) {
62  accumulator += (1.0 / (1ULL << M[i]));
63  }
64  return accumulator;
65 }
66 
67 template <typename T>
68 inline double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T* M) {
69  return (get_alpha(m) * m * (m - z) *
70  (1 / (get_beta(z) + get_harmonic_mean_denominator(M, m))));
71 }
72 
73 template <typename T>
74 inline double get_alpha_adjusted_estimate(const size_t m, T* M) {
75  return (get_alpha(m) * m * m) * (1 / get_harmonic_mean_denominator(M, m));
76 };
77 
78 template <typename T>
79 inline uint32_t count_zeros(T* M, size_t m) {
80  uint32_t zeros = 0;
81  for (uint32_t i = 0; i < m; i++) {
82  if (M[i] == 0) {
83  zeros++;
84  }
85  }
86  return zeros;
87 }
88 
89 template <class T>
90 inline size_t hll_size(const T* M, const size_t bitmap_sz_bits) {
91  size_t m = 1 << bitmap_sz_bits;
92 
93  uint32_t zeros = count_zeros(M, m);
94  double estimate = get_alpha_adjusted_estimate(m, M);
95  if (estimate <= 2.5 * m) {
96  if (zeros != 0) {
97  estimate = m * log(static_cast<double>(m) / zeros);
98  }
99  } else {
100  if (bitmap_sz_bits == 14) { // Apply LogLog-Beta adjustment only when p=14
101  estimate = get_beta_adjusted_estimate(m, zeros, M);
102  }
103  }
104  // No correction for large estimates since we're using 64-bit hashes.
105  return estimate;
106 }
107 
108 template <class T1, class T2>
109 inline void hll_unify(T1* lhs, T2* rhs, const size_t m) {
110  for (size_t r = 0; r < m; ++r) {
111  rhs[r] = lhs[r] = std::max(static_cast<int8_t>(lhs[r]), static_cast<int8_t>(rhs[r]));
112  }
113 }
114 
115 inline int hll_size_for_rate(const int err_percent) {
116  double err_rate{static_cast<double>(err_percent) / 100.0};
117  double k = ceil(2 * log2(1.04 / err_rate));
118  // On the next line, 4 is the minimum for which we have an alpha adjustment factor in
119  // get_alpha()
120  return std::min(16, std::max(static_cast<int>(k), 4));
121 }
122 
123 extern int g_hll_precision_bits;
124 
125 #endif // QUERYENGINE_HYPERLOGLOG_H
Descriptor for the storage layout use for (approximate) count distinct operations.
int hll_size_for_rate(const int err_percent)
Definition: HyperLogLog.h:115
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
Definition: HyperLogLog.h:109
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
Definition: HyperLogLog.h:90
double get_beta(const uint32_t zeros)
Definition: HyperLogLog.h:47
int g_hll_precision_bits
double get_alpha_adjusted_estimate(const size_t m, T *M)
Definition: HyperLogLog.h:74
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:58
double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T *M)
Definition: HyperLogLog.h:68
uint32_t count_zeros(T *M, size_t m)
Definition: HyperLogLog.h:79
double get_alpha(const size_t m)
Definition: HyperLogLog.h:32