OmniSciDB  a987f07e93
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HyperLogLog.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
23 #ifndef QUERYENGINE_HYPERLOGLOG_H
24 #define QUERYENGINE_HYPERLOGLOG_H
25 
27 
28 #include <cmath>
29 
30 inline double get_alpha(const size_t m) {
31  switch (m) {
32  case 16:
33  return 0.673;
34  case 32:
35  return 0.697;
36  case 64:
37  return 0.709;
38  default:
39  break;
40  }
41  double alpha = 0.7213 / (1 + 1.079 / m);
42  return alpha;
43 }
44 
45 inline double get_beta(const uint32_t zeros) {
46  // Using polynomial regression terms found in LogLog-Beta paper and Redis
47  double zl = log(zeros + 1);
48  double beta = -0.370393911 * zeros + 0.070471823 * zl + 0.17393686 * pow(zl, 2) +
49  0.16339839 * pow(zl, 3) + -0.09237745 * pow(zl, 4) +
50  0.03738027 * pow(zl, 5) + -0.005384159 * pow(zl, 6) +
51  0.00042419 * pow(zl, 7);
52  return beta;
53 }
54 
55 template <typename T>
56 inline double get_harmonic_mean_denominator(T* M, uint32_t m) {
57  double accumulator = 0.0;
58 
59  for (unsigned i = 0; i < m; i++) {
60  accumulator += (1.0 / (1ULL << M[i]));
61  }
62  return accumulator;
63 }
64 
65 template <typename T>
66 inline double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T* M) {
67  return (get_alpha(m) * m * (m - z) *
68  (1 / (get_beta(z) + get_harmonic_mean_denominator(M, m))));
69 }
70 
71 template <typename T>
72 inline double get_alpha_adjusted_estimate(const size_t m, T* M) {
73  return (get_alpha(m) * m * m) * (1 / get_harmonic_mean_denominator(M, m));
74 };
75 
76 template <typename T>
77 inline uint32_t count_zeros(T* M, size_t m) {
78  uint32_t zeros = 0;
79  for (uint32_t i = 0; i < m; i++) {
80  if (M[i] == 0) {
81  zeros++;
82  }
83  }
84  return zeros;
85 }
86 
87 template <class T>
88 inline size_t hll_size(const T* M, const size_t bitmap_sz_bits) {
89  size_t m = 1 << bitmap_sz_bits;
90 
91  uint32_t zeros = count_zeros(M, m);
92  double estimate = get_alpha_adjusted_estimate(m, M);
93  if (estimate <= 2.5 * m) {
94  if (zeros != 0) {
95  estimate = m * log(static_cast<double>(m) / zeros);
96  }
97  } else {
98  if (bitmap_sz_bits == 14) { // Apply LogLog-Beta adjustment only when p=14
99  estimate = get_beta_adjusted_estimate(m, zeros, M);
100  }
101  }
102  // No correction for large estimates since we're using 64-bit hashes.
103  return estimate;
104 }
105 
106 template <class T1, class T2>
107 inline void hll_unify(T1* lhs, T2* rhs, const size_t m) {
108  for (size_t r = 0; r < m; ++r) {
109  rhs[r] = lhs[r] = std::max(static_cast<int8_t>(lhs[r]), static_cast<int8_t>(rhs[r]));
110  }
111 }
112 
113 inline int hll_size_for_rate(const int err_percent) {
114  double err_rate{static_cast<double>(err_percent) / 100.0};
115  double k = ceil(2 * log2(1.04 / err_rate));
116  // On the next line, 4 is the minimum for which we have an alpha adjustment factor in
117  // get_alpha()
118  return std::min(16, std::max(static_cast<int>(k), 4));
119 }
120 
121 extern int g_hll_precision_bits;
122 
123 #endif // QUERYENGINE_HYPERLOGLOG_H
Descriptor for the storage layout use for (approximate) count distinct operations.
int hll_size_for_rate(const int err_percent)
Definition: HyperLogLog.h:113
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
Definition: HyperLogLog.h:107
int g_hll_precision_bits
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
Definition: HyperLogLog.h:88
double get_beta(const uint32_t zeros)
Definition: HyperLogLog.h:45
double get_alpha_adjusted_estimate(const size_t m, T *M)
Definition: HyperLogLog.h:72
double get_harmonic_mean_denominator(T *M, uint32_t m)
Definition: HyperLogLog.h:56
double get_beta_adjusted_estimate(const size_t m, const uint32_t z, T *M)
Definition: HyperLogLog.h:66
uint32_t count_zeros(T *M, size_t m)
Definition: HyperLogLog.h:77
double get_alpha(const size_t m)
Definition: HyperLogLog.h:30