OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CountDistinct.h File Reference

Functions used to work with (approximate) count distinct sets. More...

#include "Descriptors/CountDistinctDescriptor.h"
#include "HyperLogLog.h"
#include "ThirdParty/robin_hood/robin_hood.h"
#include <bitset>
#include <vector>
+ Include dependency graph for CountDistinct.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Typedefs

using CountDistinctDescriptors = std::vector< CountDistinctDescriptor >
 
using CountDistinctSet = robin_hood::unordered_set< int64_t >
 

Functions

size_t bitmap_set_size (const int8_t *bitmap, const size_t bitmap_byte_sz)
 
void bitmap_set_union (int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
 
void partial_bitmap_union (int8_t *set_vals, const CountDistinctDescriptor &count_distinct_desc)
 
int64_t count_distinct_set_size (const int64_t set_handle, const CountDistinctDescriptor &count_distinct_desc)
 
void count_distinct_set_union (const int64_t new_set_handle, const int64_t old_set_handle, const CountDistinctDescriptor &new_count_distinct_desc, const CountDistinctDescriptor &old_count_distinct_desc)
 

Detailed Description

Functions used to work with (approximate) count distinct sets.

Author
Alex Suhan alex@.nosp@m.mapd.nosp@m..com Copyright (c) 2017 MapD Technologies, Inc. All rights reserved.

Definition in file CountDistinct.h.

Typedef Documentation

Definition at line 36 of file CountDistinct.h.

using CountDistinctSet = robin_hood::unordered_set<int64_t>

Definition at line 37 of file CountDistinct.h.

Function Documentation

size_t bitmap_set_size ( const int8_t *  bitmap,
const size_t  bitmap_byte_sz 
)
inline

Definition at line 39 of file CountDistinct.h.

Referenced by count_distinct_set_size(), and ResultSet::getNDVEstimator().

39  {
40  const auto bitmap_word_count = bitmap_byte_sz >> 3;
41  const auto bitmap_rem_bytes = bitmap_byte_sz & 7;
42  const auto bitmap64 = reinterpret_cast<const int64_t*>(bitmap);
43  size_t set_size = 0;
44  for (size_t i = 0; i < bitmap_word_count; ++i) {
45  std::bitset<64> word_bitset(bitmap64[i]);
46  set_size += word_bitset.count();
47  }
48  const auto rem_bitmap = reinterpret_cast<const int8_t*>(&bitmap64[bitmap_word_count]);
49  for (size_t i = 0; i < bitmap_rem_bytes; ++i) {
50  std::bitset<8> byte_bitset(rem_bitmap[i]);
51  set_size += byte_bitset.count();
52  }
53  return set_size;
54 }

+ Here is the caller graph for this function:

void bitmap_set_union ( int8_t *  lhs,
int8_t *  rhs,
const size_t  bitmap_sz 
)
inline

Definition at line 56 of file CountDistinct.h.

Referenced by count_distinct_set_union(), and partial_bitmap_union().

56  {
57  for (size_t i = 0; i < bitmap_sz; ++i) {
58  lhs[i] = rhs[i] = lhs[i] | rhs[i];
59  }
60 }

+ Here is the caller graph for this function:

int64_t count_distinct_set_size ( const int64_t  set_handle,
const CountDistinctDescriptor count_distinct_desc 
)
inline

Definition at line 77 of file CountDistinct.h.

References CountDistinctDescriptor::approximate, Bitmap, bitmap_set_size(), CountDistinctDescriptor::bitmap_sz_bits, CountDistinctDescriptor::bitmapSizeBytes(), CHECK, CHECK_GT, CountDistinctDescriptor::device_type, GPU, hll_size(), CountDistinctDescriptor::impl_type_, partial_bitmap_union(), CountDistinctDescriptor::sub_bitmap_count, and UnorderedSet.

Referenced by ResultSet::makeTargetValue(), ResultSet::ResultSetComparator< BUFFER_ITERATOR_TYPE >::materializeCountDistinctColumn(), and ResultSetStorage::reduceSingleRow().

79  {
80  if (!set_handle) {
81  return 0;
82  }
83  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
84  auto set_vals = reinterpret_cast<int8_t*>(set_handle);
85  if (count_distinct_desc.approximate) {
86  CHECK_GT(count_distinct_desc.bitmap_sz_bits, 0);
87  return count_distinct_desc.device_type == ExecutorDeviceType::GPU
88  ? hll_size(reinterpret_cast<const int32_t*>(set_vals),
89  count_distinct_desc.bitmap_sz_bits)
90  : hll_size(reinterpret_cast<const int8_t*>(set_vals),
91  count_distinct_desc.bitmap_sz_bits);
92  }
93  if (count_distinct_desc.sub_bitmap_count > 1) {
94  partial_bitmap_union(set_vals, count_distinct_desc);
95  }
96  return bitmap_set_size(set_vals, count_distinct_desc.bitmapSizeBytes());
97  }
99  return reinterpret_cast<CountDistinctSet*>(set_handle)->size();
100 }
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:37
CountDistinctImplType impl_type_
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
Definition: HyperLogLog.h:90
#define CHECK_GT(x, y)
Definition: Logger.h:235
void partial_bitmap_union(int8_t *set_vals, const CountDistinctDescriptor &count_distinct_desc)
Definition: CountDistinct.h:63
#define CHECK(condition)
Definition: Logger.h:223
size_t bitmap_set_size(const int8_t *bitmap, const size_t bitmap_byte_sz)
Definition: CountDistinct.h:39

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void count_distinct_set_union ( const int64_t  new_set_handle,
const int64_t  old_set_handle,
const CountDistinctDescriptor new_count_distinct_desc,
const CountDistinctDescriptor old_count_distinct_desc 
)
inline

Definition at line 102 of file CountDistinct.h.

References CountDistinctDescriptor::approximate, Bitmap, bitmap_set_union(), CountDistinctDescriptor::bitmap_sz_bits, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CountDistinctDescriptor::bitmapSizeBytes(), CHECK, CHECK_EQ, CHECK_GE, CPU, CountDistinctDescriptor::device_type, GPU, hll_unify(), CountDistinctDescriptor::impl_type_, CountDistinctDescriptor::sub_bitmap_count, and UnorderedSet.

Referenced by count_distinct_set_union_jit_rt(), and ResultSetStorage::reduceOneCountDistinctSlot().

106  {
107  if (new_count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
108  auto new_set = reinterpret_cast<int8_t*>(new_set_handle);
109  auto old_set = reinterpret_cast<int8_t*>(old_set_handle);
110  if (new_count_distinct_desc.approximate) {
111  CHECK(old_count_distinct_desc.approximate);
112  if (new_count_distinct_desc.device_type == ExecutorDeviceType::GPU &&
113  old_count_distinct_desc.device_type == ExecutorDeviceType::GPU) {
114  hll_unify(reinterpret_cast<int32_t*>(new_set),
115  reinterpret_cast<int32_t*>(old_set),
116  1 << old_count_distinct_desc.bitmap_sz_bits);
117  } else if (new_count_distinct_desc.device_type == ExecutorDeviceType::GPU &&
118  old_count_distinct_desc.device_type == ExecutorDeviceType::CPU) {
119  hll_unify(reinterpret_cast<int32_t*>(new_set),
120  reinterpret_cast<int8_t*>(old_set),
121  1 << old_count_distinct_desc.bitmap_sz_bits);
122  } else if (new_count_distinct_desc.device_type == ExecutorDeviceType::CPU &&
123  old_count_distinct_desc.device_type == ExecutorDeviceType::GPU) {
124  hll_unify(reinterpret_cast<int8_t*>(new_set),
125  reinterpret_cast<int32_t*>(old_set),
126  1 << old_count_distinct_desc.bitmap_sz_bits);
127  } else {
128  CHECK(old_count_distinct_desc.device_type == ExecutorDeviceType::CPU &&
129  new_count_distinct_desc.device_type == ExecutorDeviceType::CPU);
130  hll_unify(reinterpret_cast<int8_t*>(new_set),
131  reinterpret_cast<int8_t*>(old_set),
132  1 << old_count_distinct_desc.bitmap_sz_bits);
133  }
134  } else {
135  CHECK_EQ(new_count_distinct_desc.sub_bitmap_count,
136  old_count_distinct_desc.sub_bitmap_count);
137  CHECK_GE(old_count_distinct_desc.sub_bitmap_count, size_t(1));
138  // NB: For low cardinality input and if the query ran on GPU the bitmap is
139  // composed of multiple padded sub-bitmaps. Treat them as if they are regular
140  // bitmaps and let count_distinct_set_size take care of additional reduction.
141  const auto bitmap_byte_sz = old_count_distinct_desc.sub_bitmap_count == 1
142  ? old_count_distinct_desc.bitmapSizeBytes()
143  : old_count_distinct_desc.bitmapPaddedSizeBytes();
144  bitmap_set_union(new_set, old_set, bitmap_byte_sz);
145  }
146  } else {
147  CHECK(old_count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
148  auto old_set = reinterpret_cast<CountDistinctSet*>(old_set_handle);
149  auto new_set = reinterpret_cast<CountDistinctSet*>(new_set_handle);
150  new_set->insert(old_set->begin(), old_set->end());
151  old_set->insert(new_set->begin(), new_set->end());
152  }
153 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:37
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
Definition: HyperLogLog.h:109
#define CHECK_GE(x, y)
Definition: Logger.h:236
void bitmap_set_union(int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
Definition: CountDistinct.h:56
CountDistinctImplType impl_type_
#define CHECK(condition)
Definition: Logger.h:223

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void partial_bitmap_union ( int8_t *  set_vals,
const CountDistinctDescriptor count_distinct_desc 
)
inline

Definition at line 63 of file CountDistinct.h.

References bitmap_set_union(), CountDistinctDescriptor::bitmapPaddedSizeBytes(), CountDistinctDescriptor::bitmapSizeBytes(), CHECK_EQ, and CountDistinctDescriptor::sub_bitmap_count.

Referenced by count_distinct_set_size().

64  {
65  auto partial_set_vals = set_vals;
66  CHECK_EQ(
67  size_t(0),
68  count_distinct_desc.bitmapPaddedSizeBytes() % count_distinct_desc.sub_bitmap_count);
69  const auto partial_padded_size =
70  count_distinct_desc.bitmapPaddedSizeBytes() / count_distinct_desc.sub_bitmap_count;
71  for (size_t i = 1; i < count_distinct_desc.sub_bitmap_count; ++i) {
72  partial_set_vals += partial_padded_size;
73  bitmap_set_union(set_vals, partial_set_vals, count_distinct_desc.bitmapSizeBytes());
74  }
75 }
#define CHECK_EQ(x, y)
Definition: Logger.h:231
void bitmap_set_union(int8_t *lhs, int8_t *rhs, const size_t bitmap_sz)
Definition: CountDistinct.h:56

+ Here is the call graph for this function:

+ Here is the caller graph for this function: