_hash_join_runtime_8cpp_source.html

 /*

  * Copyright 2022 HEAVY.AI, Inc.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */


 #include "HashJoinRuntime.h"


 #include "QueryEngine/CompareKeysInl.h"

 #include "QueryEngine/HyperLogLogRank.h"

 #include "QueryEngine/JoinHashTable/Runtime/HashJoinKeyHandlers.h"

 #include "QueryEngine/JoinHashTable/Runtime/JoinColumnIterator.h"

 #include "QueryEngine/MurmurHash1Inl.h"

 #include "Shared/shard_key.h"

 #ifdef __CUDACC__

 #include "QueryEngine/DecodersImpl.h"

 #include "QueryEngine/GpuRtConstants.h"

 #include "QueryEngine/JoinHashTable/Runtime/JoinHashImpl.h"

 #else

 #include "Logger/Logger.h"


 #include "QueryEngine/RuntimeFunctions.h"

 #include "Shared/likely.h"

 #include "StringDictionary/StringDictionary.h"

 #include "StringDictionary/StringDictionaryProxy.h"


 #ifdef HAVE_TBB

 #include <tbb/parallel_for.h>

 #endif


 #include <future>

 #endif


 #if HAVE_CUDA

 #include <cuda_runtime.h>

 #include <thrust/scan.h>

 #endif

 #include "Shared/funcannotations.h"


 #include <cmath>

 #include <numeric>


 #ifndef __CUDACC__

 namespace {


 inline int64_t map_str_id_to_outer_dict(const int64_t inner_elem,

                                         const int64_t min_inner_elem,

                                         const int64_t min_outer_elem,

                                         const int64_t max_outer_elem,

                                         const int32_t* inner_to_outer_translation_map) {

   const auto outer_id = inner_to_outer_translation_map[inner_elem - min_inner_elem];

   if (outer_id > max_outer_elem || outer_id < min_outer_elem) {

     return StringDictionary::INVALID_STR_ID;

   }

   return outer_id;

 }


 }  // namespace

 #endif


 DEVICE void SUFFIX(init_hash_join_buff)(int32_t* groups_buffer,

                                         const int64_t hash_entry_count,

                                         const int32_t invalid_slot_val,

                                         const int32_t cpu_thread_idx,

                                         const int32_t cpu_thread_count) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   for (int64_t i = start; i < hash_entry_count; i += step) {

     groups_buffer[i] = invalid_slot_val;

   }

 }


 #ifndef __CUDACC__

 #ifdef HAVE_TBB


 void SUFFIX(init_hash_join_buff_tbb)(int32_t* groups_buffer,

                                      const int64_t hash_entry_count,

                                      const int32_t invalid_slot_val) {

   tbb::parallel_for(tbb::blocked_range<int64_t>(0, hash_entry_count),

                     [=](const tbb::blocked_range<int64_t>& r) {

                       const auto start_idx = r.begin();

                       const auto end_idx = r.end();

                       for (auto entry_idx = start_idx; entry_idx != end_idx;

                            ++entry_idx) {

                         groups_buffer[entry_idx] = invalid_slot_val;

                       }

                     });

 }


 #endif  // #ifdef HAVE_TBB

 #endif  // #ifndef __CUDACC__


 #ifdef __CUDACC__

 #define mapd_cas(address, compare, val) atomicCAS(address, compare, val)

 #elif defined(_MSC_VER)

 #define mapd_cas(address, compare, val)                                 \

   InterlockedCompareExchange(reinterpret_cast<volatile long*>(address), \

                              static_cast<long>(val),                    \

                              static_cast<long>(compare))

 #else

 #define mapd_cas(address, compare, val) __sync_val_compare_and_swap(address, compare, val)

 #endif


 template <typename HASHTABLE_FILLING_FUNC>

 DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args,

                                      const int32_t cpu_thread_idx,

                                      const int32_t cpu_thread_count,

                                      HASHTABLE_FILLING_FUNC filling_func) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   auto const join_column = args.join_column;

   auto const type_info = args.type_info;

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     const size_t index = item.index;

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     auto const sd_inner_to_outer_translation_map = args.sd_inner_to_outer_translation_map;

     auto const min_inner_elem = args.min_inner_elem;

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     if (filling_func(elem, index)) {

       return -1;

     }

   }

   return 0;

 };


 DEVICE int SUFFIX(fill_hash_join_buff_bucketized)(

     OneToOnePerfectJoinHashTableFillFuncArgs const args,

     int32_t const cpu_thread_idx,

     int32_t const cpu_thread_count) {

   auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)

                                          : SUFFIX(fill_one_to_one_hashtable);

   auto hashtable_filling_func = [&](auto elem, size_t index) {

     auto entry_ptr = SUFFIX(get_bucketized_hash_slot)(

         args.buff,

         elem,

         args.type_info.min_val / args.bucket_normalization,

         args.type_info.translated_null_val,

         args.bucket_normalization);

     return filling_func(index, entry_ptr, args.invalid_slot_val);

   };


   return fill_hash_join_buff_impl(

       args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);

 }


 DEVICE int SUFFIX(fill_hash_join_buff_bitwise_eq)(

     OneToOnePerfectJoinHashTableFillFuncArgs const args,

     int32_t const cpu_thread_idx,

     int32_t const cpu_thread_count) {

   auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)

                                          : SUFFIX(fill_one_to_one_hashtable);

   auto hashtable_filling_func = [&](auto elem, size_t index) {

     auto entry_ptr = SUFFIX(get_hash_slot_bitwise_eq)(

         args.buff, elem, args.type_info.min_val, args.type_info.translated_null_val);

     return filling_func(index, entry_ptr, args.invalid_slot_val);

   };


   return fill_hash_join_buff_impl(

       args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);

 }


 DEVICE int SUFFIX(fill_hash_join_buff)(

     OneToOnePerfectJoinHashTableFillFuncArgs const args,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count) {

   auto filling_func = args.for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)

                                          : SUFFIX(fill_one_to_one_hashtable);

   auto hashtable_filling_func = [&](auto elem, size_t index) {

     auto entry_ptr = SUFFIX(get_hash_slot)(args.buff, elem, args.type_info.min_val);

     return filling_func(index, entry_ptr, args.invalid_slot_val);

   };


   return fill_hash_join_buff_impl(

       args, cpu_thread_idx, cpu_thread_count, hashtable_filling_func);

 }


 template <typename HASHTABLE_FILLING_FUNC>

 DEVICE int fill_hash_join_buff_sharded_impl(

     int32_t* buff,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info,

     const ShardInfo shard_info,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count,

     HASHTABLE_FILLING_FUNC filling_func) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     const size_t index = item.index;

     int64_t elem = item.element;

     size_t shard = SHARD_FOR_KEY(elem, shard_info.num_shards);

     if (shard != shard_info.shard) {

       continue;

     }

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     if (filling_func(elem, shard, index)) {

       return -1;

     }

   }

   return 0;

 }


 DEVICE int SUFFIX(fill_hash_join_buff_sharded_bucketized)(

     int32_t* buff,

     const int32_t invalid_slot_val,

     const bool for_semi_join,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info,

     const ShardInfo shard_info,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count,

     const int64_t bucket_normalization) {

   auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)

                                     : SUFFIX(fill_one_to_one_hashtable);

   auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {

     auto entry_ptr = SUFFIX(get_bucketized_hash_slot_sharded_opt)(

         buff,

         elem,

         type_info.min_val / bucket_normalization,

         type_info.translated_null_val,

         shard_info.entry_count_per_shard,

         shard,

         shard_info.num_shards,

         shard_info.device_count,

         bucket_normalization);

     return filling_func(index, entry_ptr, invalid_slot_val);

   };


   return fill_hash_join_buff_sharded_impl(buff,

                                           join_column,

                                           type_info,

                                           shard_info,

                                           sd_inner_to_outer_translation_map,

                                           min_inner_elem,

                                           cpu_thread_idx,

                                           cpu_thread_count,

                                           hashtable_filling_func);

 }


 DEVICE int SUFFIX(fill_hash_join_buff_sharded)(

     int32_t* buff,

     const int32_t invalid_slot_val,

     const bool for_semi_join,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info,

     const ShardInfo shard_info,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count) {

   auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join)

                                     : SUFFIX(fill_one_to_one_hashtable);

   auto hashtable_filling_func = [&](auto elem, auto shard, size_t index) {

     auto entry_ptr = SUFFIX(get_hash_slot_sharded_opt)(buff,

                                                        elem,

                                                        type_info.min_val,

                                                        shard_info.entry_count_per_shard,

                                                        shard,

                                                        shard_info.num_shards,

                                                        shard_info.device_count);

     return filling_func(index, entry_ptr, invalid_slot_val);

   };


   return fill_hash_join_buff_sharded_impl(buff,

                                           join_column,

                                           type_info,

                                           shard_info,

                                           sd_inner_to_outer_translation_map,

                                           min_inner_elem,

                                           cpu_thread_idx,

                                           cpu_thread_count,

                                           hashtable_filling_func);

 }


 template <typename T>

 DEVICE void SUFFIX(init_baseline_hash_join_buff)(int8_t* hash_buff,

                                                  const int64_t entry_count,

                                                  const size_t key_component_count,

                                                  const bool with_val_slot,

                                                  const int32_t invalid_slot_val,

                                                  const int32_t cpu_thread_idx,

                                                  const int32_t cpu_thread_count) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   auto hash_entry_size = (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);

   const T empty_key = SUFFIX(get_invalid_key)<T>();

   for (int64_t h = start; h < entry_count; h += step) {

     int64_t off = h * hash_entry_size;

     auto row_ptr = reinterpret_cast<T*>(hash_buff + off);

     for (size_t i = 0; i < key_component_count; ++i) {

       row_ptr[i] = empty_key;

     }

     if (with_val_slot) {

       row_ptr[key_component_count] = invalid_slot_val;

     }

   }

 }


 #ifndef __CUDACC__

 #ifdef HAVE_TBB


 template <typename T>

 DEVICE void SUFFIX(init_baseline_hash_join_buff_tbb)(int8_t* hash_buff,

                                                      const int64_t entry_count,

                                                      const size_t key_component_count,

                                                      const bool with_val_slot,

                                                      const int32_t invalid_slot_val) {

   const auto hash_entry_size =

       (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);

   const T empty_key = SUFFIX(get_invalid_key)<T>();

   tbb::parallel_for(tbb::blocked_range<int64_t>(0, entry_count),

                     [=](const tbb::blocked_range<int64_t>& r) {

                       const auto start_idx = r.begin();

                       const auto end_idx = r.end();

                       for (int64_t entry_idx = start_idx; entry_idx < end_idx;

                            ++entry_idx) {

                         const int64_t offset = entry_idx * hash_entry_size;

                         auto row_ptr = reinterpret_cast<T*>(hash_buff + offset);

                         for (size_t k = 0; k < key_component_count; ++k) {

                           row_ptr[k] = empty_key;

                         }

                         if (with_val_slot) {

                           row_ptr[key_component_count] = invalid_slot_val;

                         }

                       }

                     });

 }


 #endif  // #ifdef HAVE_TBB

 #endif  // #ifndef __CUDACC__


 #ifdef __CUDACC__

 template <typename T>

 __device__ T* get_matching_baseline_hash_slot_at(int8_t* hash_buff,

                                                  const uint32_t h,

                                                  const T* key,

                                                  const size_t key_component_count,

                                                  const int64_t hash_entry_size) {

   uint32_t off = h * hash_entry_size;

   auto row_ptr = reinterpret_cast<T*>(hash_buff + off);

   const T empty_key = SUFFIX(get_invalid_key)<T>();

   {

     const T old = atomicCAS(row_ptr, empty_key, *key);

     if (empty_key == old && key_component_count > 1) {

       for (int64_t i = 1; i <= key_component_count - 1; ++i) {

         atomicExch(row_ptr + i, key[i]);

       }

     }

   }

   if (key_component_count > 1) {

     while (atomicAdd(row_ptr + key_component_count - 1, 0) == empty_key) {

       // spin until the winning thread has finished writing the entire key and the init

       // value

     }

   }

   bool match = true;

   for (uint32_t i = 0; i < key_component_count; ++i) {

     if (row_ptr[i] != key[i]) {

       match = false;

       break;

     }

   }


   if (match) {

     return reinterpret_cast<T*>(row_ptr + key_component_count);

   }

   return nullptr;

 }

 #else


 #ifdef _MSC_VER

 #define cas_cst(ptr, expected, desired)                                      \

   (InterlockedCompareExchangePointer(reinterpret_cast<void* volatile*>(ptr), \

                                      reinterpret_cast<void*>(&desired),      \

                                      expected) == expected)

 #define store_cst(ptr, val)                                          \

   InterlockedExchangePointer(reinterpret_cast<void* volatile*>(ptr), \

                              reinterpret_cast<void*>(val))

 #define load_cst(ptr) \

   InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), 0, 0)

 #else

 #define cas_cst(ptr, expected, desired) \

   __atomic_compare_exchange_n(          \

       ptr, expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

 #define store_cst(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)

 #define load_cst(ptr) __atomic_load_n(ptr, __ATOMIC_SEQ_CST)

 #endif


 template <typename T>

 T* get_matching_baseline_hash_slot_at(int8_t* hash_buff,

                                       const uint32_t h,

                                       const T* key,

                                       const size_t key_component_count,

                                       const int64_t hash_entry_size) {

   uint32_t off = h * hash_entry_size;

   auto row_ptr = reinterpret_cast<T*>(hash_buff + off);

   T empty_key = SUFFIX(get_invalid_key)<T>();

   T write_pending = SUFFIX(get_invalid_key)<T>() - 1;

   if (UNLIKELY(*key == write_pending)) {

     // Address the singularity case where the first column contains the pending

     // write special value. Should never happen, but avoid doing wrong things.

     return nullptr;

   }

   const bool success = cas_cst(row_ptr, &empty_key, write_pending);

   if (success) {

     if (key_component_count > 1) {

       memcpy(row_ptr + 1, key + 1, (key_component_count - 1) * sizeof(T));

     }

     store_cst(row_ptr, *key);

     return reinterpret_cast<T*>(row_ptr + key_component_count);

   }

   while (load_cst(row_ptr) == write_pending) {

     // spin until the winning thread has finished writing the entire key

   }

   for (size_t i = 0; i < key_component_count; ++i) {

     if (load_cst(row_ptr + i) != key[i]) {

       return nullptr;

     }

   }

   return reinterpret_cast<T*>(row_ptr + key_component_count);

 }


 #undef load_cst

 #undef store_cst

 #undef cas_cst


 #endif  // __CUDACC__


 template <typename T>

 DEVICE int write_baseline_hash_slot(const int32_t val,

                                     int8_t* hash_buff,

                                     const int64_t entry_count,

                                     const T* key,

                                     const size_t key_component_count,

                                     const bool with_val_slot,

                                     const int32_t invalid_slot_val,

                                     const size_t key_size_in_bytes,

                                     const size_t hash_entry_size) {

   const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;

   T* matching_group = get_matching_baseline_hash_slot_at(

       hash_buff, h, key, key_component_count, hash_entry_size);

   if (!matching_group) {

     uint32_t h_probe = (h + 1) % entry_count;

     while (h_probe != h) {

       matching_group = get_matching_baseline_hash_slot_at(

           hash_buff, h_probe, key, key_component_count, hash_entry_size);

       if (matching_group) {

         break;

       }

       h_probe = (h_probe + 1) % entry_count;

     }

   }

   if (!matching_group) {

     return -2;

   }

   if (!with_val_slot) {

     return 0;

   }

   if (mapd_cas(matching_group, invalid_slot_val, val) != invalid_slot_val) {

     return -1;

   }

   return 0;

 }


 template <typename T>

 DEVICE int write_baseline_hash_slot_for_semi_join(const int32_t val,

                                                   int8_t* hash_buff,

                                                   const int64_t entry_count,

                                                   const T* key,

                                                   const size_t key_component_count,

                                                   const bool with_val_slot,

                                                   const int32_t invalid_slot_val,

                                                   const size_t key_size_in_bytes,

                                                   const size_t hash_entry_size) {

   const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;

   T* matching_group = get_matching_baseline_hash_slot_at(

       hash_buff, h, key, key_component_count, hash_entry_size);

   if (!matching_group) {

     uint32_t h_probe = (h + 1) % entry_count;

     while (h_probe != h) {

       matching_group = get_matching_baseline_hash_slot_at(

           hash_buff, h_probe, key, key_component_count, hash_entry_size);

       if (matching_group) {

         break;

       }

       h_probe = (h_probe + 1) % entry_count;

     }

   }

   if (!matching_group) {

     return -2;

   }

   if (!with_val_slot) {

     return 0;

   }

   mapd_cas(matching_group, invalid_slot_val, val);

   return 0;

 }


 template <typename T, typename FILL_HANDLER>

 DEVICE int SUFFIX(fill_baseline_hash_join_buff)(int8_t* hash_buff,

                                                 const int64_t entry_count,

                                                 const int32_t invalid_slot_val,

                                                 const bool for_semi_join,

                                                 const size_t key_component_count,

                                                 const bool with_val_slot,

                                                 const FILL_HANDLER* f,

                                                 const int64_t num_elems,

                                                 const int32_t cpu_thread_idx,

                                                 const int32_t cpu_thread_count) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif


   T key_scratch_buff[g_maximum_conditions_to_coalesce];

   const size_t key_size_in_bytes = key_component_count * sizeof(T);

   const size_t hash_entry_size =

       (key_component_count + (with_val_slot ? 1 : 0)) * sizeof(T);

   auto key_buff_handler = [hash_buff,

                            entry_count,

                            with_val_slot,

                            invalid_slot_val,

                            key_size_in_bytes,

                            hash_entry_size,

                            &for_semi_join](const int64_t entry_idx,

                                            const T* key_scratch_buffer,

                                            const size_t key_component_count) {

     if (for_semi_join) {

       return write_baseline_hash_slot_for_semi_join<T>(entry_idx,

                                                        hash_buff,

                                                        entry_count,

                                                        key_scratch_buffer,

                                                        key_component_count,

                                                        with_val_slot,

                                                        invalid_slot_val,

                                                        key_size_in_bytes,

                                                        hash_entry_size);

     } else {

       return write_baseline_hash_slot<T>(entry_idx,

                                          hash_buff,

                                          entry_count,

                                          key_scratch_buffer,

                                          key_component_count,

                                          with_val_slot,

                                          invalid_slot_val,

                                          key_size_in_bytes,

                                          hash_entry_size);

     }

   };


   JoinColumnTuple cols(

       f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());

   for (auto& it : cols.slice(start, step)) {

     const auto err = (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);

     if (err) {

       return err;

     }

   }

   return 0;

 }


 #undef mapd_cas


 #ifdef __CUDACC__

 #define mapd_add(address, val) atomicAdd(address, val)

 #elif defined(_MSC_VER)

 #define mapd_add(address, val)                                      \

   InterlockedExchangeAdd(reinterpret_cast<volatile long*>(address), \

                          static_cast<long>(val))

 #else

 #define mapd_add(address, val) __sync_fetch_and_add(address, val)

 #endif


 template <typename SLOT_SELECTOR>

 DEVICE void count_matches_impl(int32_t* count_buff,

                                const JoinColumn join_column,

                                const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

                                ,

                                const int32_t* sd_inner_to_outer_translation_map,

                                const int32_t min_inner_elem,

                                const int32_t cpu_thread_idx,

                                const int32_t cpu_thread_count

 #endif

                                ,

                                SLOT_SELECTOR slot_selector) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     auto* entry_ptr = slot_selector(count_buff, elem);

     mapd_add(entry_ptr, int32_t(1));

   }

 }


 GLOBAL void SUFFIX(count_matches)(int32_t* count_buff,

                                   const JoinColumn join_column,

                                   const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

                                   ,

                                   const int32_t* sd_inner_to_outer_translation_map,

                                   const int32_t min_inner_elem,

                                   const int32_t cpu_thread_idx,

                                   const int32_t cpu_thread_count

 #endif

 ) {

   auto slot_sel = [&type_info](auto count_buff, auto elem) {

     return SUFFIX(get_hash_slot)(count_buff, elem, type_info.min_val);

   };

   count_matches_impl(count_buff,

                      join_column,

                      type_info

 #ifndef __CUDACC__

                      ,

                      sd_inner_to_outer_translation_map,

                      min_inner_elem,

                      cpu_thread_idx,

                      cpu_thread_count

 #endif

                      ,

                      slot_sel);

 }


 GLOBAL void SUFFIX(count_matches_bucketized)(

     int32_t* count_buff,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

     ,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count

 #endif

     ,

     const int64_t bucket_normalization) {

   auto slot_sel = [bucket_normalization, &type_info](auto count_buff, auto elem) {

     return SUFFIX(get_bucketized_hash_slot)(count_buff,

                                             elem,

                                             type_info.min_val / bucket_normalization,

                                             type_info.translated_null_val,

                                             bucket_normalization);

   };

   count_matches_impl(count_buff,

                      join_column,

                      type_info

 #ifndef __CUDACC__

                      ,

                      sd_inner_to_outer_translation_map,

                      min_inner_elem,

                      cpu_thread_idx,

                      cpu_thread_count

 #endif

                      ,

                      slot_sel);

 }


 GLOBAL void SUFFIX(count_matches_sharded)(

     int32_t* count_buff,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info,

     const ShardInfo shard_info

 #ifndef __CUDACC__

     ,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count

 #endif

 ) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     int32_t* entry_ptr = SUFFIX(get_hash_slot_sharded)(count_buff,

                                                        elem,

                                                        type_info.min_val,

                                                        shard_info.entry_count_per_shard,

                                                        shard_info.num_shards,

                                                        shard_info.device_count);

     mapd_add(entry_ptr, int32_t(1));

   }

 }


 template <typename T>

 DEVICE NEVER_INLINE const T* SUFFIX(get_matching_baseline_hash_slot_readonly)(

     const T* key,

     const size_t key_component_count,

     const T* composite_key_dict,

     const int64_t entry_count,

     const size_t key_size_in_bytes) {

   const uint32_t h = MurmurHash1Impl(key, key_size_in_bytes, 0) % entry_count;

   uint32_t off = h * key_component_count;

   if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {

     return &composite_key_dict[off];

   }

   uint32_t h_probe = (h + 1) % entry_count;

   while (h_probe != h) {

     off = h_probe * key_component_count;

     if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {

       return &composite_key_dict[off];

     }

     h_probe = (h_probe + 1) % entry_count;

   }

 #ifndef __CUDACC__

   CHECK(false);

 #else

   assert(false);

 #endif

   return nullptr;

 }


 template <typename T, typename KEY_HANDLER>

 GLOBAL void SUFFIX(count_matches_baseline)(int32_t* count_buff,

                                            const T* composite_key_dict,

                                            const int64_t entry_count,

                                            const KEY_HANDLER* f,

                                            const int64_t num_elems

 #ifndef __CUDACC__

                                            ,

                                            const int32_t cpu_thread_idx,

                                            const int32_t cpu_thread_count

 #endif

 ) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

 #ifdef __CUDACC__

   assert(composite_key_dict);

 #endif

   T key_scratch_buff[g_maximum_conditions_to_coalesce];

   const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);

   auto key_buff_handler = [composite_key_dict,

                            entry_count,

                            count_buff,

                            key_size_in_bytes](const int64_t row_entry_idx,

                                               const T* key_scratch_buff,

                                               const size_t key_component_count) {

     const auto matching_group =

         SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,

                                                          key_component_count,

                                                          composite_key_dict,

                                                          entry_count,

                                                          key_size_in_bytes);

     const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;

     mapd_add(&count_buff[entry_idx], int32_t(1));

     return 0;

   };


   JoinColumnTuple cols(

       f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());

   for (auto& it : cols.slice(start, step)) {

     (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);

   }

 }


 template <typename SLOT_SELECTOR>

 DEVICE void fill_row_ids_impl(int32_t* buff,

                               const int64_t hash_entry_count,

                               const JoinColumn join_column,

                               const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

                               ,

                               const int32_t* sd_inner_to_outer_translation_map,

                               const int32_t min_inner_elem,

                               const int32_t cpu_thread_idx,

                               const int32_t cpu_thread_count

 #endif

                               ,

                               SLOT_SELECTOR slot_selector) {

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   int32_t* id_buff = count_buff + hash_entry_count;

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     const size_t index = item.index;

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     auto pos_ptr = slot_selector(pos_buff, elem);

     const auto bin_idx = pos_ptr - pos_buff;

     const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;

     id_buff[id_buff_idx] = static_cast<int32_t>(index);

   }

 }


 template <typename SLOT_SELECTOR>

 DEVICE void fill_row_ids_for_window_framing_impl(

     int32_t* buff,

     const int64_t hash_entry_count,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

     ,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count

 #endif

     ,

     SLOT_SELECTOR slot_selector) {

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   int32_t* id_buff = count_buff + hash_entry_count;

   int32_t* reversed_id_buff = id_buff + join_column.num_elems;

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;


 #endif

   if (join_column.num_elems == 0) {

     return;

   }

   JoinColumnTyped col{&join_column, &type_info};

   bool all_nulls = hash_entry_count == 1 && type_info.min_val == 0 &&

                    type_info.max_val == -1 &&

                    (*col.begin()).element == type_info.null_val;

   if (all_nulls) {

     int32_t thread_idx = -1;

 #ifdef __CUDACC__

     thread_idx = threadIdx.x;

 #else

     thread_idx = cpu_thread_idx;

 #endif

     if (thread_idx == 0) {

       pos_buff[0] = 0;

       count_buff[0] = join_column.num_elems - 1;

       for (size_t i = 0; i < join_column.num_elems; i++) {

         reversed_id_buff[i] = i;

       }

     }

     return;

   }

   for (auto item : col.slice(start, step)) {

     const size_t index = item.index;

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       elem = type_info.translated_null_val;

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map && elem != type_info.translated_null_val) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     auto pos_ptr = slot_selector(pos_buff, elem);

     const auto bin_idx = pos_ptr - pos_buff;

     auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;

     id_buff[id_buff_idx] = static_cast<int32_t>(index);

     reversed_id_buff[index] = id_buff_idx;

   }

 }


 GLOBAL void SUFFIX(fill_row_ids)(int32_t* buff,

                                  const int64_t hash_entry_count,

                                  const JoinColumn join_column,

                                  const JoinColumnTypeInfo type_info,

                                  const bool for_window_framing

 #ifndef __CUDACC__

                                  ,

                                  const int32_t* sd_inner_to_outer_translation_map,

                                  const int32_t min_inner_elem,

                                  const int32_t cpu_thread_idx,

                                  const int32_t cpu_thread_count

 #endif

 ) {

   auto slot_sel = [&type_info](auto pos_buff, auto elem) {

     return SUFFIX(get_hash_slot)(pos_buff, elem, type_info.min_val);

   };


   if (!for_window_framing) {

     fill_row_ids_impl(buff,

                       hash_entry_count,

                       join_column,

                       type_info

 #ifndef __CUDACC__

                       ,

                       sd_inner_to_outer_translation_map,

                       min_inner_elem,

                       cpu_thread_idx,

                       cpu_thread_count

 #endif

                       ,

                       slot_sel);

   } else {

     fill_row_ids_for_window_framing_impl(buff,

                                          hash_entry_count,

                                          join_column,

                                          type_info

 #ifndef __CUDACC__

                                          ,

                                          sd_inner_to_outer_translation_map,

                                          min_inner_elem,

                                          cpu_thread_idx,

                                          cpu_thread_count

 #endif

                                          ,

                                          slot_sel);

   }

 }


 GLOBAL void SUFFIX(fill_row_ids_bucketized)(

     int32_t* buff,

     const int64_t hash_entry_count,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info

 #ifndef __CUDACC__

     ,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count

 #endif

     ,

     const int64_t bucket_normalization) {

   auto slot_sel = [&type_info, bucket_normalization](auto pos_buff, auto elem) {

     return SUFFIX(get_bucketized_hash_slot)(pos_buff,

                                             elem,

                                             type_info.min_val / bucket_normalization,

                                             type_info.translated_null_val,

                                             bucket_normalization);

   };


   fill_row_ids_impl(buff,

                     hash_entry_count,

                     join_column,

                     type_info

 #ifndef __CUDACC__

                     ,

                     sd_inner_to_outer_translation_map,

                     min_inner_elem,

                     cpu_thread_idx,

                     cpu_thread_count

 #endif

                     ,

                     slot_sel);

 }


 template <typename SLOT_SELECTOR>

 DEVICE void fill_row_ids_sharded_impl(int32_t* buff,

                                       const int64_t hash_entry_count,

                                       const JoinColumn join_column,

                                       const JoinColumnTypeInfo type_info,

                                       const ShardInfo shard_info

 #ifndef __CUDACC__

                                       ,

                                       const int32_t* sd_inner_to_outer_translation_map,

                                       const int32_t min_inner_elem,

                                       const int32_t cpu_thread_idx,

                                       const int32_t cpu_thread_count

 #endif

                                       ,

                                       SLOT_SELECTOR slot_selector) {


   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   int32_t* id_buff = count_buff + hash_entry_count;


 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnTyped col{&join_column, &type_info};

   for (auto item : col.slice(start, step)) {

     const size_t index = item.index;

     int64_t elem = item.element;

     if (elem == type_info.null_val) {

       if (type_info.uses_bw_eq) {

         elem = type_info.translated_null_val;

       } else {

         continue;

       }

     }

 #ifndef __CUDACC__

     if (sd_inner_to_outer_translation_map &&

         (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) {

       const auto outer_id = map_str_id_to_outer_dict(elem,

                                                      min_inner_elem,

                                                      type_info.min_val,

                                                      type_info.max_val,

                                                      sd_inner_to_outer_translation_map);

       if (outer_id == StringDictionary::INVALID_STR_ID) {

         continue;

       }

       elem = outer_id;

     }

 #endif

     auto* pos_ptr = slot_selector(pos_buff, elem);

     const auto bin_idx = pos_ptr - pos_buff;

     const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;

     id_buff[id_buff_idx] = static_cast<int32_t>(index);

   }

 }


 GLOBAL void SUFFIX(fill_row_ids_sharded)(int32_t* buff,

                                          const int64_t hash_entry_count,

                                          const JoinColumn join_column,

                                          const JoinColumnTypeInfo type_info,

                                          const ShardInfo shard_info

 #ifndef __CUDACC__

                                          ,

                                          const int32_t* sd_inner_to_outer_translation_map,

                                          const int32_t min_inner_elem,

                                          const int32_t cpu_thread_idx,

                                          const int32_t cpu_thread_count

 #endif

 ) {

   auto slot_sel = [&type_info, &shard_info](auto pos_buff, auto elem) {

     return SUFFIX(get_hash_slot_sharded)(pos_buff,

                                          elem,

                                          type_info.min_val,

                                          shard_info.entry_count_per_shard,

                                          shard_info.num_shards,

                                          shard_info.device_count);

   };

   fill_row_ids_impl(buff,

                     hash_entry_count,

                     join_column,

                     type_info

 #ifndef __CUDACC__

                     ,

                     sd_inner_to_outer_translation_map,

                     min_inner_elem,

                     cpu_thread_idx,

                     cpu_thread_count

 #endif

                     ,

                     slot_sel);

 }


 GLOBAL void SUFFIX(fill_row_ids_sharded_bucketized)(

     int32_t* buff,

     const int64_t hash_entry_count,

     const JoinColumn join_column,

     const JoinColumnTypeInfo type_info,

     const ShardInfo shard_info

 #ifndef __CUDACC__

     ,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count

 #endif

     ,

     const int64_t bucket_normalization) {

   auto slot_sel = [&shard_info, &type_info, bucket_normalization](auto pos_buff,

                                                                   auto elem) {

     return SUFFIX(get_bucketized_hash_slot_sharded)(

         pos_buff,

         elem,

         type_info.min_val / bucket_normalization,

         type_info.translated_null_val,

         shard_info.entry_count_per_shard,

         shard_info.num_shards,

         shard_info.device_count,

         bucket_normalization);

   };


   fill_row_ids_impl(buff,

                     hash_entry_count,

                     join_column,

                     type_info

 #ifndef __CUDACC__

                     ,

                     sd_inner_to_outer_translation_map,

                     min_inner_elem,

                     cpu_thread_idx,

                     cpu_thread_count

 #endif

                     ,

                     slot_sel);

 }


 template <typename T, typename KEY_HANDLER>

 GLOBAL void SUFFIX(fill_row_ids_baseline)(int32_t* buff,

                                           const T* composite_key_dict,

                                           const int64_t hash_entry_count,

                                           const KEY_HANDLER* f,

                                           const int64_t num_elems,

                                           const bool for_window_framing

 #ifndef __CUDACC__

                                           ,

                                           const int32_t cpu_thread_idx,

                                           const int32_t cpu_thread_count

 #endif

 ) {

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   int32_t* id_buff = count_buff + hash_entry_count;

   int32_t* reversed_id_buff = for_window_framing ? id_buff + num_elems : nullptr;

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif


   T key_scratch_buff[g_maximum_conditions_to_coalesce];

 #ifdef __CUDACC__

   assert(composite_key_dict);

 #endif

   const size_t key_size_in_bytes = f->get_key_component_count() * sizeof(T);

   auto key_buff_handler = [composite_key_dict,

                            hash_entry_count,

                            pos_buff,

                            count_buff,

                            id_buff,

                            reversed_id_buff,

                            key_size_in_bytes,

                            for_window_framing](const int64_t row_index,

                                                const T* key_scratch_buff,

                                                const size_t key_component_count) {

     const T* matching_group =

         SUFFIX(get_matching_baseline_hash_slot_readonly)(key_scratch_buff,

                                                          key_component_count,

                                                          composite_key_dict,

                                                          hash_entry_count,

                                                          key_size_in_bytes);

     const auto entry_idx = (matching_group - composite_key_dict) / key_component_count;

     int32_t* pos_ptr = pos_buff + entry_idx;

     const auto bin_idx = pos_ptr - pos_buff;

     const auto id_buff_idx = mapd_add(count_buff + bin_idx, 1) + *pos_ptr;

     id_buff[id_buff_idx] = static_cast<int32_t>(row_index);

     if (for_window_framing) {

       reversed_id_buff[row_index] = id_buff_idx;

     }

     return 0;

   };


   JoinColumnTuple cols(

       f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());

   for (auto& it : cols.slice(start, step)) {

     (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);

   }

   return;

 }


 #undef mapd_add


 template <typename KEY_HANDLER>

 GLOBAL void SUFFIX(approximate_distinct_tuples_impl)(uint8_t* hll_buffer,

                                                      int32_t* row_count_buffer,

                                                      const uint32_t b,

                                                      const int64_t num_elems,

                                                      const KEY_HANDLER* f

 #ifndef __CUDACC__

                                                      ,

                                                      const int32_t cpu_thread_idx,

                                                      const int32_t cpu_thread_count

 #endif

 ) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif


   auto key_buff_handler = [b, hll_buffer, row_count_buffer](

                               const int64_t entry_idx,

                               const int64_t* key_scratch_buff,

                               const size_t key_component_count) {

     if (row_count_buffer) {

       row_count_buffer[entry_idx] += 1;

     }


     const uint64_t hash =

         MurmurHash64AImpl(key_scratch_buff, key_component_count * sizeof(int64_t), 0);

     const uint32_t index = hash >> (64 - b);

     const auto rank = get_rank(hash << b, 64 - b);

 #ifdef __CUDACC__

     atomicMax(reinterpret_cast<int32_t*>(hll_buffer) + index, rank);

 #else

     hll_buffer[index] = std::max(hll_buffer[index], rank);

 #endif


     return 0;

   };


   int64_t key_scratch_buff[g_maximum_conditions_to_coalesce];


   JoinColumnTuple cols(

       f->get_number_of_columns(), f->get_join_columns(), f->get_join_column_type_infos());

   for (auto& it : cols.slice(start, step)) {

     (*f)(it.join_column_iterators, key_scratch_buff, key_buff_handler);

   }

 }


 #ifdef __CUDACC__

 namespace {

 // TODO(adb): put these in a header file so they are not duplicated between here and

 // cuda_mapd_rt.cu

 __device__ double atomicMin(double* address, double val) {

   unsigned long long int* address_as_ull = (unsigned long long int*)address;

   unsigned long long int old = *address_as_ull, assumed;


   do {

     assumed = old;

     old = atomicCAS(address_as_ull,

                     assumed,

                     __double_as_longlong(min(val, __longlong_as_double(assumed))));

   } while (assumed != old);


   return __longlong_as_double(old);

 }

 }  // namespace

 #endif


 template <size_t N>

 GLOBAL void SUFFIX(compute_bucket_sizes_impl)(double* bucket_sizes_for_thread,

                                               const JoinColumn* join_column,

                                               const JoinColumnTypeInfo* type_info,

                                               const double* bucket_size_thresholds

 #ifndef __CUDACC__

                                               ,

                                               const int32_t cpu_thread_idx,

                                               const int32_t cpu_thread_count

 #endif

 ) {

 #ifdef __CUDACC__

   int32_t start = threadIdx.x + blockDim.x * blockIdx.x;

   int32_t step = blockDim.x * gridDim.x;

 #else

   int32_t start = cpu_thread_idx;

   int32_t step = cpu_thread_count;

 #endif

   JoinColumnIterator it(join_column, type_info, start, step);

   for (; it; ++it) {

     // We expect the bounds column to be (min, max) e.g. (x_min, y_min, x_max, y_max)

     double bounds[2 * N];

     for (size_t j = 0; j < 2 * N; j++) {

       bounds[j] = SUFFIX(fixed_width_double_decode_noinline)(it.ptr(), j);

     }


     for (size_t j = 0; j < N; j++) {

       const auto diff = bounds[j + N] - bounds[j];

 #ifdef __CUDACC__

       if (diff > bucket_size_thresholds[j]) {

         atomicMin(&bucket_sizes_for_thread[j], diff);

       }

 #else

       if (diff < bucket_size_thresholds[j] && diff > bucket_sizes_for_thread[j]) {

         bucket_sizes_for_thread[j] = diff;

       }

 #endif

     }

   }

 }


 #ifndef __CUDACC__


 template <typename InputIterator, typename OutputIterator>

 void inclusive_scan(InputIterator first,

                     InputIterator last,

                     OutputIterator out,

                     const size_t thread_count) {

   using ElementType = typename InputIterator::value_type;

   using OffsetType = typename InputIterator::difference_type;

   const OffsetType elem_count = last - first;

   if (elem_count < 10000 || thread_count <= 1) {

     ElementType sum = 0;

     for (auto iter = first; iter != last; ++iter, ++out) {

       *out = sum += *iter;

     }

     return;

   }


   const OffsetType step = (elem_count + thread_count - 1) / thread_count;

   OffsetType start_off = 0;

   OffsetType end_off = std::min(step, elem_count);

   std::vector<ElementType> partial_sums(thread_count);

   std::vector<std::future<void>> counter_threads;

   for (size_t thread_idx = 0; thread_idx < thread_count; ++thread_idx,

               start_off = std::min(start_off + step, elem_count),

               end_off = std::min(start_off + step, elem_count)) {

     counter_threads.push_back(std::async(

         std::launch::async,

         [first, out](

             ElementType& partial_sum, const OffsetType start, const OffsetType end) {

           ElementType sum = 0;

           for (auto in_iter = first + start, out_iter = out + start;

                in_iter != (first + end);

                ++in_iter, ++out_iter) {

             *out_iter = sum += *in_iter;

           }

           partial_sum = sum;

         },

         std::ref(partial_sums[thread_idx]),

         start_off,

         end_off));

   }

   for (auto& child : counter_threads) {

     child.get();

   }


   ElementType sum = 0;

   for (auto& s : partial_sums) {

     s += sum;

     sum = s;

   }


   counter_threads.clear();

   start_off = std::min(step, elem_count);

   end_off = std::min(start_off + step, elem_count);

   for (size_t thread_idx = 0; thread_idx < thread_count - 1; ++thread_idx,

               start_off = std::min(start_off + step, elem_count),

               end_off = std::min(start_off + step, elem_count)) {

     counter_threads.push_back(std::async(

         std::launch::async,

         [out](const ElementType prev_sum, const OffsetType start, const OffsetType end) {

           for (auto iter = out + start; iter != (out + end); ++iter) {

             *iter += prev_sum;

           }

         },

         partial_sums[thread_idx],

         start_off,

         end_off));

   }

   for (auto& child : counter_threads) {

     child.get();

   }

 }


 template <typename COUNT_MATCHES_LAUNCH_FUNCTOR, typename FILL_ROW_IDS_LAUNCH_FUNCTOR>

 void fill_one_to_many_hash_table_impl(int32_t* buff,

                                       const int64_t hash_entry_count,

                                       const JoinColumn& join_column,

                                       const JoinColumnTypeInfo& type_info,

                                       const int32_t* sd_inner_to_outer_translation_map,

                                       const int32_t min_inner_elem,

                                       const int32_t cpu_thread_count,

                                       const bool for_window_framing,

                                       COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func,

                                       FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func) {

   auto timer = DEBUG_TIMER(__func__);

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> counter_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     counter_threads.push_back(std::async(

         std::launch::async, count_matches_func, cpu_thread_idx, cpu_thread_count));

   }


   for (auto& child : counter_threads) {

     child.get();

   }


   std::vector<int32_t> count_copy(hash_entry_count, 0);

   CHECK_GT(hash_entry_count, int64_t(0));

   memcpy(count_copy.data() + 1, count_buff, (hash_entry_count - 1) * sizeof(int32_t));

 #if HAVE_CUDA

   thrust::inclusive_scan(count_copy.begin(), count_copy.end(), count_copy.begin());

 #else

   ::inclusive_scan(

       count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);

 #endif

   std::vector<std::future<void>> pos_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     pos_threads.push_back(std::async(

         std::launch::async,

         [&](size_t thread_idx) {

           for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {

             if (count_buff[i]) {

               pos_buff[i] = count_copy[i];

             }

           }

         },

         cpu_thread_idx));

   }

   for (auto& child : pos_threads) {

     child.get();

   }

   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> rowid_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     rowid_threads.push_back(std::async(

         std::launch::async, fill_row_ids_func, cpu_thread_idx, cpu_thread_count));

   }


   for (auto& child : rowid_threads) {

     child.get();

   }

 }


 void fill_one_to_many_hash_table(OneToManyPerfectJoinHashTableFillFuncArgs const args,

                                  const int32_t cpu_thread_count) {

   auto timer = DEBUG_TIMER(__func__);

   auto const buff = args.buff;

   auto const hash_entry_info = args.hash_entry_info;

   auto launch_count_matches = [count_buff =

                                    buff + hash_entry_info.bucketized_hash_entry_count,

                                &args](auto cpu_thread_idx, auto cpu_thread_count) {

     SUFFIX(count_matches)

     (count_buff,

      args.join_column,

      args.type_info,

      args.sd_inner_to_outer_translation_map,

      args.min_inner_elem,

      cpu_thread_idx,

      cpu_thread_count);

   };

   auto launch_fill_row_ids =

       [hash_entry_count = hash_entry_info.bucketized_hash_entry_count, buff, args](

           auto cpu_thread_idx, auto cpu_thread_count) {

         SUFFIX(fill_row_ids)

         (buff,

          hash_entry_count,

          args.join_column,

          args.type_info,

          args.for_window_framing,

          args.sd_inner_to_outer_translation_map,

          args.min_inner_elem,

          cpu_thread_idx,

          cpu_thread_count);

       };


   fill_one_to_many_hash_table_impl(buff,

                                    hash_entry_info.bucketized_hash_entry_count,

                                    args.join_column,

                                    args.type_info,

                                    args.sd_inner_to_outer_translation_map,

                                    args.min_inner_elem,

                                    cpu_thread_count,

                                    args.for_window_framing,

                                    launch_count_matches,

                                    launch_fill_row_ids);

 }


 void fill_one_to_many_hash_table_bucketized(

     OneToManyPerfectJoinHashTableFillFuncArgs const args,

     const int32_t cpu_thread_count) {

   auto timer = DEBUG_TIMER(__func__);

   auto const buff = args.buff;

   auto const hash_entry_info = args.hash_entry_info;

   auto bucket_normalization = hash_entry_info.bucket_normalization;

   auto hash_entry_count = hash_entry_info.getNormalizedHashEntryCount();

   auto launch_count_matches = [bucket_normalization,

                                count_buff = buff + hash_entry_count,

                                &args](auto cpu_thread_idx, auto cpu_thread_count) {

     SUFFIX(count_matches_bucketized)

     (count_buff,

      args.join_column,

      args.type_info,

      args.sd_inner_to_outer_translation_map,

      args.min_inner_elem,

      cpu_thread_idx,

      cpu_thread_count,

      bucket_normalization);

   };

   auto launch_fill_row_ids = [bucket_normalization, hash_entry_count, buff, args](

                                  auto cpu_thread_idx, auto cpu_thread_count) {

     SUFFIX(fill_row_ids_bucketized)

     (buff,

      hash_entry_count,

      args.join_column,

      args.type_info,

      args.sd_inner_to_outer_translation_map,

      args.min_inner_elem,

      cpu_thread_idx,

      cpu_thread_count,

      bucket_normalization);

   };


   fill_one_to_many_hash_table_impl(buff,

                                    hash_entry_count,

                                    args.join_column,

                                    args.type_info,

                                    args.sd_inner_to_outer_translation_map,

                                    args.min_inner_elem,

                                    cpu_thread_count,

                                    false,

                                    launch_count_matches,

                                    launch_fill_row_ids);

 }


 template <typename COUNT_MATCHES_LAUNCH_FUNCTOR, typename FILL_ROW_IDS_LAUNCH_FUNCTOR>

 void fill_one_to_many_hash_table_sharded_impl(

     int32_t* buff,

     const int64_t hash_entry_count,

     const JoinColumn& join_column,

     const JoinColumnTypeInfo& type_info,

     const ShardInfo& shard_info,

     const int32_t* sd_inner_to_outer_translation_map,

     const int32_t min_inner_elem,

     const int32_t cpu_thread_count,

     COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher,

     FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher) {

   auto timer = DEBUG_TIMER(__func__);

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> counter_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     counter_threads.push_back(std::async(

         std::launch::async, count_matches_launcher, cpu_thread_idx, cpu_thread_count));

   }


   for (auto& child : counter_threads) {

     child.get();

   }


   std::vector<int32_t> count_copy(hash_entry_count, 0);

   CHECK_GT(hash_entry_count, int64_t(0));

   memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));

   ::inclusive_scan(

       count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);

   std::vector<std::future<void>> pos_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     pos_threads.push_back(std::async(

         std::launch::async,

         [&](const int32_t thread_idx) {

           for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {

             if (count_buff[i]) {

               pos_buff[i] = count_copy[i];

             }

           }

         },

         cpu_thread_idx));

   }

   for (auto& child : pos_threads) {

     child.get();

   }


   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> rowid_threads;

   for (int32_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     rowid_threads.push_back(std::async(

         std::launch::async, fill_row_ids_launcher, cpu_thread_idx, cpu_thread_count));

   }


   for (auto& child : rowid_threads) {

     child.get();

   }

 }


 void fill_one_to_many_hash_table_sharded(int32_t* buff,

                                          const int64_t hash_entry_count,

                                          const JoinColumn& join_column,

                                          const JoinColumnTypeInfo& type_info,

                                          const ShardInfo& shard_info,

                                          const int32_t* sd_inner_to_outer_translation_map,

                                          const int32_t min_inner_elem,

                                          const int32_t cpu_thread_count) {

   auto launch_count_matches = [count_buff = buff + hash_entry_count,

                                &join_column,

                                &type_info,

                                &shard_info

 #ifndef __CUDACC__

                                ,

                                sd_inner_to_outer_translation_map,

                                min_inner_elem

 #endif

   ](auto cpu_thread_idx, auto cpu_thread_count) {

     return SUFFIX(count_matches_sharded)(count_buff,

                                          join_column,

                                          type_info,

                                          shard_info

 #ifndef __CUDACC__

                                          ,

                                          sd_inner_to_outer_translation_map,

                                          min_inner_elem,

                                          cpu_thread_idx,

                                          cpu_thread_count

 #endif

     );

   };


   auto launch_fill_row_ids = [buff,

                               hash_entry_count,

                               &join_column,

                               &type_info,

                               &shard_info

 #ifndef __CUDACC__

                               ,

                               sd_inner_to_outer_translation_map,

                               min_inner_elem

 #endif

   ](auto cpu_thread_idx, auto cpu_thread_count) {

     return SUFFIX(fill_row_ids_sharded)(buff,

                                         hash_entry_count,

                                         join_column,

                                         type_info,

                                         shard_info

 #ifndef __CUDACC__

                                         ,

                                         sd_inner_to_outer_translation_map,

                                         min_inner_elem,

                                         cpu_thread_idx,

                                         cpu_thread_count);

 #endif

   };


   fill_one_to_many_hash_table_sharded_impl(buff,

                                            hash_entry_count,

                                            join_column,

                                            type_info,

                                            shard_info

 #ifndef __CUDACC__

                                            ,

                                            sd_inner_to_outer_translation_map,

                                            min_inner_elem,

                                            cpu_thread_count

 #endif

                                            ,

                                            launch_count_matches,

                                            launch_fill_row_ids);

 }


 void init_baseline_hash_join_buff_32(int8_t* hash_join_buff,

                                      const int64_t entry_count,

                                      const size_t key_component_count,

                                      const bool with_val_slot,

                                      const int32_t invalid_slot_val,

                                      const int32_t cpu_thread_idx,

                                      const int32_t cpu_thread_count) {

   init_baseline_hash_join_buff<int32_t>(hash_join_buff,

                                         entry_count,

                                         key_component_count,

                                         with_val_slot,

                                         invalid_slot_val,

                                         cpu_thread_idx,

                                         cpu_thread_count);

 }


 void init_baseline_hash_join_buff_64(int8_t* hash_join_buff,

                                      const int64_t entry_count,

                                      const size_t key_component_count,

                                      const bool with_val_slot,

                                      const int32_t invalid_slot_val,

                                      const int32_t cpu_thread_idx,

                                      const int32_t cpu_thread_count) {

   init_baseline_hash_join_buff<int64_t>(hash_join_buff,

                                         entry_count,

                                         key_component_count,

                                         with_val_slot,

                                         invalid_slot_val,

                                         cpu_thread_idx,

                                         cpu_thread_count);

 }


 #ifndef __CUDACC__

 #ifdef HAVE_TBB


 void init_baseline_hash_join_buff_tbb_32(int8_t* hash_join_buff,

                                          const int64_t entry_count,

                                          const size_t key_component_count,

                                          const bool with_val_slot,

                                          const int32_t invalid_slot_val) {

   init_baseline_hash_join_buff_tbb<int32_t>(

       hash_join_buff, entry_count, key_component_count, with_val_slot, invalid_slot_val);

 }


 void init_baseline_hash_join_buff_tbb_64(int8_t* hash_join_buff,

                                          const int64_t entry_count,

                                          const size_t key_component_count,

                                          const bool with_val_slot,

                                          const int32_t invalid_slot_val) {

   init_baseline_hash_join_buff_tbb<int64_t>(

       hash_join_buff, entry_count, key_component_count, with_val_slot, invalid_slot_val);

 }


 #endif  // #ifdef HAVE_TBB

 #endif  // #ifndef __CUDACC__


 int fill_baseline_hash_join_buff_32(int8_t* hash_buff,

                                     const int64_t entry_count,

                                     const int32_t invalid_slot_val,

                                     const bool for_semi_join,

                                     const size_t key_component_count,

                                     const bool with_val_slot,

                                     const GenericKeyHandler* key_handler,

                                     const int64_t num_elems,

                                     const int32_t cpu_thread_idx,

                                     const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int32_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                for_semi_join,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 int bbox_intersect_fill_baseline_hash_join_buff_32(

     int8_t* hash_buff,

     const int64_t entry_count,

     const int32_t invalid_slot_val,

     const size_t key_component_count,

     const bool with_val_slot,

     const BoundingBoxIntersectKeyHandler* key_handler,

     const int64_t num_elems,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int32_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                false,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 int range_fill_baseline_hash_join_buff_32(int8_t* hash_buff,

                                           const size_t entry_count,

                                           const int32_t invalid_slot_val,

                                           const size_t key_component_count,

                                           const bool with_val_slot,

                                           const RangeKeyHandler* key_handler,

                                           const size_t num_elems,

                                           const int32_t cpu_thread_idx,

                                           const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int32_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                false,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 int fill_baseline_hash_join_buff_64(int8_t* hash_buff,

                                     const int64_t entry_count,

                                     const int32_t invalid_slot_val,

                                     const bool for_semi_join,

                                     const size_t key_component_count,

                                     const bool with_val_slot,

                                     const GenericKeyHandler* key_handler,

                                     const int64_t num_elems,

                                     const int32_t cpu_thread_idx,

                                     const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int64_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                for_semi_join,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 int bbox_intersect_fill_baseline_hash_join_buff_64(

     int8_t* hash_buff,

     const int64_t entry_count,

     const int32_t invalid_slot_val,

     const size_t key_component_count,

     const bool with_val_slot,

     const BoundingBoxIntersectKeyHandler* key_handler,

     const int64_t num_elems,

     const int32_t cpu_thread_idx,

     const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int64_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                false,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 int range_fill_baseline_hash_join_buff_64(int8_t* hash_buff,

                                           const size_t entry_count,

                                           const int32_t invalid_slot_val,

                                           const size_t key_component_count,

                                           const bool with_val_slot,

                                           const RangeKeyHandler* key_handler,

                                           const size_t num_elems,

                                           const int32_t cpu_thread_idx,

                                           const int32_t cpu_thread_count) {

   return fill_baseline_hash_join_buff<int64_t>(hash_buff,

                                                entry_count,

                                                invalid_slot_val,

                                                false,

                                                key_component_count,

                                                with_val_slot,

                                                key_handler,

                                                num_elems,

                                                cpu_thread_idx,

                                                cpu_thread_count);

 }


 template <typename T>

 void fill_one_to_many_baseline_hash_table(

     int32_t* buff,

     const T* composite_key_dict,

     const int64_t hash_entry_count,

     const size_t key_component_count,

     const std::vector<JoinColumn>& join_column_per_key,

     const std::vector<JoinColumnTypeInfo>& type_info_per_key,

     const std::vector<JoinBucketInfo>& join_buckets_per_key,

     const std::vector<const int32_t*>& sd_inner_to_outer_translation_maps,

     const std::vector<int32_t>& sd_min_inner_elems,

     const size_t cpu_thread_count,

     const bool is_range_join,

     const bool is_geo_compressed,

     const bool for_window_framing) {

   int32_t* pos_buff = buff;

   int32_t* count_buff = buff + hash_entry_count;

   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> counter_threads;

   for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     if (is_range_join) {

       counter_threads.push_back(std::async(

           std::launch::async,

           [count_buff,

            composite_key_dict,

            &hash_entry_count,

            &join_buckets_per_key,

            &join_column_per_key,

            &is_geo_compressed,

            cpu_thread_idx,

            cpu_thread_count] {

             const auto key_handler = RangeKeyHandler(

                 is_geo_compressed,

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

                 &join_column_per_key[0],

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

             count_matches_baseline(count_buff,

                                    composite_key_dict,

                                    hash_entry_count,

                                    &key_handler,

                                    join_column_per_key[0].num_elems,

                                    cpu_thread_idx,

                                    cpu_thread_count);

           }));

     } else if (join_buckets_per_key.size() > 0) {

       counter_threads.push_back(std::async(

           std::launch::async,

           [count_buff,

            composite_key_dict,

            &hash_entry_count,

            &join_buckets_per_key,

            &join_column_per_key,

            cpu_thread_idx,

            cpu_thread_count] {

             const auto key_handler = BoundingBoxIntersectKeyHandler(

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

                 &join_column_per_key[0],

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

             count_matches_baseline(count_buff,

                                    composite_key_dict,

                                    hash_entry_count,

                                    &key_handler,

                                    join_column_per_key[0].num_elems,

                                    cpu_thread_idx,

                                    cpu_thread_count);

           }));

     } else {

       counter_threads.push_back(

           std::async(std::launch::async,

                      [count_buff,

                       composite_key_dict,

                       &key_component_count,

                       &hash_entry_count,

                       &join_column_per_key,

                       &type_info_per_key,

                       &sd_inner_to_outer_translation_maps,

                       &sd_min_inner_elems,

                       cpu_thread_idx,

                       cpu_thread_count] {

                        const auto key_handler =

                            GenericKeyHandler(key_component_count,

                                              true,

                                              &join_column_per_key[0],

                                              &type_info_per_key[0],

                                              &sd_inner_to_outer_translation_maps[0],

                                              &sd_min_inner_elems[0]);

                        count_matches_baseline(count_buff,

                                               composite_key_dict,

                                               hash_entry_count,

                                               &key_handler,

                                               join_column_per_key[0].num_elems,

                                               cpu_thread_idx,

                                               cpu_thread_count);

                      }));

     }

   }


   for (auto& child : counter_threads) {

     child.get();

   }


   std::vector<int32_t> count_copy(hash_entry_count, 0);

   CHECK_GT(hash_entry_count, int64_t(0));

   memcpy(&count_copy[1], count_buff, (hash_entry_count - 1) * sizeof(int32_t));

   ::inclusive_scan(

       count_copy.begin(), count_copy.end(), count_copy.begin(), cpu_thread_count);

   std::vector<std::future<void>> pos_threads;

   for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     pos_threads.push_back(std::async(

         std::launch::async,

         [&](const int thread_idx) {

           for (int64_t i = thread_idx; i < hash_entry_count; i += cpu_thread_count) {

             if (count_buff[i]) {

               pos_buff[i] = count_copy[i];

             }

           }

         },

         cpu_thread_idx));

   }

   for (auto& child : pos_threads) {

     child.get();

   }


   memset(count_buff, 0, hash_entry_count * sizeof(int32_t));

   std::vector<std::future<void>> rowid_threads;

   for (size_t cpu_thread_idx = 0; cpu_thread_idx < cpu_thread_count; ++cpu_thread_idx) {

     if (is_range_join) {

       rowid_threads.push_back(std::async(

           std::launch::async,

           [buff,

            composite_key_dict,

            hash_entry_count,

            &join_column_per_key,

            &join_buckets_per_key,

            &is_geo_compressed,

            cpu_thread_idx,

            cpu_thread_count] {

             const auto key_handler = RangeKeyHandler(

                 is_geo_compressed,

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

                 &join_column_per_key[0],

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

             SUFFIX(fill_row_ids_baseline)

             (buff,

              composite_key_dict,

              hash_entry_count,

              &key_handler,

              join_column_per_key[0].num_elems,

              false,

              cpu_thread_idx,

              cpu_thread_count);

           }));

     } else if (join_buckets_per_key.size() > 0) {

       rowid_threads.push_back(std::async(

           std::launch::async,

           [buff,

            composite_key_dict,

            hash_entry_count,

            &join_column_per_key,

            &join_buckets_per_key,

            for_window_framing,

            cpu_thread_idx,

            cpu_thread_count] {

             const auto key_handler = BoundingBoxIntersectKeyHandler(

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

                 &join_column_per_key[0],

                 join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

             SUFFIX(fill_row_ids_baseline)

             (buff,

              composite_key_dict,

              hash_entry_count,

              &key_handler,

              join_column_per_key[0].num_elems,

              for_window_framing,

              cpu_thread_idx,

              cpu_thread_count);

           }));

     } else {

       rowid_threads.push_back(std::async(std::launch::async,

                                          [buff,

                                           composite_key_dict,

                                           hash_entry_count,

                                           key_component_count,

                                           &join_column_per_key,

                                           &type_info_per_key,

                                           &sd_inner_to_outer_translation_maps,

                                           &sd_min_inner_elems,

                                           for_window_framing,

                                           cpu_thread_idx,

                                           cpu_thread_count] {

                                            const auto key_handler = GenericKeyHandler(

                                                key_component_count,

                                                true,

                                                &join_column_per_key[0],

                                                &type_info_per_key[0],

                                                &sd_inner_to_outer_translation_maps[0],

                                                &sd_min_inner_elems[0]);

                                            SUFFIX(fill_row_ids_baseline)

                                            (buff,

                                             composite_key_dict,

                                             hash_entry_count,

                                             &key_handler,

                                             join_column_per_key[0].num_elems,

                                             for_window_framing,

                                             cpu_thread_idx,

                                             cpu_thread_count);

                                          }));

     }

   }


   for (auto& child : rowid_threads) {

     child.get();

   }

 }


 void fill_one_to_many_baseline_hash_table_32(

     int32_t* buff,

     const int32_t* composite_key_dict,

     const int64_t hash_entry_count,

     const size_t key_component_count,

     const std::vector<JoinColumn>& join_column_per_key,

     const std::vector<JoinColumnTypeInfo>& type_info_per_key,

     const std::vector<JoinBucketInfo>& join_bucket_info,

     const std::vector<const int32_t*>& sd_inner_to_outer_translation_maps,

     const std::vector<int32_t>& sd_min_inner_elems,

     const int32_t cpu_thread_count,

     const bool is_range_join,

     const bool is_geo_compressed,

     const bool for_window_framing) {

   fill_one_to_many_baseline_hash_table<int32_t>(buff,

                                                 composite_key_dict,

                                                 hash_entry_count,

                                                 key_component_count,

                                                 join_column_per_key,

                                                 type_info_per_key,

                                                 join_bucket_info,

                                                 sd_inner_to_outer_translation_maps,

                                                 sd_min_inner_elems,

                                                 cpu_thread_count,

                                                 is_range_join,

                                                 is_geo_compressed,

                                                 for_window_framing);

 }


 void fill_one_to_many_baseline_hash_table_64(

     int32_t* buff,

     const int64_t* composite_key_dict,

     const int64_t hash_entry_count,

     const size_t key_component_count,

     const std::vector<JoinColumn>& join_column_per_key,

     const std::vector<JoinColumnTypeInfo>& type_info_per_key,

     const std::vector<JoinBucketInfo>& join_bucket_info,

     const std::vector<const int32_t*>& sd_inner_to_outer_translation_maps,

     const std::vector<int32_t>& sd_min_inner_elems,

     const int32_t cpu_thread_count,

     const bool is_range_join,

     const bool is_geo_compressed,

     const bool for_window_framing) {

   fill_one_to_many_baseline_hash_table<int64_t>(buff,

                                                 composite_key_dict,

                                                 hash_entry_count,

                                                 key_component_count,

                                                 join_column_per_key,

                                                 type_info_per_key,

                                                 join_bucket_info,

                                                 sd_inner_to_outer_translation_maps,

                                                 sd_min_inner_elems,

                                                 cpu_thread_count,

                                                 is_range_join,

                                                 is_geo_compressed,

                                                 for_window_framing);

 }


 void approximate_distinct_tuples(uint8_t* hll_buffer_all_cpus,

                                  const uint32_t b,

                                  const size_t padded_size_bytes,

                                  const std::vector<JoinColumn>& join_column_per_key,

                                  const std::vector<JoinColumnTypeInfo>& type_info_per_key,

                                  const int thread_count) {

   CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());

   CHECK(!join_column_per_key.empty());


   std::vector<std::future<void>> approx_distinct_threads;

   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     approx_distinct_threads.push_back(std::async(

         std::launch::async,

         [&join_column_per_key,

          &type_info_per_key,

          b,

          hll_buffer_all_cpus,

          padded_size_bytes,

          thread_idx,

          thread_count] {

           auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;


           const auto key_handler = GenericKeyHandler(join_column_per_key.size(),

                                                      false,

                                                      &join_column_per_key[0],

                                                      &type_info_per_key[0],

                                                      nullptr,

                                                      nullptr);

           approximate_distinct_tuples_impl(hll_buffer,

                                            nullptr,

                                            b,

                                            join_column_per_key[0].num_elems,

                                            &key_handler,

                                            thread_idx,

                                            thread_count);

         }));

   }

   for (auto& child : approx_distinct_threads) {

     child.get();

   }

 }


 void approximate_distinct_tuples_bbox_intersect(

     uint8_t* hll_buffer_all_cpus,

     std::vector<int32_t>& row_counts,

     const uint32_t b,

     const size_t padded_size_bytes,

     const std::vector<JoinColumn>& join_column_per_key,

     const std::vector<JoinColumnTypeInfo>& type_info_per_key,

     const std::vector<JoinBucketInfo>& join_buckets_per_key,

     const int thread_count) {

   CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());

   CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());

   CHECK(!join_column_per_key.empty());


   std::vector<std::future<void>> approx_distinct_threads;

   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     approx_distinct_threads.push_back(std::async(

         std::launch::async,

         [&join_column_per_key,

          &join_buckets_per_key,

          &row_counts,

          b,

          hll_buffer_all_cpus,

          padded_size_bytes,

          thread_idx,

          thread_count] {

           auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;


           const auto key_handler = BoundingBoxIntersectKeyHandler(

               join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

               &join_column_per_key[0],

               join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

           approximate_distinct_tuples_impl(hll_buffer,

                                            row_counts.data(),

                                            b,

                                            join_column_per_key[0].num_elems,

                                            &key_handler,

                                            thread_idx,

                                            thread_count);

         }));

   }

   for (auto& child : approx_distinct_threads) {

     child.get();

   }


   ::inclusive_scan(

       row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);

 }


 void approximate_distinct_tuples_range(

     uint8_t* hll_buffer_all_cpus,

     std::vector<int32_t>& row_counts,

     const uint32_t b,

     const size_t padded_size_bytes,

     const std::vector<JoinColumn>& join_column_per_key,

     const std::vector<JoinColumnTypeInfo>& type_info_per_key,

     const std::vector<JoinBucketInfo>& join_buckets_per_key,

     const bool is_compressed,

     const int thread_count) {

   CHECK_EQ(join_column_per_key.size(), join_buckets_per_key.size());

   CHECK_EQ(join_column_per_key.size(), type_info_per_key.size());

   CHECK(!join_column_per_key.empty());


   std::vector<std::future<void>> approx_distinct_threads;

   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     approx_distinct_threads.push_back(std::async(

         std::launch::async,

         [&join_column_per_key,

          &join_buckets_per_key,

          &row_counts,

          b,

          hll_buffer_all_cpus,

          padded_size_bytes,

          thread_idx,

          is_compressed,

          thread_count] {

           auto hll_buffer = hll_buffer_all_cpus + thread_idx * padded_size_bytes;


           const auto key_handler = RangeKeyHandler(

               is_compressed,

               join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.size(),

               &join_column_per_key[0],

               join_buckets_per_key[0].inverse_bucket_sizes_for_dimension.data());

           approximate_distinct_tuples_impl(hll_buffer,

                                            row_counts.data(),

                                            b,

                                            join_column_per_key[0].num_elems,

                                            &key_handler,

                                            thread_idx,

                                            thread_count);

         }));

   }

   for (auto& child : approx_distinct_threads) {

     child.get();

   }


   ::inclusive_scan(

       row_counts.begin(), row_counts.end(), row_counts.begin(), thread_count);

 }


 void compute_bucket_sizes_on_cpu(std::vector<double>& bucket_sizes_for_dimension,

                                  const JoinColumn& join_column,

                                  const JoinColumnTypeInfo& type_info,

                                  const std::vector<double>& bucket_size_thresholds,

                                  const int thread_count) {

   std::vector<std::vector<double>> bucket_sizes_for_threads;

   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     bucket_sizes_for_threads.emplace_back(bucket_sizes_for_dimension.size(), 0.0);

   }

   std::vector<std::future<void>> threads;

   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     threads.push_back(std::async(std::launch::async,

                                  compute_bucket_sizes_impl<2>,

                                  bucket_sizes_for_threads[thread_idx].data(),

                                  &join_column,

                                  &type_info,

                                  bucket_size_thresholds.data(),

                                  thread_idx,

                                  thread_count));

   }

   for (auto& child : threads) {

     child.get();

   }


   for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {

     for (size_t i = 0; i < bucket_sizes_for_dimension.size(); i++) {

       if (bucket_sizes_for_threads[thread_idx][i] > bucket_sizes_for_dimension[i]) {

         bucket_sizes_for_dimension[i] = bucket_sizes_for_threads[thread_idx][i];

       }

     }

   }

 }


 #endif  // ifndef __CUDACC__

OneToManyPerfectJoinHashTableFillFuncArgs::for_window_framing
const bool for_window_framing
Definition: HashJoinRuntime.h:187

heavydb.dtypes.T
T
Definition: dtypes.py:8

OneToOnePerfectJoinHashTableFillFuncArgs
Definition: HashJoinRuntime.h:167

CHECK_EQ
#define CHECK_EQ(x, y)
Definition: Logger.h:301

DecodersImpl.h

get_matching_baseline_hash_slot_at
T * get_matching_baseline_hash_slot_at(int8_t *hash_buff, const uint32_t h, const T *key, const size_t key_component_count, const int64_t hash_entry_size)
Definition: HashJoinRuntime.cpp:466

JoinColumnIterator.h

OneToOnePerfectJoinHashTableFillFuncArgs::type_info
const JoinColumnTypeInfo type_info
Definition: HashJoinRuntime.h:173

keys_are_equal
bool keys_are_equal(const T *key1, const T *key2, const size_t key_component_count)
Definition: CompareKeysInl.h:53

run_benchmark_import.args
tuple args
Definition: run_benchmark_import.py:247

count_matches_sharded
GLOBAL void SUFFIX() count_matches_sharded(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:764

StringDictionary.h

OneToOnePerfectJoinHashTableFillFuncArgs::min_inner_elem
const int32_t min_inner_elem
Definition: HashJoinRuntime.h:175

init_baseline_hash_join_buff_32
void init_baseline_hash_join_buff_32(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1788

fill_one_to_many_hash_table_sharded_impl
void fill_one_to_many_hash_table_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_launcher, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_launcher)
Definition: HashJoinRuntime.cpp:1656

fill_one_to_many_baseline_hash_table_64
void fill_one_to_many_baseline_hash_table_64(int32_t *buff, const int64_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
Definition: HashJoinRuntime.cpp:2218

get_rank
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
Definition: HyperLogLogRank.h:30

fill_row_ids_sharded_bucketized
GLOBAL void SUFFIX() fill_row_ids_sharded_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
Definition: HashJoinRuntime.cpp:1207

JoinColumn::num_elems
size_t num_elems
Definition: HashJoinRuntime.h:133

fill_row_ids_impl
DEVICE void fill_row_ids_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
Definition: HashJoinRuntime.cpp:895

fill_row_ids_sharded_impl
DEVICE void fill_row_ids_sharded_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
Definition: HashJoinRuntime.cpp:1113

ShardInfo
Definition: HashJoinRuntime.h:160

JoinColumnTyped
Definition: JoinColumnIterator.h:132

atomicMin
__device__ double atomicMin(double *address, double val)
Definition: cuda_mapd_rt.cu:403

GLOBAL
#define GLOBAL
Definition: funcannotations.h:32

OneToManyPerfectJoinHashTableFillFuncArgs::buff
int32_t * buff
Definition: HashJoinRuntime.h:180

fill_one_to_many_baseline_hash_table_32
void fill_one_to_many_baseline_hash_table_32(int32_t *buff, const int32_t *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_bucket_info, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const int32_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
Definition: HashJoinRuntime.cpp:2189

fill_hash_join_buff_bitwise_eq
DEVICE int SUFFIX() fill_hash_join_buff_bitwise_eq(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
Definition: HashJoinRuntime.cpp:187

SUFFIX
#define SUFFIX(name)
Definition: funcannotations.h:71

OneToOnePerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map
const int32_t * sd_inner_to_outer_translation_map
Definition: HashJoinRuntime.h:174

fill_hashtable_for_semi_join
ALWAYS_INLINE DEVICE int SUFFIX() fill_hashtable_for_semi_join(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:54

OneToOnePerfectJoinHashTableFillFuncArgs::join_column
const JoinColumn join_column
Definition: HashJoinRuntime.h:172

JoinColumnTypeInfo::null_val
const int64_t null_val
Definition: HashJoinRuntime.h:141

init_baseline_hash_join_buff_64
void init_baseline_hash_join_buff_64(int8_t *hash_join_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1804

inclusive_scan
void inclusive_scan(InputIterator first, InputIterator last, OutputIterator out, const size_t thread_count)
Definition: HashJoinRuntime.cpp:1431

init_baseline_hash_join_buff
DEVICE void SUFFIX() init_baseline_hash_join_buff(int8_t *hash_buff, const int64_t entry_count, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:347

HashJoinRuntime.h

CHECK_GT
#define CHECK_GT(x, y)
Definition: Logger.h:305

load_cst
#define load_cst(ptr)
Definition: HashJoinRuntime.cpp:462

RuntimeFunctions.h

MurmurHash1Impl
FORCE_INLINE DEVICE uint32_t MurmurHash1Impl(const void *key, int len, const uint32_t seed)
Definition: MurmurHash1Inl.h:6

fill_one_to_many_hash_table
void fill_one_to_many_hash_table(OneToManyPerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1564

DEVICE
#define DEVICE
Definition: funcannotations.h:20

fill_hash_join_buff
DEVICE int SUFFIX() fill_hash_join_buff(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:203

get_hash_slot_bitwise_eq
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_bitwise_eq(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val)
Definition: JoinHashImpl.h:82

fill_hash_join_buff_impl
DEVICE auto fill_hash_join_buff_impl(OneToOnePerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
Definition: HashJoinRuntime.cpp:120

JoinColumnTypeInfo::translated_null_val
const int64_t translated_null_val
Definition: HashJoinRuntime.h:143

get_hash_slot
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key)
Definition: JoinHashImpl.h:76

approximate_distinct_tuples
void approximate_distinct_tuples(uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
Definition: HashJoinRuntime.cpp:2247

get_matching_baseline_hash_slot_readonly
DEVICE NEVER_INLINE const T *SUFFIX() get_matching_baseline_hash_slot_readonly(const T *key, const size_t key_component_count, const T *composite_key_dict, const int64_t entry_count, const size_t key_size_in_bytes)
Definition: HashJoinRuntime.cpp:819

get_bucketized_hash_slot
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:66

threading_serial::async
future< Result > async(Fn &&fn, Args &&...args)
Definition: threading_serial.h:11

ShardInfo::num_shards
size_t num_shards
Definition: HashJoinRuntime.h:163

OneToManyPerfectJoinHashTableFillFuncArgs::join_column
const JoinColumn join_column
Definition: HashJoinRuntime.h:182

StringDictionary::INVALID_STR_ID
static constexpr int32_t INVALID_STR_ID
Definition: StringDictionary.h:181

OneToManyPerfectJoinHashTableFillFuncArgs::hash_entry_info
const BucketizedHashEntryInfo hash_entry_info
Definition: HashJoinRuntime.h:181

fill_row_ids_baseline
GLOBAL void SUFFIX() fill_row_ids_baseline(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const KEY_HANDLER *f, const int64_t num_elems, const bool for_window_framing, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1251

JoinColumnIterator
Iterates over the rows of a JoinColumn across multiple fragments/chunks.
Definition: JoinColumnIterator.h:20

BucketizedHashEntryInfo::bucket_normalization
int64_t bucket_normalization
Definition: HashJoinRuntime.h:45

get_invalid_key
DEVICE T SUFFIX() get_invalid_key()
Definition: CompareKeysInl.h:28

ShardInfo::shard
size_t shard
Definition: HashJoinRuntime.h:161

fill_row_ids_sharded
GLOBAL void SUFFIX() fill_row_ids_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1171

mapd_cas
#define mapd_cas(address, compare, val)
Definition: HashJoinRuntime.cpp:116

compute_bucket_sizes_on_cpu
void compute_bucket_sizes_on_cpu(std::vector< double > &bucket_sizes_for_dimension, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const std::vector< double > &bucket_size_thresholds, const int thread_count)
Definition: HashJoinRuntime.cpp:2388

BoundingBoxIntersectKeyHandler
Definition: HashJoinKeyHandlers.h:133

Logger.h

gpu_enabled::partial_sum
DEVICE void partial_sum(ARGS &&...args)
Definition: gpu_enabled.h:87

anonymous_namespace{HashJoinRuntime.cpp}::map_str_id_to_outer_dict
int64_t map_str_id_to_outer_dict(const int64_t inner_elem, const int64_t min_inner_elem, const int64_t min_outer_elem, const int64_t max_outer_elem, const int32_t *inner_to_outer_translation_map)
Definition: HashJoinRuntime.cpp:56

bbox_intersect_fill_baseline_hash_join_buff_32
int bbox_intersect_fill_baseline_hash_join_buff_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1866

count_matches
GLOBAL void SUFFIX() count_matches(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:702

cas_cst
#define cas_cst(ptr, expected, desired)
Definition: HashJoinRuntime.cpp:458

bbox_intersect_fill_baseline_hash_join_buff_64
int bbox_intersect_fill_baseline_hash_join_buff_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const BoundingBoxIntersectKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1931

approximate_distinct_tuples_impl
GLOBAL void SUFFIX() approximate_distinct_tuples_impl(uint8_t *hll_buffer, int32_t *row_count_buffer, const uint32_t b, const int64_t num_elems, const KEY_HANDLER *f, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1318

fill_baseline_hash_join_buff_64
int fill_baseline_hash_join_buff_64(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1909

CompareKeysInl.h

JoinColumnTypeInfo::max_val
const int64_t max_val
Definition: HashJoinRuntime.h:140

UNLIKELY
#define UNLIKELY(x)
Definition: likely.h:25

fill_baseline_hash_join_buff
int fill_baseline_hash_join_buff(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const KEY_HANDLER *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: BaselineHashTableBuilder.h:30

fill_row_ids_bucketized
GLOBAL void SUFFIX() fill_row_ids_bucketized(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
Definition: HashJoinRuntime.cpp:1075

StringDictionaryProxy.h

range_fill_baseline_hash_join_buff_64
int range_fill_baseline_hash_join_buff_64(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1953

shard_key.h

GpuRtConstants.h

fill_one_to_many_hash_table_sharded
void fill_one_to_many_hash_table_sharded(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const ShardInfo &shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1715

get_hash_slot_sharded_opt
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:140

JoinColumnTypeInfo::uses_bw_eq
const bool uses_bw_eq
Definition: HashJoinRuntime.h:142

range_fill_baseline_hash_join_buff_32
int range_fill_baseline_hash_join_buff_32(int8_t *hash_buff, const size_t entry_count, const int32_t invalid_slot_val, const size_t key_component_count, const bool with_val_slot, const RangeKeyHandler *key_handler, const size_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1888

fill_one_to_many_hash_table_bucketized
void fill_one_to_many_hash_table_bucketized(OneToManyPerfectJoinHashTableFillFuncArgs const args, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1608

JoinHashImpl.h

init_hash_join_buff
DEVICE void SUFFIX() init_hash_join_buff(int32_t *groups_buffer, const int64_t hash_entry_count, const int32_t invalid_slot_val, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:71

f
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
Definition: TestTorchTableFunctions.cpp:103

approximate_distinct_tuples_bbox_intersect
void approximate_distinct_tuples_bbox_intersect(uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const int thread_count)
Definition: HashJoinRuntime.cpp:2289

JoinColumnTypeInfo::min_val
const int64_t min_val
Definition: HashJoinRuntime.h:139

JoinColumnTypeInfo
Definition: HashJoinRuntime.h:137

get_hash_slot_sharded
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count)
Definition: JoinHashImpl.h:108

write_baseline_hash_slot
DEVICE int write_baseline_hash_slot(const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
Definition: HashJoinRuntime.cpp:506

threading_serial::parallel_for
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
Definition: threading_serial.h:34

fill_hash_join_buff_sharded_impl
DEVICE int fill_hash_join_buff_sharded_impl(int32_t *buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, HASHTABLE_FILLING_FUNC filling_func)
Definition: HashJoinRuntime.cpp:219

likely.h

NEVER_INLINE
#define NEVER_INLINE
Definition: funcannotations.h:65

approximate_distinct_tuples_range
void approximate_distinct_tuples_range(uint8_t *hll_buffer_all_cpus, std::vector< int32_t > &row_counts, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const bool is_compressed, const int thread_count)
Definition: HashJoinRuntime.cpp:2337

store_cst
#define store_cst(ptr, val)
Definition: HashJoinRuntime.cpp:461

anonymous_namespace{Utm.h}::N
constexpr unsigned N
Definition: Utm.h:110

fill_row_ids
GLOBAL void SUFFIX() fill_row_ids(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const bool for_window_framing, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1027

CHECK
#define CHECK(condition)
Definition: Logger.h:291

DEBUG_TIMER
#define DEBUG_TIMER(name)
Definition: Logger.h:412

fill_one_to_one_hashtable
ALWAYS_INLINE DEVICE int SUFFIX() fill_one_to_one_hashtable(size_t idx, int32_t *entry_ptr, const int32_t invalid_slot_val)
Definition: JoinHashImpl.h:44

MurmurHash1Inl.h

get_bucketized_hash_slot_sharded_opt
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded_opt(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:122

OneToManyPerfectJoinHashTableFillFuncArgs::type_info
const JoinColumnTypeInfo type_info
Definition: HashJoinRuntime.h:183

funcannotations.h

OneToManyPerfectJoinHashTableFillFuncArgs
Definition: HashJoinRuntime.h:179

fixed_width_double_decode_noinline
DEVICE NEVER_INLINE double SUFFIX() fixed_width_double_decode_noinline(const int8_t *byte_stream, const int64_t pos)
Definition: DecodersImpl.h:134

fill_hash_join_buff_bucketized
DEVICE int SUFFIX() fill_hash_join_buff_bucketized(OneToOnePerfectJoinHashTableFillFuncArgs const args, int32_t const cpu_thread_idx, int32_t const cpu_thread_count)
Definition: HashJoinRuntime.cpp:167

count_matches_bucketized
GLOBAL void SUFFIX() count_matches_bucketized(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
Definition: HashJoinRuntime.cpp:730

fill_one_to_many_baseline_hash_table
void fill_one_to_many_baseline_hash_table(int32_t *buff, const T *composite_key_dict, const int64_t hash_entry_count, const size_t key_component_count, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const std::vector< JoinBucketInfo > &join_buckets_per_key, const std::vector< const int32_t * > &sd_inner_to_outer_translation_maps, const std::vector< int32_t > &sd_min_inner_elems, const size_t cpu_thread_count, const bool is_range_join, const bool is_geo_compressed, const bool for_window_framing)
Definition: HashJoinRuntime.cpp:1975

JoinColumnTuple
Definition: JoinColumnIterator.h:214

write_baseline_hash_slot_for_semi_join
DEVICE int write_baseline_hash_slot_for_semi_join(const int32_t val, int8_t *hash_buff, const int64_t entry_count, const T *key, const size_t key_component_count, const bool with_val_slot, const int32_t invalid_slot_val, const size_t key_size_in_bytes, const size_t hash_entry_size)
Definition: HashJoinRuntime.cpp:542

fill_hash_join_buff_sharded_bucketized
DEVICE int SUFFIX() fill_hash_join_buff_sharded_bucketized(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, const int64_t bucket_normalization)
Definition: HashJoinRuntime.cpp:272

HyperLogLogRank.h

atomicMax
__device__ double atomicMax(double *address, double val)
Definition: cuda_mapd_rt.cu:372

JoinColumnIterator::ptr
DEVICE FORCE_INLINE const int8_t * ptr() const
Definition: JoinColumnIterator.h:33

mapd_add
#define mapd_add(address, val)
Definition: HashJoinRuntime.cpp:650

count_matches_baseline
GLOBAL void SUFFIX() count_matches_baseline(int32_t *count_buff, const T *composite_key_dict, const int64_t entry_count, const KEY_HANDLER *f, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:847

fill_baseline_hash_join_buff_32
int fill_baseline_hash_join_buff_32(int8_t *hash_buff, const int64_t entry_count, const int32_t invalid_slot_val, const bool for_semi_join, const size_t key_component_count, const bool with_val_slot, const GenericKeyHandler *key_handler, const int64_t num_elems, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1844

HashJoinKeyHandlers.h

fill_one_to_many_hash_table_impl
void fill_one_to_many_hash_table_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn &join_column, const JoinColumnTypeInfo &type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_count, const bool for_window_framing, COUNT_MATCHES_LAUNCH_FUNCTOR count_matches_func, FILL_ROW_IDS_LAUNCH_FUNCTOR fill_row_ids_func)
Definition: HashJoinRuntime.cpp:1503

compute_bucket_sizes_impl
GLOBAL void SUFFIX() compute_bucket_sizes_impl(double *bucket_sizes_for_thread, const JoinColumn *join_column, const JoinColumnTypeInfo *type_info, const double *bucket_size_thresholds, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:1388

g_maximum_conditions_to_coalesce
const size_t g_maximum_conditions_to_coalesce
Definition: HashJoinRuntime.h:52

get_bucketized_hash_slot_sharded
ALWAYS_INLINE DEVICE int32_t *SUFFIX() get_bucketized_hash_slot_sharded(int32_t *buff, const int64_t key, const int64_t min_key, const int64_t translated_null_val, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count, const int64_t bucket_normalization)
Definition: JoinHashImpl.h:90

OneToManyPerfectJoinHashTableFillFuncArgs::sd_inner_to_outer_translation_map
const int32_t * sd_inner_to_outer_translation_map
Definition: HashJoinRuntime.h:184

fill_hash_join_buff_sharded
DEVICE int SUFFIX() fill_hash_join_buff_sharded(int32_t *buff, const int32_t invalid_slot_val, const bool for_semi_join, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const ShardInfo shard_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count)
Definition: HashJoinRuntime.cpp:311

fill_row_ids_for_window_framing_impl
DEVICE void fill_row_ids_for_window_framing_impl(int32_t *buff, const int64_t hash_entry_count, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
Definition: HashJoinRuntime.cpp:951

OneToManyPerfectJoinHashTableFillFuncArgs::min_inner_elem
const int32_t min_inner_elem
Definition: HashJoinRuntime.h:185

SHARD_FOR_KEY
#define SHARD_FOR_KEY(key, num_shards)
Definition: shard_key.h:20

MurmurHash64AImpl
FORCE_INLINE DEVICE uint64_t MurmurHash64AImpl(const void *key, int len, uint64_t seed)
Definition: MurmurHash1Inl.h:53

count_matches_impl
DEVICE void count_matches_impl(int32_t *count_buff, const JoinColumn join_column, const JoinColumnTypeInfo type_info, const int32_t *sd_inner_to_outer_translation_map, const int32_t min_inner_elem, const int32_t cpu_thread_idx, const int32_t cpu_thread_count, SLOT_SELECTOR slot_selector)
Definition: HashJoinRuntime.cpp:654

JoinColumn
Definition: HashJoinRuntime.h:128

RangeKeyHandler
Definition: HashJoinKeyHandlers.h:187

GenericKeyHandler
Definition: HashJoinKeyHandlers.h:39