OmniSciDB  cde582ebc3
RangeJoinHashTable Class Reference (final)

#include <RangeJoinHashTable.h>


Public Member Functions

 RangeJoinHashTable (const std::shared_ptr< Analyzer::BinOper > condition, const JoinType join_type, const Analyzer::RangeOper *range_expr, std::shared_ptr< Analyzer::ColumnVar > inner_col_expr, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, ColumnCacheMap &column_cache, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs, const int device_count, const HashTableBuildDagMap &hashtable_build_dag_map, const TableIdToNodeMap &table_id_to_node_map)
 
 ~RangeJoinHashTable () override=default
 
llvm::Value * codegenKey (const CompilationOptions &co, llvm::Value *offset)
 
HashJoinMatchingSet codegenMatchingSetWithOffset (const CompilationOptions &, const size_t, llvm::Value *)
 
- Public Member Functions inherited from OverlapsJoinHashTable
 OverlapsJoinHashTable (const std::shared_ptr< Analyzer::BinOper > condition, const JoinType join_type, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, ColumnCacheMap &column_cache, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs, const int device_count, const HashTableBuildDagMap &hashtable_build_dag_map, const TableIdToNodeMap &table_id_to_node_map)
 
virtual ~OverlapsJoinHashTable ()
 
- Public Member Functions inherited from HashJoin
virtual std::string toStringFlat64 (const ExecutorDeviceType device_type, const int device_id) const
 
virtual std::string toStringFlat32 (const ExecutorDeviceType device_type, const int device_id) const
 
JoinColumn fetchJoinColumn (const Analyzer::ColumnVar *hash_col, const std::vector< Fragmenter_Namespace::FragmentInfo > &fragment_info, const Data_Namespace::MemoryLevel effective_memory_level, const int device_id, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, DeviceAllocator *dev_buff_owner, std::vector< std::shared_ptr< void >> &malloc_owner, Executor *executor, ColumnCacheMap *column_cache)
 
HashTable * getHashTableForDevice (const size_t device_id) const
 
size_t getJoinHashBufferSize (const ExecutorDeviceType device_type)
 
size_t getJoinHashBufferSize (const ExecutorDeviceType device_type, const int device_id) const
 
int8_t * getJoinHashBuffer (const ExecutorDeviceType device_type, const int device_id) const
 
void freeHashBufferMemory ()
 

Static Public Member Functions

static std::shared_ptr< RangeJoinHashTable > getInstance (const std::shared_ptr< Analyzer::BinOper > condition, const Analyzer::RangeOper *range_expr, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
 
- Static Public Member Functions inherited from OverlapsJoinHashTable
static std::shared_ptr< OverlapsJoinHashTable > getInstance (const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
 Make hash table from an in-flight SQL query's parse tree etc.
 
static void invalidateCache ()
 
static void markCachedItemAsDirty (size_t table_key)
 
static HashtableRecycler * getHashTableCache ()
 
static OverlapsTuningParamRecycler * getOverlapsTuningParamCache ()
 
- Static Public Member Functions inherited from HashJoin
static bool layoutRequiresAdditionalBuffers (HashType layout) noexcept
 
static std::string getHashTypeString (HashType ht) noexcept
 
static HashJoinMatchingSet codegenMatchingSet (const std::vector< llvm::Value * > &hash_join_idx_args_in, const bool is_sharded, const bool col_is_nullable, const bool is_bw_eq, const int64_t sub_buff_size, Executor *executor, const bool is_bucketized=false)
 
static llvm::Value * codegenHashTableLoad (const size_t table_idx, Executor *executor)
 
static std::shared_ptr< HashJoin > getInstance (const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
 Make hash table from an in-flight SQL query's parse tree etc.
 
static std::shared_ptr< HashJoin > getSyntheticInstance (std::string_view table1, std::string_view column1, std::string_view table2, std::string_view column2, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
 Make hash table from named tables and columns (such as for testing).
 
static std::shared_ptr< HashJoin > getSyntheticInstance (const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
 Make hash table from named tables and columns (such as for testing).
 
static std::pair< std::string, std::shared_ptr< HashJoin > > getSyntheticInstance (std::vector< std::shared_ptr< Analyzer::BinOper >>, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
 
static int getInnerTableId (const std::vector< InnerOuter > &inner_outer_pairs)
 
static bool canAccessHashTable (bool allow_hash_table_recycling, bool invalid_cache_key, JoinType join_type)
 
static void checkHashJoinReplicationConstraint (const int table_id, const size_t shard_count, const Executor *executor)
 
static std::pair< InnerOuter, InnerOuterStringOpInfos > normalizeColumnPair (const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join=false)
 
template<typename T >
static const T * getHashJoinColumn (const Analyzer::Expr *expr)
 
static std::pair< std::vector< InnerOuter >, std::vector< InnerOuterStringOpInfos > > normalizeColumnPairs (const Analyzer::BinOper *condition, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
 
static std::vector< int > collectFragmentIds (const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments)
 
static CompositeKeyInfo getCompositeKeyInfo (const std::vector< InnerOuter > &inner_outer_pairs, const Executor *executor, const std::vector< InnerOuterStringOpInfos > &inner_outer_string_op_infos_pairs={})
 
static std::vector< const StringDictionaryProxy::IdMap * > translateCompositeStrDictProxies (const CompositeKeyInfo &composite_key_info, const std::vector< InnerOuterStringOpInfos > &string_op_infos_for_keys, const Executor *executor)
 
static std::pair< const StringDictionaryProxy *, StringDictionaryProxy * > getStrDictProxies (const InnerOuter &cols, const Executor *executor, const bool has_string_ops)
 
static const StringDictionaryProxy::IdMap * translateInnerToOuterStrDictProxies (const InnerOuter &cols, const InnerOuterStringOpInfos &inner_outer_string_op_infos, ExpressionRange &old_col_range, const Executor *executor)
 

Protected Member Functions

void reifyWithLayout (const HashType layout) override
 
void reifyForDevice (const ColumnsForDevice &columns_for_device, const HashType layout, const size_t entry_count, const size_t emitted_keys_count, const int device_id, const logger::ThreadId parent_thread_id)
 
std::shared_ptr< BaselineHashTable > initHashTableOnCpu (const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const HashType layout, const size_t entry_count, const size_t emitted_keys_count)
 
HashType getHashType () const noexcept override
 
std::pair< size_t, size_t > approximateTupleCount (const std::vector< double > &inverse_bucket_sizes_for_dimension, std::vector< ColumnsForDevice > &columns_per_device, const size_t chosen_max_hashtable_size, const double chosen_bucket_threshold) override
 
std::pair< size_t, size_t > computeRangeHashTableCounts (const size_t shard_count, std::vector< ColumnsForDevice > &columns_per_device)
 
- Protected Member Functions inherited from OverlapsJoinHashTable
void reify (const HashType preferred_layout)
 
virtual void reifyImpl (std::vector< ColumnsForDevice > &columns_per_device, const Fragmenter_Namespace::TableInfo &query_info, const HashType layout, const size_t shard_count, const size_t entry_count, const size_t emitted_keys_count, const bool skip_hashtable_caching, const size_t chosen_max_hashtable_size, const double chosen_bucket_threshold)
 
void reifyForDevice (const ColumnsForDevice &columns_for_device, const HashType layout, const size_t entry_count, const size_t emitted_keys_count, const bool skip_hashtable_caching, const int device_id, const logger::ThreadId parent_thread_id)
 
size_t calculateHashTableSize (size_t number_of_dimensions, size_t emitted_keys_count, size_t entry_count) const
 
ColumnsForDevice fetchColumnsForDevice (const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, DeviceAllocator *dev_buff_owner)
 
virtual std::pair< size_t, size_t > computeHashTableCounts (const size_t shard_count, const std::vector< double > &inverse_bucket_sizes_for_dimension, std::vector< ColumnsForDevice > &columns_per_device, const size_t chosen_max_hashtable_size, const double chosen_bucket_threshold)
 
void setInverseBucketSizeInfo (const std::vector< double > &inverse_bucket_sizes, std::vector< ColumnsForDevice > &columns_per_device, const size_t device_count)
 
size_t getKeyComponentWidth () const
 
size_t getKeyComponentCount () const
 
Data_Namespace::MemoryLevel getMemoryLevel () const noexcept override
 
int getDeviceCount () const noexcept override
 
std::shared_ptr< BaselineHashTable > initHashTableOnCpu (const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const HashType layout, const size_t entry_count, const size_t emitted_keys_count, const bool skip_hashtable_caching)
 
HashJoinMatchingSet codegenMatchingSet (const CompilationOptions &, const size_t) override
 
std::string toString (const ExecutorDeviceType device_type, const int device_id=0, bool raw=false) const override
 
DecodedJoinHashBufferSet toSet (const ExecutorDeviceType device_type, const int device_id) const override
 
llvm::Value * codegenSlot (const CompilationOptions &, const size_t) override
 
const RegisteredQueryHint & getRegisteredQueryHint ()
 
void registerQueryHint (const RegisteredQueryHint &query_hint)
 
size_t getEntryCount () const
 
size_t getEmittedKeysCount () const
 
size_t getComponentBufferSize () const noexcept override
 
size_t shardCount () const
 
Data_Namespace::MemoryLevel getEffectiveMemoryLevel (const std::vector< InnerOuter > &inner_outer_pairs) const
 
int getInnerTableId () const noexcept override
 
int getInnerTableRteIdx () const noexcept override
 
size_t getKeyBufferSize () const noexcept
 
size_t offsetBufferOff () const noexcept override
 
size_t countBufferOff () const noexcept override
 
size_t payloadBufferOff () const noexcept override
 
std::string getHashJoinType () const final
 
bool isBitwiseEq () const override
 
std::shared_ptr< HashTable > initHashTableOnCpuFromCache (QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier)
 
std::optional< std::pair< size_t, size_t > > getApproximateTupleCountFromCache (QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier)
 
void putHashTableOnCpuToCache (QueryPlanHash key, CacheItemType item_type, std::shared_ptr< HashTable > hashtable_ptr, DeviceIdentifier device_identifier, size_t hashtable_building_time)
 
llvm::Value * codegenKey (const CompilationOptions &)
 
std::vector< llvm::Value * > codegenManyKey (const CompilationOptions &)
 
std::optional< OverlapsHashTableMetaInfo > getOverlapsHashTableMetaInfo ()
 
QueryPlanHash getAlternativeCacheKey (AlternativeCacheKeyForOverlapsHashJoin &info)
 
void generateCacheKey (const size_t max_hashtable_size, const double bucket_threshold, const std::vector< double > &bucket_sizes, std::vector< std::vector< Fragmenter_Namespace::FragmentInfo >> &fragments_per_device, int device_count)
 
QueryPlanHash getCacheKey (int device_id) const
 
const std::vector< InnerOuter > & getInnerOuterPairs () const
 
void setOverlapsHashtableMetaInfo (size_t max_table_size_bytes, double bucket_threshold, std::vector< double > &bucket_sizes)
 

Private Member Functions

bool isInnerColCompressed () const
 
bool isProbeCompressed () const
 

Private Attributes

const Analyzer::RangeOper * range_expr_
 
std::shared_ptr< Analyzer::ColumnVar > inner_col_expr_
 
Data_Namespace::MemoryLevel effective_memory_level_
 
const double bucket_threshold_ {std::numeric_limits<double>::max()}
 
const size_t max_hashtable_size_ {std::numeric_limits<size_t>::max()}
 

Additional Inherited Members

- Static Protected Member Functions inherited from HashJoin
static llvm::Value * codegenColOrStringOper (const Analyzer::Expr *col_or_string_oper, const std::vector< StringOps_Namespace::StringOpInfo > &string_op_infos, CodeGenerator &code_generator, const CompilationOptions &co)
 
- Protected Attributes inherited from OverlapsJoinHashTable
const std::shared_ptr< Analyzer::BinOper > condition_
 
const JoinType join_type_
 
const std::vector< InputTableInfo > & query_infos_
 
const Data_Namespace::MemoryLevel memory_level_
 
Executor * executor_
 
ColumnCacheMap & column_cache_
 
std::vector< InnerOuter > inner_outer_pairs_
 
const int device_count_
 
std::vector< double > inverse_bucket_sizes_for_dimension_
 
double chosen_overlaps_bucket_threshold_
 
size_t chosen_overlaps_max_table_size_bytes_
 
CompositeKeyInfo composite_key_info_
 
std::optional< HashType > layout_override_
 
std::mutex cpu_hash_table_buff_mutex_
 
RegisteredQueryHint query_hint_
 
HashTableBuildDagMap hashtable_build_dag_map_
 
QueryPlanDAG query_plan_dag_
 
std::vector< QueryPlanHash > hashtable_cache_key_
 
HashtableCacheMetaInfo hashtable_cache_meta_info_
 
std::unordered_set< size_t > table_keys_
 
const TableIdToNodeMap table_id_to_node_map_
 
- Protected Attributes inherited from HashJoin
std::vector< std::shared_ptr< HashTable > > hash_tables_for_device_
 
- Static Protected Attributes inherited from OverlapsJoinHashTable
static std::unique_ptr< HashtableRecycler > hash_table_cache_
 
static std::unique_ptr< OverlapsTuningParamRecycler > auto_tuner_cache_
 

Detailed Description

Definition at line 21 of file RangeJoinHashTable.h.

Constructor & Destructor Documentation

RangeJoinHashTable::RangeJoinHashTable ( const std::shared_ptr< Analyzer::BinOper > condition,
const JoinType  join_type,
const Analyzer::RangeOper *  range_expr,
std::shared_ptr< Analyzer::ColumnVar >  inner_col_expr,
const std::vector< InputTableInfo > &  query_infos,
const Data_Namespace::MemoryLevel  memory_level,
ColumnCacheMap &  column_cache,
Executor *  executor,
const std::vector< InnerOuter > &  inner_outer_pairs,
const int  device_count,
const HashTableBuildDagMap &  hashtable_build_dag_map,
const TableIdToNodeMap &  table_id_to_node_map 
)
inline

Definition at line 23 of file RangeJoinHashTable.h.

    : OverlapsJoinHashTable(condition,
                            join_type,
                            query_infos,
                            memory_level,
                            column_cache,
                            executor,
                            inner_outer_pairs,
                            device_count,
                            hashtable_build_dag_map,
                            table_id_to_node_map)
    , range_expr_(range_expr)
    , inner_col_expr_(std::move(inner_col_expr)) {}
RangeJoinHashTable::~RangeJoinHashTable ( )
override default

Member Function Documentation

std::pair< size_t, size_t > RangeJoinHashTable::approximateTupleCount ( const std::vector< double > &  inverse_bucket_sizes_for_dimension,
std::vector< ColumnsForDevice > &  columns_per_device,
const size_t  chosen_max_hashtable_size,
const double  chosen_bucket_threshold 
)
override protected virtual

Reimplemented from OverlapsJoinHashTable.

Definition at line 530 of file RangeJoinHashTable.cpp.

References approximate_distinct_tuples_on_device_range(), approximate_distinct_tuples_range(), threading_serial::async(), Bitmap, CHECK, CHECK_EQ, CHECK_GT, CPU, Data_Namespace::CPU_LEVEL, cpu_threads(), OverlapsJoinHashTable::device_count_, effective_memory_level_, OverlapsJoinHashTable::executor_, HashJoin::getCompositeKeyInfo(), getQueryEngineCudaStreamForDevice(), GPU, Data_Namespace::GPU_LEVEL, hll_size(), hll_unify(), CountDistinctDescriptor::impl_type_, OverlapsJoinHashTable::inner_outer_pairs_, isInnerColCompressed(), transfer_flat_object_to_gpu(), transfer_vector_of_flat_objects_to_gpu(), and UNREACHABLE.

Referenced by computeRangeHashTableCounts().

{
#ifdef _WIN32
  // WIN32 needs to have C++20 set for designated initialisation to work
  CountDistinctDescriptor count_distinct_desc{
      CountDistinctImplType::Bitmap,
      0,
      11,
      true,
      effective_memory_level_ == Data_Namespace::GPU_LEVEL ? ExecutorDeviceType::GPU
                                                           : ExecutorDeviceType::CPU,
      1,
  };
#else
  CountDistinctDescriptor count_distinct_desc{
      .impl_type_ = CountDistinctImplType::Bitmap,
      .min_val = 0,
      .bitmap_sz_bits = 11,
      .approximate = true,
      .device_type = effective_memory_level_ == Data_Namespace::GPU_LEVEL
                         ? ExecutorDeviceType::GPU
                         : ExecutorDeviceType::CPU,
      .sub_bitmap_count = 1,
  };
#endif
  const auto padded_size_bytes = count_distinct_desc.bitmapPaddedSizeBytes();

  CHECK(!columns_per_device.empty() && !columns_per_device.front().join_columns.empty());
  if (columns_per_device.front().join_columns.front().num_elems == 0) {
    return std::make_pair(0, 0);
  }

  for (auto& columns_for_device : columns_per_device) {
    columns_for_device.setBucketInfo(inverse_bucket_sizes_for_dimension,
                                     inner_outer_pairs_);
  }

  // Number of keys must match dimension of buckets
  CHECK_EQ(columns_per_device.front().join_columns.size(),
           columns_per_device.front().join_buckets.size());

  if (effective_memory_level_ == Data_Namespace::CPU_LEVEL) {
    const auto composite_key_info =
        HashJoin::getCompositeKeyInfo(inner_outer_pairs_, executor_);
    int thread_count = cpu_threads();
    std::vector<uint8_t> hll_buffer_all_cpus(thread_count * padded_size_bytes);
    auto hll_result = &hll_buffer_all_cpus[0];

    std::vector<int32_t> num_keys_for_row;
    num_keys_for_row.resize(columns_per_device.front().join_columns[0].num_elems);

    approximate_distinct_tuples_range(hll_result,
                                      num_keys_for_row,
                                      count_distinct_desc.bitmap_sz_bits,
                                      padded_size_bytes,
                                      columns_per_device.front().join_columns,
                                      columns_per_device.front().join_column_types,
                                      columns_per_device.front().join_buckets,
                                      isInnerColCompressed(),
                                      thread_count);

    for (int i = 1; i < thread_count; ++i) {
      hll_unify(hll_result,
                hll_result + i * padded_size_bytes,
                1 << count_distinct_desc.bitmap_sz_bits);
    }
    return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits),
                          num_keys_for_row.size() > 0 ? num_keys_for_row.back() : 0);
  }
#ifdef HAVE_CUDA
  auto& data_mgr = executor_->getCatalog()->getDataMgr();
  std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
  for (auto& host_hll_buffer : host_hll_buffers) {
    host_hll_buffer.resize(count_distinct_desc.bitmapPaddedSizeBytes());
  }
  std::vector<size_t> emitted_keys_count_device_threads(device_count_, 0);
  std::vector<std::future<void>> approximate_distinct_device_threads;
  for (int device_id = 0; device_id < device_count_; ++device_id) {
    approximate_distinct_device_threads.emplace_back(std::async(
        std::launch::async,
        [device_id,
         &columns_per_device,
         &count_distinct_desc,
         &data_mgr,
         &host_hll_buffers,
         &emitted_keys_count_device_threads,
         this] {
          auto allocator = std::make_unique<CudaAllocator>(
              &data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
          auto device_hll_buffer =
              allocator->alloc(count_distinct_desc.bitmapPaddedSizeBytes());
          data_mgr.getCudaMgr()->zeroDeviceMem(
              device_hll_buffer,
              count_distinct_desc.bitmapPaddedSizeBytes(),
              device_id,
              getQueryEngineCudaStreamForDevice(device_id));
          const auto& columns_for_device = columns_per_device[device_id];
          auto join_columns_gpu = transfer_vector_of_flat_objects_to_gpu(
              columns_for_device.join_columns, *allocator);

          CHECK_GT(columns_for_device.join_buckets.size(), 0u);
          const auto& bucket_sizes_for_dimension =
              columns_for_device.join_buckets[0].inverse_bucket_sizes_for_dimension;
          auto bucket_sizes_gpu =
              allocator->alloc(bucket_sizes_for_dimension.size() * sizeof(double));
          allocator->copyToDevice(bucket_sizes_gpu,
                                  bucket_sizes_for_dimension.data(),
                                  bucket_sizes_for_dimension.size() * sizeof(double));
          const size_t row_counts_buffer_sz =
              columns_per_device.front().join_columns[0].num_elems * sizeof(int32_t);
          auto row_counts_buffer = allocator->alloc(row_counts_buffer_sz);
          data_mgr.getCudaMgr()->zeroDeviceMem(
              row_counts_buffer,
              row_counts_buffer_sz,
              device_id,
              getQueryEngineCudaStreamForDevice(device_id));
          const auto key_handler =
              RangeKeyHandler(isInnerColCompressed(),
                              bucket_sizes_for_dimension.size(),
                              join_columns_gpu,
                              reinterpret_cast<double*>(bucket_sizes_gpu));
          const auto key_handler_gpu =
              transfer_flat_object_to_gpu(key_handler, *allocator);
          approximate_distinct_tuples_on_device_range(
              reinterpret_cast<uint8_t*>(device_hll_buffer),
              count_distinct_desc.bitmap_sz_bits,
              reinterpret_cast<int32_t*>(row_counts_buffer),
              key_handler_gpu,
              columns_for_device.join_columns[0].num_elems,
              executor_->blockSize(),
              executor_->gridSize());

          auto& host_emitted_keys_count = emitted_keys_count_device_threads[device_id];
          allocator->copyFromDevice(
              &host_emitted_keys_count,
              row_counts_buffer +
                  (columns_per_device.front().join_columns[0].num_elems - 1) *
                      sizeof(int32_t),
              sizeof(int32_t));

          auto& host_hll_buffer = host_hll_buffers[device_id];
          allocator->copyFromDevice(&host_hll_buffer[0],
                                    device_hll_buffer,
                                    count_distinct_desc.bitmapPaddedSizeBytes());
        }));
  }
  for (auto& child : approximate_distinct_device_threads) {
    child.get();
  }
  CHECK_EQ(Data_Namespace::GPU_LEVEL, effective_memory_level_);
  auto& result_hll_buffer = host_hll_buffers.front();
  auto hll_result = reinterpret_cast<int32_t*>(&result_hll_buffer[0]);
  for (int device_id = 1; device_id < device_count_; ++device_id) {
    auto& host_hll_buffer = host_hll_buffers[device_id];
    hll_unify(hll_result,
              reinterpret_cast<int32_t*>(&host_hll_buffer[0]),
              1 << count_distinct_desc.bitmap_sz_bits);
  }
  size_t emitted_keys_count = 0;
  for (auto& emitted_keys_count_device : emitted_keys_count_device_threads) {
    emitted_keys_count += emitted_keys_count_device;
  }
  return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits),
                        emitted_keys_count);
#else
  UNREACHABLE();
  return {0, 0};
#endif  // HAVE_CUDA
}
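The CPU path above builds one HyperLogLog sketch per thread and then folds them together with hll_unify before reading the cardinality estimate with hll_size. A minimal sketch of the merge semantics, assuming the standard HLL register representation (elementwise max of register values) rather than the engine's actual implementation:

  #include <algorithm>
  #include <cstdint>

  // Union of two HLL sketches with m registers each: every register holds
  // the maximum rank observed for its bucket, so the union keeps the larger.
  void hll_unify_sketch(uint8_t* lhs, const uint8_t* rhs, const size_t m) {
    for (size_t i = 0; i < m; ++i) {
      lhs[i] = std::max(lhs[i], rhs[i]);
    }
  }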


llvm::Value * RangeJoinHashTable::codegenKey ( const CompilationOptions &  co,
llvm::Value *  offset 
)

Definition at line 709 of file RangeJoinHashTable.cpp.

References CHECK, CHECK_EQ, CodeGenerator::codegen(), OverlapsJoinHashTable::executor_, get_int_type(), OverlapsJoinHashTable::getKeyComponentCount(), OverlapsJoinHashTable::getKeyComponentWidth(), OverlapsJoinHashTable::inner_outer_pairs_, OverlapsJoinHashTable::inverse_bucket_sizes_for_dimension_, isProbeCompressed(), kPOINT, kTINYINT, LL_BUILDER, LL_CONTEXT, LL_FP, LL_INT, and CodeGenerator::posArg().

Referenced by codegenMatchingSetWithOffset().

{
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);
  const auto key_size_lv = LL_INT(getKeyComponentCount() * key_component_width);
  llvm::Value* key_buff_lv{nullptr};
  switch (key_component_width) {
    case 4:
      key_buff_lv =
          LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv);
      break;
    case 8:
      key_buff_lv =
          LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
      break;
    default:
      CHECK(false);
  }

  const auto& inner_outer_pair = inner_outer_pairs_[0];
  const auto outer_col = inner_outer_pair.second;
  const auto outer_col_ti = outer_col->get_type_info();

  if (outer_col_ti.is_geometry()) {
    CodeGenerator code_generator(executor_);
    // TODO(adb): for points we will use the coords array, but for other
    // geometries we will need to use the bounding box. For now only support
    // points.
    CHECK_EQ(outer_col_ti.get_type(), kPOINT);
    CHECK_EQ(inverse_bucket_sizes_for_dimension_.size(), static_cast<size_t>(2));

    const auto col_lvs = code_generator.codegen(outer_col, true, co);
    CHECK_EQ(col_lvs.size(), size_t(1));

    const auto outer_col_var = dynamic_cast<const Analyzer::ColumnVar*>(outer_col);
    CHECK(outer_col_var);

    const auto coords_cd = executor_->getCatalog()->getMetadataForColumn(
        outer_col_var->get_table_id(), outer_col_var->get_column_id() + 1);
    CHECK(coords_cd);

    const auto array_ptr = executor_->cgen_state_->emitExternalCall(
        "array_buff",
        llvm::Type::getInt8PtrTy(executor_->cgen_state_->context_),
        {col_lvs.front(), code_generator.posArg(outer_col)});
    CHECK(coords_cd->columnType.get_elem_type().get_type() == kTINYINT)
        << "Only TINYINT coordinates columns are supported in geo overlaps "
           "hash join.";

    const auto arr_ptr =
        code_generator.castArrayPointer(array_ptr, coords_cd->columnType.get_elem_type());

    // load and unpack offsets
    const auto offset =
        LL_BUILDER.CreateLoad(offset_ptr->getType()->getPointerElementType(),
                              offset_ptr,
                              "packed_bucket_offset");
    const auto x_offset =
        LL_BUILDER.CreateTrunc(offset, llvm::Type::getInt32Ty(LL_CONTEXT));

    const auto y_offset_shifted =
        LL_BUILDER.CreateLShr(offset, LL_INT(static_cast<int64_t>(32)));
    const auto y_offset =
        LL_BUILDER.CreateTrunc(y_offset_shifted, llvm::Type::getInt32Ty(LL_CONTEXT));

    const auto x_bucket_offset =
        LL_BUILDER.CreateSExt(x_offset, llvm::Type::getInt64Ty(LL_CONTEXT));
    const auto y_bucket_offset =
        LL_BUILDER.CreateSExt(y_offset, llvm::Type::getInt64Ty(LL_CONTEXT));

    for (size_t i = 0; i < 2; i++) {
      const auto key_comp_dest_lv = LL_BUILDER.CreateGEP(
          key_buff_lv->getType()->getScalarType()->getPointerElementType(),
          key_buff_lv,
          LL_INT(i));

      const auto funcName = isProbeCompressed() ? "get_bucket_key_for_range_compressed"
                                                : "get_bucket_key_for_range_double";

      // Note that get_bucket_key_for_range_compressed will need to be
      // specialized for future compression schemes
      auto bucket_key = executor_->cgen_state_->emitExternalCall(
          funcName,
          get_int_type(64, LL_CONTEXT),
          {arr_ptr, LL_INT(i), LL_FP(inverse_bucket_sizes_for_dimension_[i])});

      auto bucket_key_shifted = i == 0
                                    ? LL_BUILDER.CreateAdd(x_bucket_offset, bucket_key)
                                    : LL_BUILDER.CreateAdd(y_bucket_offset, bucket_key);

      const auto col_lv = LL_BUILDER.CreateSExt(
          bucket_key_shifted, get_int_type(key_component_width * 8, LL_CONTEXT));
      LL_BUILDER.CreateStore(col_lv, key_comp_dest_lv);
    }
  } else {
    LOG(FATAL) << "Range join key currently only supported for geospatial types.";
  }
  return key_buff_lv;
}
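The IR emitted above decodes a single 64-bit "packed_bucket_offset": the x bucket offset lives in the low 32 bits and the y bucket offset in the high 32 bits, each sign-extended back to 64 bits. An illustrative C++ equivalent of that Trunc/LShr/SExt sequence (not part of the engine):

  #include <cstdint>

  // x offset = sign-extended low 32 bits; y offset = sign-extended high 32 bits.
  void unpack_bucket_offsets(const int64_t packed, int64_t& x_off, int64_t& y_off) {
    x_off = static_cast<int32_t>(packed);        // CreateTrunc + CreateSExt
    y_off = static_cast<int32_t>(packed >> 32);  // CreateLShr + CreateTrunc + CreateSExt
  }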


HashJoinMatchingSet RangeJoinHashTable::codegenMatchingSetWithOffset ( const CompilationOptions &  co,
const size_t  index,
llvm::Value *  range_offset 
)

Definition at line 809 of file RangeJoinHashTable.cpp.

References CHECK, HashJoin::codegenHashTableLoad(), codegenKey(), HashJoin::codegenMatchingSet(), OverlapsJoinHashTable::executor_, get_int_type(), OverlapsJoinHashTable::getComponentBufferSize(), OverlapsJoinHashTable::getEntryCount(), getHashType(), OverlapsJoinHashTable::getKeyComponentCount(), OverlapsJoinHashTable::getKeyComponentWidth(), LL_BUILDER, LL_CONTEXT, LL_INT, OverlapsJoinHashTable::offsetBufferOff(), OneToMany, and to_string().

{
  const auto key_component_width = getKeyComponentWidth();
  CHECK(key_component_width == 4 || key_component_width == 8);

  auto key_buff_lv = codegenKey(co, range_offset);
  CHECK(getHashType() == HashType::OneToMany);

  auto hash_ptr = codegenHashTableLoad(index, executor_);
  const auto composite_dict_ptr_type =
      llvm::Type::getIntNPtrTy(LL_CONTEXT, key_component_width * 8);

  const auto composite_key_dict =
      hash_ptr->getType()->isPointerTy()
          ? LL_BUILDER.CreatePointerCast(hash_ptr, composite_dict_ptr_type)
          : LL_BUILDER.CreateIntToPtr(hash_ptr, composite_dict_ptr_type);

  const auto key_component_count = getKeyComponentCount();

  const auto funcName =
      "get_composite_key_index_" + std::to_string(key_component_width * 8);

  const auto key = executor_->cgen_state_->emitExternalCall(funcName,
                                                            get_int_type(64, LL_CONTEXT),
                                                            {key_buff_lv,
                                                             LL_INT(key_component_count),
                                                             composite_key_dict,
                                                             LL_INT(getEntryCount())});

  auto one_to_many_ptr = hash_ptr;
  if (one_to_many_ptr->getType()->isPointerTy()) {
    one_to_many_ptr =
        LL_BUILDER.CreatePtrToInt(hash_ptr, llvm::Type::getInt64Ty(LL_CONTEXT));
  } else {
    CHECK(one_to_many_ptr->getType()->isIntegerTy(64));
  }
  const auto composite_key_dict_size = offsetBufferOff();
  one_to_many_ptr =
      LL_BUILDER.CreateAdd(one_to_many_ptr, LL_INT(composite_key_dict_size));

  return HashJoin::codegenMatchingSet(
      /* hash_join_idx_args_in */ {one_to_many_ptr,
                                   key,
                                   LL_INT(int64_t(0)),
                                   LL_INT(getEntryCount() - 1)},
      /* is_sharded */ false,
      /* is_nullable */ false,
      /* is_bw_eq */ false,
      /* sub_buff_size */ getComponentBufferSize(),
      /* executor */ executor_);
}
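The matching set returned above addresses the engine's one-to-many layout: past the composite key dictionary (offsetBufferOff()) sit an offset buffer, a count buffer, and a payload buffer of row ids. A hedged host-side sketch of the probe the generated code performs; the buffer parameters are illustrative stand-ins, not the actual API:

  #include <cstdint>
  #include <utility>

  // Given the entry index produced by get_composite_key_index_*, the matching
  // inner row ids are payload[offsets[idx]] .. payload[offsets[idx] + counts[idx] - 1].
  std::pair<const int32_t*, int32_t> matching_rows(const int32_t* offsets,
                                                   const int32_t* counts,
                                                   const int32_t* payload,
                                                   const int64_t entry_idx) {
    return {payload + offsets[entry_idx], counts[entry_idx]};
  }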


std::pair< size_t, size_t > RangeJoinHashTable::computeRangeHashTableCounts ( const size_t  shard_count,
std::vector< ColumnsForDevice > &  columns_per_device 
)
protected

Definition at line 514 of file RangeJoinHashTable.cpp.

References approximateTupleCount(), bucket_threshold_, CHECK, OverlapsJoinHashTable::device_count_, get_entries_per_device(), OverlapsJoinHashTable::inverse_bucket_sizes_for_dimension_, max_hashtable_size_, and OverlapsJoinHashTable::memory_level_.

Referenced by reifyWithLayout().

{
  CHECK(!inverse_bucket_sizes_for_dimension_.empty());
  const auto [tuple_count, emitted_keys_count] =
      approximateTupleCount(inverse_bucket_sizes_for_dimension_,
                            columns_per_device,
                            max_hashtable_size_,
                            bucket_threshold_);
  const auto entry_count = 2 * std::max(tuple_count, size_t(1));

  return std::make_pair(
      get_entries_per_device(entry_count, shard_count, device_count_, memory_level_),
      emitted_keys_count);
}
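Restating the sizing rule in the body above as a standalone helper (a sketch, assuming nothing beyond the code shown): the entry count is twice the approximate distinct tuple count, with a floor of one tuple, before get_entries_per_device splits it across shards and devices.

  #include <algorithm>
  #include <cstddef>

  // entry_count = 2 * max(tuple_count, 1); the factor of two leaves slack
  // around the HLL-estimated key count.
  size_t range_hash_entry_count(const size_t approximate_tuple_count) {
    return 2 * std::max(approximate_tuple_count, size_t(1));
  }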


HashType RangeJoinHashTable::getHashType ( ) const
inline override protected virtual noexcept

Reimplemented from OverlapsJoinHashTable.

Definition at line 92 of file RangeJoinHashTable.h.

References OneToMany.

Referenced by codegenMatchingSetWithOffset().


std::shared_ptr< RangeJoinHashTable > RangeJoinHashTable::getInstance ( const std::shared_ptr< Analyzer::BinOper > condition,
const Analyzer::RangeOper *  range_expr,
const std::vector< InputTableInfo > &  query_infos,
const Data_Namespace::MemoryLevel  memory_level,
const JoinType  join_type,
const int  device_count,
ColumnCacheMap &  column_cache,
Executor *  executor,
const HashTableBuildDagMap &  hashtable_build_dag_map,
const RegisteredQueryHint &  query_hint,
const TableIdToNodeMap &  table_id_to_node_map 
)
static

NOTE(jclay): Handling Range Joins With Mixed Compression:

First, let's take a concrete example of a query that is rewritten as a range join. Notice in the first code block that the condition operator is an Overlaps operator: the LHS is a column, and the RHS is the range operator. For the hash table build and probe to work properly, we need to ensure that the appropriate runtime functions are selected. The following breakdown documents how the appropriate runtime function is selected.

  • The LHS of the RangeOper is used to build the hash table
  • The LHS of the OverlapsOper + the RHS of the RangeOper is used as probe

SELECT count(*) FROM t1, t2 where ST_Distance(t1.p1_comp32, t2.p1) <= 6.3;

BinOper condition

((OVERLAPS) (ColumnVar table: (t1) column: (p1_comp32) GEOMETRY(POINT, 4326) ENCODING COMPRESSED(32)) (RangeOper) (ColumnVar table: (t2) column: (p1) GEOMETRY(POINT, 4326) ENCODING NONE), (Const 6.330000))

RangeOper condition

[(ColumnVar table: 5 (t2) column: 1 rte: 1 GEOMETRY(POINT, 4326) ENCODING NONE), (Const 6.330000)]

Same example as above, annotated:

SELECT count(*) FROM t1, t2 WHERE ST_Distance(
    t1.p1_comp32,   << Overlaps Condition LHS
    t2.p1           << RangeOper LHS
) <= 6.3;           << RangeOper RHS

In this case, we select the uncompressed runtime functions when building the hash table over t2.p1. When performing the probe, we must select the compressed runtime functions.
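A minimal sketch of the compressed-versus-uncompressed selection described above, assuming a GEOINT-style fixed-point encoding of coordinates. The actual runtime functions are get_bucket_key_for_range_compressed and get_bucket_key_for_range_double, whose internals are not reproduced here; the scale factor below is an assumption for illustration:

  #include <cmath>
  #include <cstdint>

  // Assumed GEOINT decoding: map a scaled 32-bit integer back to degrees.
  double decompress_geoint(const int32_t packed) {
    return static_cast<double>(packed) * 180.0 / 2147483647.0;  // assumption
  }

  // Uncompressed side (e.g. building over t2.p1): bucket the double directly.
  int64_t bucket_key_double(const double coord, const double inverse_bucket_size) {
    return static_cast<int64_t>(std::floor(coord * inverse_bucket_size));
  }

  // Compressed side (e.g. probing with t1.p1_comp32): decompress, then bucket.
  int64_t bucket_key_compressed(const int32_t packed, const double inverse_bucket_size) {
    return bucket_key_double(decompress_geoint(packed), inverse_bucket_size);
  }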

Definition at line 71 of file RangeJoinHashTable.cpp.

References cat(), CHECK, HashJoin::checkHashJoinReplicationConstraint(), logger::FATAL, get_inner_query_info(), Analyzer::RangeOper::get_left_operand(), HashJoin::getInnerTableId(), Fragmenter_Namespace::TableInfo::getNumTuplesUpperBound(), BaselineJoinHashTable::getShardCountForCondition(), Data_Namespace::GPU_LEVEL, InputTableInfo::info, LOG, OneToMany, and Analyzer::RangeOper::toString().

Referenced by OverlapsJoinHashTable::getInstance().

{
  // the hash table is built over the LHS of the range oper. we then use the lhs
  // of the bin oper + the rhs of the range oper for the probe
  auto range_expr_col_var =
      dynamic_cast<const Analyzer::ColumnVar*>(range_expr->get_left_operand());
  if (!range_expr_col_var || !range_expr_col_var->get_type_info().is_geometry()) {
    throw HashJoinFail("Could not build hash tables for range join | " +
                       range_expr->toString());
  }
  auto cat = executor->getCatalog();
  CHECK(cat);
  CHECK(range_expr_col_var->get_type_info().is_geometry());

  auto coords_cd = cat->getMetadataForColumn(range_expr_col_var->get_table_id(),
                                             range_expr_col_var->get_column_id() + 1);
  CHECK(coords_cd);

  auto range_join_inner_col_expr =
      makeExpr<Analyzer::ColumnVar>(coords_cd->columnType,
                                    coords_cd->tableId,
                                    coords_cd->columnId,
                                    range_expr_col_var->get_rte_idx());

  std::vector<InnerOuter> inner_outer_pairs;
  inner_outer_pairs.emplace_back(
      InnerOuter{dynamic_cast<Analyzer::ColumnVar*>(range_join_inner_col_expr.get()),
                 condition->get_left_operand()});

  const auto& query_info =
      get_inner_query_info(HashJoin::getInnerTableId(inner_outer_pairs), query_infos)
          .info;

  const auto total_entries = 2 * query_info.getNumTuplesUpperBound();
  if (total_entries > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
    throw TooManyHashEntries();
  }

  const auto shard_count = memory_level == Data_Namespace::GPU_LEVEL
                               ? BaselineJoinHashTable::getShardCountForCondition(
                                     condition.get(), executor, inner_outer_pairs)
                               : 0;

  auto join_hash_table = std::make_shared<RangeJoinHashTable>(condition,
                                                              join_type,
                                                              range_expr,
                                                              range_join_inner_col_expr,
                                                              query_infos,
                                                              memory_level,
                                                              column_cache,
                                                              executor,
                                                              inner_outer_pairs,
                                                              device_count,
                                                              hashtable_build_dag_map,
                                                              table_id_to_node_map);
  HashJoin::checkHashJoinReplicationConstraint(
      HashJoin::getInnerTableId(inner_outer_pairs), shard_count, executor);
  try {
    join_hash_table->reifyWithLayout(HashType::OneToMany);
  } catch (const HashJoinFail& e) {
    throw HashJoinFail(std::string("Could not build a 1-to-1 correspondence for columns "
                                   "involved in equijoin | ") +
                       e.what());
  } catch (const ColumnarConversionNotSupported& e) {
    throw HashJoinFail(std::string("Could not build hash tables for equijoin | ") +
                       e.what());
  } catch (const std::exception& e) {
    LOG(FATAL) << "Fatal error while attempting to build hash tables for join: "
               << e.what();
  }

  return join_hash_table;
}


std::shared_ptr< BaselineHashTable > RangeJoinHashTable::initHashTableOnCpu ( const std::vector< JoinColumn > &  join_columns,
const std::vector< JoinColumnTypeInfo > &  join_column_types,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const HashType  layout,
const size_t  entry_count,
const size_t  emitted_keys_count 
)
protected

Definition at line 456 of file RangeJoinHashTable.cpp.

References CHECK, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER, DEBUG_TIMER, OverlapsJoinHashTable::executor_, HashJoin::getCompositeKeyInfo(), OverlapsJoinHashTable::getKeyComponentCount(), OverlapsJoinHashTable::getKeyComponentWidth(), OverlapsJoinHashTable::hashtable_cache_key_, OverlapsJoinHashTable::inner_outer_pairs_, isInnerColCompressed(), OverlapsJoinHashTable::join_type_, HashJoin::layoutRequiresAdditionalBuffers(), OVERLAPS_HT, OverlapsJoinHashTable::putHashTableOnCpuToCache(), and to_string().

Referenced by reifyForDevice().

{
  auto timer = DEBUG_TIMER(__func__);
  decltype(std::chrono::steady_clock::now()) ts1, ts2;
  ts1 = std::chrono::steady_clock::now();
  const auto composite_key_info =
      HashJoin::getCompositeKeyInfo(inner_outer_pairs_, executor_);
  CHECK(!join_columns.empty());
  CHECK(!join_bucket_info.empty());

  const auto key_component_count =
      join_bucket_info[0].inverse_bucket_sizes_for_dimension.size();

  auto key_handler =
      RangeKeyHandler(isInnerColCompressed(),
                      key_component_count,
                      &join_columns[0],
                      join_bucket_info[0].inverse_bucket_sizes_for_dimension.data());

  BaselineJoinHashTableBuilder builder;
  const StrProxyTranslationMapsPtrsAndOffsets
      dummy_str_proxy_translation_maps_ptrs_and_offsets;
  const auto err =
      builder.initHashTableOnCpu(&key_handler,
                                 composite_key_info,
                                 join_columns,
                                 join_column_types,
                                 join_bucket_info,
                                 dummy_str_proxy_translation_maps_ptrs_and_offsets,
                                 entry_count,
                                 emitted_keys_count,
                                 layout,
                                 join_type_,
                                 getKeyComponentWidth(),
                                 getKeyComponentCount());
  ts2 = std::chrono::steady_clock::now();
  if (err) {
    throw HashJoinFail(std::string("Unrecognized error when initializing CPU "
                                   "range join hash table (") +
                       std::to_string(err) + std::string(")"));
  }
  std::shared_ptr<BaselineHashTable> hash_table = builder.getHashTable();
  auto hashtable_build_time =
      std::chrono::duration_cast<std::chrono::milliseconds>(ts2 - ts1).count();
  putHashTableOnCpuToCache(hashtable_cache_key_.front(),
                           CacheItemType::OVERLAPS_HT,
                           hash_table,
                           DataRecyclerUtil::CPU_DEVICE_IDENTIFIER,
                           hashtable_build_time);
  return hash_table;
}


bool RangeJoinHashTable::isInnerColCompressed ( ) const
inline private

Definition at line 112 of file RangeJoinHashTable.h.

References SQLTypeInfo::get_compression(), Analyzer::RangeOper::get_left_operand(), Analyzer::Expr::get_type_info(), kENCODING_GEOINT, and range_expr_.

Referenced by approximateTupleCount(), and initHashTableOnCpu().

{
  return range_expr_->get_left_operand()->get_type_info().get_compression() ==
         kENCODING_GEOINT;
}


bool RangeJoinHashTable::isProbeCompressed ( ) const
inline private

Definition at line 117 of file RangeJoinHashTable.h.

References OverlapsJoinHashTable::getInnerOuterPairs(), and kENCODING_GEOINT.

Referenced by codegenKey().

{
  const auto& inner_outer_pair = getInnerOuterPairs()[0];
  const auto outer_col = inner_outer_pair.second;
  const auto outer_col_ti = outer_col->get_type_info();

  return outer_col_ti.get_compression() == kENCODING_GEOINT;
}


void RangeJoinHashTable::reifyForDevice ( const ColumnsForDevice columns_for_device,
const HashType  layout,
const size_t  entry_count,
const size_t  emitted_keys_count,
const int  device_id,
const logger::ThreadId  parent_thread_id 
)
protected

Definition at line 352 of file RangeJoinHashTable.cpp.

References CHECK, CHECK_EQ, CHECK_LT, Data_Namespace::CPU_LEVEL, DEBUG_TIMER_NEW_THREAD, effective_memory_level_, OverlapsJoinHashTable::getKeyComponentWidth(), Data_Namespace::GPU_LEVEL, HashJoin::hash_tables_for_device_, initHashTableOnCpu(), ColumnsForDevice::join_buckets, ColumnsForDevice::join_column_types, ColumnsForDevice::join_columns, HashJoin::layoutRequiresAdditionalBuffers(), OverlapsJoinHashTable::memory_level_, UNREACHABLE, and VLOG.

Referenced by reifyWithLayout().

{
  DEBUG_TIMER_NEW_THREAD(parent_thread_id);
  CHECK_EQ(getKeyComponentWidth(), size_t(8));
  CHECK(layoutRequiresAdditionalBuffers(layout));

  if (effective_memory_level_ == Data_Namespace::CPU_LEVEL) {
    VLOG(1) << "Building range join hash table on CPU.";
    auto hash_table = initHashTableOnCpu(columns_for_device.join_columns,
                                         columns_for_device.join_column_types,
                                         columns_for_device.join_buckets,
                                         layout,
                                         entry_count,
                                         emitted_keys_count);
    CHECK(hash_table);

#ifdef HAVE_CUDA
    if (memory_level_ == Data_Namespace::GPU_LEVEL) {
      auto gpu_hash_table = copyCpuHashTableToGpu(
          hash_table, layout, entry_count, emitted_keys_count, device_id);
      CHECK_LT(size_t(device_id), hash_tables_for_device_.size());
      hash_tables_for_device_[device_id] = std::move(gpu_hash_table);
    } else {
#else
    CHECK_EQ(Data_Namespace::CPU_LEVEL, memory_level_);
#endif
      CHECK_EQ(hash_tables_for_device_.size(), size_t(1));
      hash_tables_for_device_[0] = std::move(hash_table);
#ifdef HAVE_CUDA
    }
#endif
  } else {
#ifdef HAVE_CUDA
    auto hash_table = initHashTableOnGpu(columns_for_device.join_columns,
                                         columns_for_device.join_column_types,
                                         columns_for_device.join_buckets,
                                         layout,
                                         entry_count,
                                         emitted_keys_count,
                                         device_id);
    CHECK_LT(size_t(device_id), hash_tables_for_device_.size());
    hash_tables_for_device_[device_id] = std::move(hash_table);
#else
    UNREACHABLE();
#endif
  }
}
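A condensed sketch of the dispatch implemented above, with stand-in helpers for initHashTableOnCpu, copyCpuHashTableToGpu, and initHashTableOnGpu: at CPU effective memory level the table is always built on the host and only copied to a GPU when the target memory level is GPU; otherwise it is built directly on the device.

  #include <memory>

  enum class MemLevel { CPU_LEVEL, GPU_LEVEL };
  struct HashTableStub {};

  // Stand-ins for the member functions named above.
  std::shared_ptr<HashTableStub> build_on_cpu() {
    return std::make_shared<HashTableStub>();
  }
  std::shared_ptr<HashTableStub> copy_cpu_to_gpu(std::shared_ptr<HashTableStub> t,
                                                 const int /*device_id*/) {
    return t;
  }
  std::shared_ptr<HashTableStub> build_on_gpu(const int /*device_id*/) {
    return std::make_shared<HashTableStub>();
  }

  std::shared_ptr<HashTableStub> reify_sketch(const MemLevel effective,
                                              const MemLevel target,
                                              const int device_id) {
    if (effective == MemLevel::CPU_LEVEL) {
      auto cpu_table = build_on_cpu();  // host build (possibly cache-backed)
      return target == MemLevel::GPU_LEVEL ? copy_cpu_to_gpu(cpu_table, device_id)
                                           : cpu_table;
    }
    return build_on_gpu(device_id);  // GPU-resident build path
  }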


void RangeJoinHashTable::reifyWithLayout ( const HashType  layout)
override protected virtual

Reimplemented from OverlapsJoinHashTable.

Definition at line 155 of file RangeJoinHashTable.cpp.

References threading_serial::async(), bucket_threshold_, CompositeKeyInfo::cache_key_chunks, OverlapsJoinHashTable::calculateHashTableSize(), CHECK, CHECK_EQ, CHECK_LT, HashJoin::collectFragmentIds(), OverlapsJoinHashTable::composite_key_info_, computeRangeHashTableCounts(), OverlapsJoinHashTable::condition_, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER, OverlapsJoinHashTable::cpu_hash_table_buff_mutex_, Data_Namespace::CPU_LEVEL, DEBUG_TIMER, OverlapsJoinHashTable::device_count_, effective_memory_level_, OverlapsJoinHashTable::executor_, OverlapsJoinHashTable::fetchColumnsForDevice(), OverlapsJoinHashTable::generateCacheKey(), get_inner_query_info(), Analyzer::RangeOper::get_left_operand(), Analyzer::RangeOper::get_right_operand(), OverlapsJoinHashTable::getAlternativeCacheKey(), DataRecyclerUtil::getAlternativeTableKeys(), OverlapsJoinHashTable::getEffectiveMemoryLevel(), HashtableRecycler::getHashtableAccessPathInfo(), HashJoin::getHashTypeString(), HashJoin::getInnerTableId(), OverlapsJoinHashTable::getInnerTableId(), getQueryEngineCudaStreamForDevice(), Data_Namespace::GPU_LEVEL, OverlapsJoinHashTable::hash_table_cache_, HashJoin::hash_tables_for_device_, OverlapsJoinHashTable::hashtable_build_dag_map_, OverlapsJoinHashTable::hashtable_cache_key_, InputTableInfo::info, OverlapsJoinHashTable::initHashTableOnCpuFromCache(), OverlapsJoinHashTable::inner_outer_pairs_, OverlapsJoinHashTable::inverse_bucket_sizes_for_dimension_, HashtableRecycler::isInvalidHashTableCacheKey(), OverlapsJoinHashTable::join_type_, OverlapsJoinHashTable::layout_override_, ManyToMany, max_hashtable_size_, OverlapsJoinHashTable::memory_level_, OneToMany, only_shards_for_device(), OVERLAPS_HT, OverlapsJoinHashTable::query_infos_, range_expr_, reifyForDevice(), OverlapsJoinHashTable::setInverseBucketSizeInfo(), OverlapsJoinHashTable::setOverlapsHashtableMetaInfo(), OverlapsJoinHashTable::shardCount(), OverlapsJoinHashTable::table_keys_, logger::thread_id(), UNREACHABLE, and VLOG.

155  {
156  auto timer = DEBUG_TIMER(__func__);
157  CHECK(layout == HashType::OneToMany);
158 
159  const auto& query_info =
161  .info;
162 
163  if (query_info.fragments.empty()) {
164  return;
165  }
166 
167  VLOG(1) << "Reify with layout " << getHashTypeString(layout)
168  << "for table_id: " << getInnerTableId();
169 
170  std::vector<ColumnsForDevice> columns_per_device;
171  const auto catalog = executor_->getCatalog();
172  CHECK(catalog);
173 
174  auto& data_mgr = catalog->getDataMgr();
175  std::vector<std::vector<Fragmenter_Namespace::FragmentInfo>> fragments_per_device;
176  std::vector<std::unique_ptr<CudaAllocator>> dev_buff_owners;
177  const auto shard_count = shardCount();
178  for (int device_id = 0; device_id < device_count_; ++device_id) {
179  fragments_per_device.emplace_back(
180  shard_count
181  ? only_shards_for_device(query_info.fragments, device_id, device_count_)
182  : query_info.fragments);
184  dev_buff_owners.emplace_back(std::make_unique<CudaAllocator>(
185  &data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id)));
186  }
187  // for overlaps join, we need to fetch columns regardless of the availability of
188  // cached hash table to calculate various params, i.e., bucket size info todo
189  // (yoonmin) : relax this
190  const auto columns_for_device =
191  fetchColumnsForDevice(fragments_per_device[device_id],
192  device_id,
194  ? dev_buff_owners[device_id].get()
195  : nullptr);
196  columns_per_device.push_back(columns_for_device);
197  }
198 
200 
201  const auto bucket_range =
202  dynamic_cast<const Analyzer::Constant*>(range_expr_->get_right_operand());
203 
204  CHECK(bucket_range);
205  CHECK(bucket_range->get_type_info().is_fp() &&
206  bucket_range->get_type_info().get_size() == 8); // TODO
207 
208  const auto bucket_range_datum = bucket_range->get_constval();
209 
210  inverse_bucket_sizes_for_dimension_.emplace_back(1. / bucket_range_datum.doubleval);
211  inverse_bucket_sizes_for_dimension_.emplace_back(1. / bucket_range_datum.doubleval);
212 
214  inverse_bucket_sizes_for_dimension_, columns_per_device, device_count_);
215 
217 
218  // to properly lookup cached hash table, we need to use join columns listed as lhs and
219  // rhs of the overlaps op instead of physical (and hidden) column tailored to range join
220  // expr in other words, we need to use geometry column (point) instead of its hidden
221  // array column i.e., see `get_physical_cols` function
222  std::vector<InnerOuter> inner_outer_pairs_for_cache_lookup;
223  inner_outer_pairs_for_cache_lookup.emplace_back(InnerOuter{
224  dynamic_cast<const Analyzer::ColumnVar*>(range_expr_->get_left_operand()),
225  condition_->get_left_operand()});
226  auto hashtable_access_path_info =
227  HashtableRecycler::getHashtableAccessPathInfo(inner_outer_pairs_for_cache_lookup,
228  {},
229  condition_->get_optype(),
230  join_type_,
233  shard_count,
234  fragments_per_device,
235  executor_);
236  hashtable_cache_key_ = hashtable_access_path_info.hashed_query_plan_dag;
237  table_keys_ = hashtable_access_path_info.table_keys;
238 
239  auto get_inner_table_id = [&inner_outer_pairs_for_cache_lookup]() {
240  return inner_outer_pairs_for_cache_lookup.front().first->get_table_id();
241  };
242 
243  if (table_keys_.empty()) {
246  executor_->getCatalog()->getDatabaseId(),
247  get_inner_table_id());
248  }
249  CHECK(!table_keys_.empty());
250 
256  fragments_per_device,
257  device_count_);
258 
259  if (HashtableRecycler::isInvalidHashTableCacheKey(hashtable_cache_key_) &&
260  get_inner_table_id() > 0) {
261  std::vector<size_t> per_device_chunk_key;
262  for (int device_id = 0; device_id < device_count_; ++device_id) {
263  auto chunk_key_hash = boost::hash_value(composite_key_info_.cache_key_chunks);
264  boost::hash_combine(chunk_key_hash,
265  HashJoin::collectFragmentIds(fragments_per_device[device_id]));
266  per_device_chunk_key.push_back(chunk_key_hash);
267  AlternativeCacheKeyForOverlapsHashJoin cache_key{
268  inner_outer_pairs_for_cache_lookup,
269  columns_per_device.front().join_columns.front().num_elems,
270  chunk_key_hash,
271  condition_->get_optype(),
274  {}};
275  hashtable_cache_key_[device_id] = getAlternativeCacheKey(cache_key);
276  hash_table_cache_->addQueryPlanDagForTableKeys(hashtable_cache_key_[device_id],
277  table_keys_);
278  }
279  }

  if (effective_memory_level_ == Data_Namespace::MemoryLevel::CPU_LEVEL) {
    std::lock_guard<std::mutex> cpu_hash_table_buff_lock(cpu_hash_table_buff_mutex_);
    if (auto generic_hash_table =
            initHashTableOnCpuFromCache(hashtable_cache_key_.front(),
                                        CacheItemType::OVERLAPS_HT,
                                        DataRecyclerUtil::CPU_DEVICE_IDENTIFIER)) {
      if (auto hash_table =
              std::dynamic_pointer_cast<BaselineHashTable>(generic_hash_table)) {
        // See if a hash table of a different layout was returned.
        // If it was OneToMany, we can reuse it on ManyToMany.
        if (layout == HashType::ManyToMany &&
            hash_table->getLayout() == HashType::OneToMany) {
          // use the cached hash table
          layout_override_ = HashType::ManyToMany;
        }

        if (memory_level_ == Data_Namespace::MemoryLevel::GPU_LEVEL) {
#ifdef HAVE_CUDA
          for (int device_id = 0; device_id < device_count_; ++device_id) {
            auto gpu_hash_table = copyCpuHashTableToGpu(hash_table,
                                                        layout,
                                                        hash_table->getEntryCount(),
                                                        hash_table->getEmittedKeysCount(),
                                                        device_id);
            CHECK_LT(size_t(device_id), hash_tables_for_device_.size());
            hash_tables_for_device_[device_id] = std::move(gpu_hash_table);
          }
#else
          UNREACHABLE();
#endif
        } else {
          CHECK(hash_table);
          CHECK_EQ(hash_tables_for_device_.size(), size_t(1));
          // do not move hash_table, to keep a valid ptr to it within the hash
          // table recycler
          hash_tables_for_device_[0] = hash_table;
        }
        return;
      }
    }
  }
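  // Editorial note (not in the source): reaching this point means no usable
  // cached hash table was found, so the table is built from scratch below.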

  auto [entry_count, emitted_keys_count] =
      computeRangeHashTableCounts(shard_count, columns_per_device);

  size_t hash_table_size = OverlapsJoinHashTable::calculateHashTableSize(
      inverse_bucket_sizes_for_dimension_.size(), emitted_keys_count, entry_count);

  VLOG(1) << "Finalized range join hash table: entry count " << entry_count
          << " hash table size " << hash_table_size;

  std::vector<std::future<void>> init_threads;
  for (int device_id = 0; device_id < device_count_; ++device_id) {
    init_threads.push_back(
        std::async(std::launch::async,
                   &RangeJoinHashTable::reifyForDevice,
                   this,
                   /* columns_for_device */ columns_per_device[device_id],
                   /* layout_type */ layout,
                   /* entry_count */ entry_count,
                   /* emitted_keys_count */ emitted_keys_count,
                   /* device_id */ device_id,
                   /* parent_thread_id */ logger::thread_id()));
  }
  for (auto& init_thread : init_threads) {
    init_thread.wait();
  }
  for (auto& init_thread : init_threads) {
    init_thread.get();
  }
}
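
The tail of reifyWithLayout uses a common fan-out pattern: one asynchronous task per device, a wait() pass so every task has finished, then a get() pass so any exception stored by a task propagates only after all tasks are done. A minimal standalone sketch of that pattern follows; build_for_device and device_count are illustrative stand-ins, not OmniSciDB APIs.

#include <future>
#include <iostream>
#include <vector>

// Hypothetical per-device work; stands in for RangeJoinHashTable::reifyForDevice.
void build_for_device(int device_id) {
  std::cout << "building hash table for device " << device_id << '\n';
}

int main() {
  const int device_count = 4;  // hypothetical
  std::vector<std::future<void>> init_threads;
  for (int device_id = 0; device_id < device_count; ++device_id) {
    init_threads.push_back(
        std::async(std::launch::async, build_for_device, device_id));
  }
  // First wait for all tasks, so no task is still running when an exception
  // from another task unwinds the stack...
  for (auto& t : init_threads) {
    t.wait();
  }
  // ...then call get(), which rethrows any exception a task stored.
  for (auto& t : init_threads) {
    t.get();
  }
}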

+ Here is the call graph for this function:

Member Data Documentation

const double RangeJoinHashTable::bucket_threshold_ {std::numeric_limits<double>::max()}
private

Definition at line 128 of file RangeJoinHashTable.h.

Referenced by computeRangeHashTableCounts(), and reifyWithLayout().

Data_Namespace::MemoryLevel RangeJoinHashTable::effective_memory_level_
private

Definition at line 127 of file RangeJoinHashTable.h.

Referenced by approximateTupleCount(), reifyForDevice(), and reifyWithLayout().

std::shared_ptr<Analyzer::ColumnVar> RangeJoinHashTable::inner_col_expr_
private

Definition at line 126 of file RangeJoinHashTable.h.

const size_t RangeJoinHashTable::max_hashtable_size_ {std::numeric_limits<size_t>::max()}
private

Definition at line 129 of file RangeJoinHashTable.h.

Referenced by computeRangeHashTableCounts(), and reifyWithLayout().

const Analyzer::RangeOper* RangeJoinHashTable::range_expr_
private

Definition at line 125 of file RangeJoinHashTable.h.

Referenced by isInnerColCompressed(), and reifyWithLayout().
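
As the reifyWithLayout listing above shows, the rhs of range_expr_ is an 8-byte floating-point constant giving the range threshold, and its inverse becomes the bucket size for each of the two point dimensions. A minimal standalone sketch of that arithmetic; the threshold value is hypothetical (e.g., from a predicate along the lines of ST_Distance(a.pt, b.pt) <= 0.1).

#include <iostream>
#include <vector>

int main() {
  const double bucket_range = 0.1;  // hypothetical rhs constant of the RangeOper
  std::vector<double> inverse_bucket_sizes_for_dimension;
  // One entry per point dimension (x and y), mirroring the listing above.
  inverse_bucket_sizes_for_dimension.emplace_back(1. / bucket_range);
  inverse_bucket_sizes_for_dimension.emplace_back(1. / bucket_range);
  for (const auto inv : inverse_bucket_sizes_for_dimension) {
    std::cout << "inverse bucket size: " << inv << '\n';  // prints 10 twice
  }
  return 0;
}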


The documentation for this class was generated from the following files:

QueryEngine/JoinHashTable/RangeJoinHashTable.h
QueryEngine/JoinHashTable/RangeJoinHashTable.cpp