OmniSciDB  c07336695a
BaselineJoinHashTable Class Reference

#include <BaselineJoinHashTable.h>


Classes

struct  ColumnsForDevice
 
struct  CompositeKeyInfo
 
struct  HashTableCacheKey
 
struct  HashTableCacheValue
 

Public Member Functions

int64_t getJoinHashBuffer (const ExecutorDeviceType device_type, const int device_id) noexcept override
 
llvm::Value * codegenSlot (const CompilationOptions &, const size_t) override
 
HashJoinMatchingSet codegenMatchingSet (const CompilationOptions &, const size_t) override
 
int getInnerTableId () const noexcept override
 
int getInnerTableRteIdx () const noexcept override
 
JoinHashTableInterface::HashType getHashType () const noexcept override
 
size_t offsetBufferOff () const noexcept override
 
size_t countBufferOff () const noexcept override
 
size_t payloadBufferOff () const noexcept override
 
virtual ~BaselineJoinHashTable ()
 

Static Public Member Functions

static std::shared_ptr< BaselineJoinHashTable > getInstance (const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_map, Executor *executor)
 
static size_t getShardCountForCondition (const Analyzer::BinOper *condition, const Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
 
static auto yieldCacheInvalidator () -> std::function< void()>
 

Protected Types

typedef std::pair< const int8_t *, size_t > LinearizedColumn
 
typedef std::pair< int, int > LinearizedColumnCacheKey
 

Protected Member Functions

 BaselineJoinHashTable (const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const size_t entry_count, ColumnCacheMap &column_map, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
 
virtual void reifyWithLayout (const int device_count, const JoinHashTableInterface::HashType layout)
 
virtual ColumnsForDevice fetchColumnsForDevice (const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id)
 
virtual std::pair< size_t, size_t > approximateTupleCount (const std::vector< ColumnsForDevice > &) const
 
virtual size_t getKeyComponentWidth () const
 
virtual size_t getKeyComponentCount () const
 
virtual int initHashTableOnCpu (const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const JoinHashTableInterface::HashType layout)
 
virtual int initHashTableOnGpu (const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const JoinHashTableInterface::HashType layout, const size_t key_component_width, const size_t key_component_count, const int device_id)
 
virtual llvm::Value * codegenKey (const CompilationOptions &)
 
std::pair< const int8_t *, size_t > getAllColumnFragments (const Analyzer::ColumnVar &hash_col, const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner)
 
size_t shardCount () const
 
Data_Namespace::MemoryLevel getEffectiveMemoryLevel (const std::vector< InnerOuter > &inner_outer_pairs) const
 
CompositeKeyInfo getCompositeKeyInfo () const
 
void reify (const int device_count)
 
JoinColumn fetchColumn (const Analyzer::ColumnVar *inner_col, const Data_Namespace::MemoryLevel &effective_memory_level, const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, const int device_id)
 
void reifyForDevice (const ColumnsForDevice &columns_for_device, const JoinHashTableInterface::HashType layout, const int device_id)
 
void checkHashJoinReplicationConstraint (const int table_id) const
 
int initHashTableForDevice (const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_buckets, const JoinHashTableInterface::HashType layout, const Data_Namespace::MemoryLevel effective_memory_level, const int device_id)
 
llvm::Value * hashPtr (const size_t index)
 
void initHashTableOnCpuFromCache (const HashTableCacheKey &)
 
void putHashTableOnCpuToCache (const HashTableCacheKey &)
 
std::pair< ssize_t, size_t > getApproximateTupleCountFromCache (const HashTableCacheKey &) const
 
bool isBitwiseEq () const
 
void freeHashBufferMemory ()
 
void freeHashBufferGpuMemory ()
 
void freeHashBufferCpuMemory ()
 
const HashTableCacheValue * findHashTableOnCpuInCache (const HashTableCacheKey &)
 

Static Protected Member Functions

static int getInnerTableId (const std::vector< InnerOuter > &inner_outer_pairs)
 

Protected Attributes

const std::shared_ptr< Analyzer::BinOper > condition_
 
const std::vector< InputTableInfo > & query_infos_
 
const Data_Namespace::MemoryLevel memory_level_
 
JoinHashTableInterface::HashType layout_
 
size_t entry_count_
 
size_t emitted_keys_count_
 
Executor * executor_
 
ColumnCacheMap & column_cache_
 
std::shared_ptr< std::vector< int8_t > > cpu_hash_table_buff_
 
std::mutex cpu_hash_table_buff_mutex_
 
std::map< LinearizedColumnCacheKey, LinearizedColumn > linearized_multifrag_columns_
 
std::mutex linearized_multifrag_column_mutex_
 
RowSetMemoryOwner linearized_multifrag_column_owner_
 
std::vector< InnerOuter > inner_outer_pairs_
 
const Catalog_Namespace::Catalog * catalog_
 

Static Protected Attributes

static std::vector< std::pair< HashTableCacheKey, HashTableCacheValue > > hash_table_cache_
 
static std::mutex hash_table_cache_mutex_
 
static const int ERR_FAILED_TO_FETCH_COLUMN {-3}
 
static const int ERR_FAILED_TO_JOIN_ON_VIRTUAL_COLUMN {-4}
 

Private Member Functions

size_t getComponentBufferSize () const noexcept
 

Additional Inherited Members

- Public Types inherited from JoinHashTableInterface
enum  HashType { HashType::OneToOne, HashType::OneToMany }
 

Detailed Description

Definition at line 43 of file BaselineJoinHashTable.h.

Member Typedef Documentation

◆ LinearizedColumn

typedef std::pair<const int8_t*, size_t> BaselineJoinHashTable::LinearizedColumn
protected

Definition at line 240 of file BaselineJoinHashTable.h.

◆ LinearizedColumnCacheKey

typedef std::pair<int, int> BaselineJoinHashTable::LinearizedColumnCacheKey
protected

Definition at line 241 of file BaselineJoinHashTable.h.

Constructor & Destructor Documentation

◆ ~BaselineJoinHashTable()

virtual BaselineJoinHashTable::~BaselineJoinHashTable ( )
inlinevirtual

Definition at line 86 of file BaselineJoinHashTable.h.

References BaselineJoinHashTable(), getComponentBufferSize(), getInnerTableId(), and reifyWithLayout().

86 {}

◆ BaselineJoinHashTable()

BaselineJoinHashTable::BaselineJoinHashTable ( const std::shared_ptr< Analyzer::BinOper >  condition,
const std::vector< InputTableInfo > &  query_infos,
const Data_Namespace::MemoryLevel  memory_level,
const HashType  preferred_hash_type,
const size_t  entry_count,
ColumnCacheMap &  column_map,
Executor *  executor,
const std::vector< InnerOuter > &  inner_outer_pairs 
)
protected

Definition at line 92 of file BaselineJoinHashTable.cpp.

Referenced by getInstance(), and ~BaselineJoinHashTable().

101  : condition_(condition)
102  , query_infos_(query_infos)
103  , memory_level_(memory_level)
104  , layout_(preferred_hash_type)
105  , entry_count_(entry_count)
106  , emitted_keys_count_(0)
107  , executor_(executor)
108  , column_cache_(column_cache)
109  , inner_outer_pairs_(inner_outer_pairs)
110  , catalog_(executor->getCatalog())
111 #ifdef HAVE_CUDA
112  , block_size_(executor->blockSize())
113  , grid_size_(executor->gridSize()) {
114 }
115 #else
116 {
117 }
118 #endif
JoinHashTableInterface::HashType layout_
const std::vector< InputTableInfo > & query_infos_
std::vector< InnerOuter > inner_outer_pairs_
ColumnCacheMap & column_cache_
const Catalog_Namespace::Catalog * catalog_
const Data_Namespace::MemoryLevel memory_level_
const std::shared_ptr< Analyzer::BinOper > condition_

Member Function Documentation

◆ approximateTupleCount()

std::pair< size_t, size_t > BaselineJoinHashTable::approximateTupleCount ( const std::vector< ColumnsForDevice > &  columns_per_device) const
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 263 of file BaselineJoinHashTable.cpp.

References ThrustAllocator::allocateScopedBuffer(), approximate_distinct_tuples(), approximate_distinct_tuples_on_device(), Bitmap, catalog_, CHECK, CHECK_EQ, condition_, copy_from_gpu(), CPU, Data_Namespace::CPU_LEVEL, cpu_threads(), getApproximateTupleCountFromCache(), getCompositeKeyInfo(), Catalog_Namespace::Catalog::getDataMgr(), getEffectiveMemoryLevel(), GPU, Data_Namespace::GPU_LEVEL, hll_size(), hll_unify(), inner_outer_pairs_, transfer_object_to_gpu(), transfer_pod_vector_to_gpu(), and UNREACHABLE.

Referenced by reifyWithLayout().

264  {
265  const auto effective_memory_level = getEffectiveMemoryLevel(inner_outer_pairs_);
266  CountDistinctDescriptor count_distinct_desc{
267      CountDistinctImplType::Bitmap,
268      0,
269      11,
270      true,
271      effective_memory_level == Data_Namespace::MemoryLevel::GPU_LEVEL
272          ? ExecutorDeviceType::GPU
273          : ExecutorDeviceType::CPU,
274      1};
275  const auto padded_size_bytes = count_distinct_desc.bitmapPaddedSizeBytes();
276 
277  CHECK(!columns_per_device.empty() && !columns_per_device.front().join_columns.empty());
278 
279  if (effective_memory_level == Data_Namespace::MemoryLevel::CPU_LEVEL) {
280  const auto composite_key_info = getCompositeKeyInfo();
281  HashTableCacheKey cache_key{columns_per_device.front().join_columns.front().num_elems,
282  composite_key_info.cache_key_chunks,
283  condition_->get_optype()};
284  const auto cached_count_info = getApproximateTupleCountFromCache(cache_key);
285  if (cached_count_info.first >= 0) {
286  return std::make_pair(cached_count_info.first, cached_count_info.second);
287  }
288  int thread_count = cpu_threads();
289  std::vector<uint8_t> hll_buffer_all_cpus(thread_count * padded_size_bytes);
290  auto hll_result = &hll_buffer_all_cpus[0];
291 
292  approximate_distinct_tuples(hll_result,
293  count_distinct_desc.bitmap_sz_bits,
294  padded_size_bytes,
295  columns_per_device.front().join_columns,
296  columns_per_device.front().join_column_types,
297  thread_count);
298  for (int i = 1; i < thread_count; ++i) {
299  hll_unify(hll_result,
300  hll_result + i * padded_size_bytes,
301  1 << count_distinct_desc.bitmap_sz_bits);
302  }
303  return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
304  }
305 #ifdef HAVE_CUDA
306  const int device_count = columns_per_device.size();
307  auto& data_mgr = catalog_->getDataMgr();
308  std::vector<std::vector<uint8_t>> host_hll_buffers(device_count);
309  for (auto& host_hll_buffer : host_hll_buffers) {
310  host_hll_buffer.resize(count_distinct_desc.bitmapPaddedSizeBytes());
311  }
312  std::vector<std::future<void>> approximate_distinct_device_threads;
313  for (int device_id = 0; device_id < device_count; ++device_id) {
314  approximate_distinct_device_threads.emplace_back(std::async(
315  std::launch::async,
316  [device_id,
317  &columns_per_device,
318  &count_distinct_desc,
319  &data_mgr,
320  &host_hll_buffers,
321  this] {
322  ThrustAllocator allocator(&data_mgr, device_id);
323  auto device_hll_buffer =
324  allocator.allocateScopedBuffer(count_distinct_desc.bitmapPaddedSizeBytes());
325  data_mgr.getCudaMgr()->zeroDeviceMem(
326  device_hll_buffer, count_distinct_desc.bitmapPaddedSizeBytes(), device_id);
327  const auto& columns_for_device = columns_per_device[device_id];
328  auto join_columns_gpu =
329  transfer_pod_vector_to_gpu(columns_for_device.join_columns, allocator);
330  auto join_column_types_gpu =
331  transfer_pod_vector_to_gpu(columns_for_device.join_column_types, allocator);
332  const auto key_handler =
333  GenericKeyHandler(columns_for_device.join_columns.size(),
334  true,
335  join_columns_gpu,
336  join_column_types_gpu,
337  nullptr,
338  nullptr);
339  const auto key_handler_gpu = transfer_object_to_gpu(key_handler, allocator);
340  approximate_distinct_tuples_on_device(
341      reinterpret_cast<uint8_t*>(device_hll_buffer),
342  count_distinct_desc.bitmap_sz_bits,
343  key_handler_gpu,
344  columns_for_device.join_columns[0].num_elems,
345  block_size_,
346  grid_size_);
347 
348  auto& host_hll_buffer = host_hll_buffers[device_id];
349  copy_from_gpu(&data_mgr,
350  &host_hll_buffer[0],
351  reinterpret_cast<CUdeviceptr>(device_hll_buffer),
352  count_distinct_desc.bitmapPaddedSizeBytes(),
353  device_id);
354  }));
355  }
356  for (auto& child : approximate_distinct_device_threads) {
357  child.get();
358  }
359  CHECK_EQ(Data_Namespace::MemoryLevel::GPU_LEVEL, effective_memory_level);
360  auto& result_hll_buffer = host_hll_buffers.front();
361  auto hll_result = reinterpret_cast<int32_t*>(&result_hll_buffer[0]);
362  for (int device_id = 1; device_id < device_count; ++device_id) {
363  auto& host_hll_buffer = host_hll_buffers[device_id];
364  hll_unify(hll_result,
365  reinterpret_cast<int32_t*>(&host_hll_buffer[0]),
366  1 << count_distinct_desc.bitmap_sz_bits);
367  }
368  return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
369 #else
370  UNREACHABLE();
371  return {0, 0};
372 #endif // HAVE_CUDA
373 }
#define CHECK_EQ(x, y)
Definition: Logger.h:195
std::pair< ssize_t, size_t > getApproximateTupleCountFromCache(const HashTableCacheKey &) const
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
void hll_unify(T1 *lhs, T2 *rhs, const size_t m)
Definition: HyperLogLog.h:109
#define UNREACHABLE()
Definition: Logger.h:231
CompositeKeyInfo getCompositeKeyInfo() const
T * transfer_pod_vector_to_gpu(const std::vector< T > &vec, ThrustAllocator &allocator)
size_t hll_size(const T *M, const size_t bitmap_sz_bits)
Definition: HyperLogLog.h:90
std::vector< InnerOuter > inner_outer_pairs_
void approximate_distinct_tuples(uint8_t *hll_buffer_all_cpus, const uint32_t b, const size_t padded_size_bytes, const std::vector< JoinColumn > &join_column_per_key, const std::vector< JoinColumnTypeInfo > &type_info_per_key, const int thread_count)
T * transfer_object_to_gpu(const T &object, ThrustAllocator &allocator)
void copy_from_gpu(Data_Namespace::DataMgr *data_mgr, void *dst, const CUdeviceptr src, const size_t num_bytes, const int device_id)
void approximate_distinct_tuples_on_device(uint8_t *hll_buffer, const uint32_t b, const GenericKeyHandler *key_handler, const size_t num_elems, const size_t block_size_x, const size_t grid_size_x)
const Catalog_Namespace::Catalog * catalog_
#define CHECK(condition)
Definition: Logger.h:187
Data_Namespace::MemoryLevel getEffectiveMemoryLevel(const std::vector< InnerOuter > &inner_outer_pairs) const
int cpu_threads()
Definition: thread_count.h:23
const std::shared_ptr< Analyzer::BinOper > condition_
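Each CPU worker (or CUDA device) fills a private HyperLogLog register buffer; hll_unify() then folds the buffers together by taking the register-wise maximum, and hll_size() turns the unified registers into a cardinality estimate. A minimal, self-contained sketch of that fold, assuming m = 2^b registers and using a simplified estimator without the small/large-range corrections the real hll_size() applies:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Register-wise maximum: the operation hll_unify() applies across the
// per-thread (CPU path) or per-device (GPU path) register buffers.
void unify(std::vector<uint8_t>& lhs, const std::vector<uint8_t>& rhs) {
  for (size_t i = 0; i < lhs.size(); ++i) {
    lhs[i] = std::max(lhs[i], rhs[i]);
  }
}

// Plain HyperLogLog estimate over the unified registers.
double estimate(const std::vector<uint8_t>& registers) {
  const double m = static_cast<double>(registers.size());
  double sum = 0.0;
  for (const auto r : registers) {
    sum += std::pow(2.0, -static_cast<double>(r));
  }
  const double alpha = 0.7213 / (1.0 + 1.079 / m);
  return alpha * m * m / sum;
}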

◆ checkHashJoinReplicationConstraint()

void BaselineJoinHashTable::checkHashJoinReplicationConstraint ( const int  table_id) const
protected

Definition at line 1103 of file BaselineJoinHashTable.cpp.

References catalog_, CHECK, g_cluster, Catalog_Namespace::Catalog::getMetadataForTable(), shardCount(), and table_is_replicated().

1103  {
1104  if (!g_cluster) {
1105  return;
1106  }
1107  if (table_id >= 0) {
1108  const auto inner_td = catalog_->getMetadataForTable(table_id);
1109  CHECK(inner_td);
1110  const auto shard_count = shardCount();
1111  if (!shard_count && !table_is_replicated(inner_td)) {
1112  throw TableMustBeReplicated(inner_td->tableName);
1113  }
1114  }
1115 }
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
bool table_is_replicated(const TableDescriptor *td)
const Catalog_Namespace::Catalog * catalog_
#define CHECK(condition)
Definition: Logger.h:187
bool g_cluster

◆ codegenKey()

llvm::Value * BaselineJoinHashTable::codegenKey ( const CompilationOptions &  co)
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 1032 of file BaselineJoinHashTable.cpp.

References CHECK, CHECK_EQ, CodeGenerator::codegen(), executor_, get_int_type(), getKeyComponentCount(), getKeyComponentWidth(), inner_outer_pairs_, LL_BUILDER, LL_CONTEXT, and LL_INT.

Referenced by codegenMatchingSet(), and codegenSlot().

1032  {
1033  const auto key_component_width = getKeyComponentWidth();
1034  CHECK(key_component_width == 4 || key_component_width == 8);
1035  const auto key_size_lv = LL_INT(getKeyComponentCount() * key_component_width);
1036  llvm::Value* key_buff_lv{nullptr};
1037  switch (key_component_width) {
1038  case 4:
1039  key_buff_lv =
1040  LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv);
1041  break;
1042  case 8:
1043  key_buff_lv =
1044  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1045  break;
1046  default:
1047  CHECK(false);
1048  }
1049 
1050  CodeGenerator code_generator(executor_);
1051  for (size_t i = 0; i < getKeyComponentCount(); ++i) {
1052  const auto key_comp_dest_lv = LL_BUILDER.CreateGEP(key_buff_lv, LL_INT(i));
1053  const auto& inner_outer_pair = inner_outer_pairs_[i];
1054  const auto outer_col = inner_outer_pair.second;
1055  const auto col_lvs = code_generator.codegen(outer_col, true, co);
1056  CHECK_EQ(size_t(1), col_lvs.size());
1057  const auto col_lv = LL_BUILDER.CreateSExt(
1058  col_lvs.front(), get_int_type(key_component_width * 8, LL_CONTEXT));
1059  LL_BUILDER.CreateStore(col_lv, key_comp_dest_lv);
1060  }
1061  return key_buff_lv;
1062 }
#define CHECK_EQ(x, y)
Definition: Logger.h:195
virtual size_t getKeyComponentCount() const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::vector< InnerOuter > inner_outer_pairs_
#define LL_INT(v)
#define LL_CONTEXT
#define LL_BUILDER
virtual size_t getKeyComponentWidth() const
#define CHECK(condition)
Definition: Logger.h:187
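The IR built here is equivalent to stack-allocating getKeyComponentCount() slots of getKeyComponentWidth() bytes each and storing the sign-extended outer-column values into them. A rough C++ rendering of what the generated code does at runtime for the 8-byte case (illustrative names only; the actual work is emitted as LLVM instructions):

#include <cstddef>
#include <cstdint>

// key_buff plays the role of the CreateAlloca result (key_buff_lv); each
// loop iteration mirrors one CreateSExt + CreateStore pair emitted per
// inner/outer column pair.
int64_t* build_key_buffer(int64_t* key_buff,
                          const int32_t* outer_values,  // one value per component
                          const size_t key_component_count) {
  for (size_t i = 0; i < key_component_count; ++i) {
    key_buff[i] = static_cast<int64_t>(outer_values[i]);  // sign extension
  }
  return key_buff;  // handed to the probing code as the composite key
}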

◆ codegenMatchingSet()

HashJoinMatchingSet BaselineJoinHashTable::codegenMatchingSet ( const CompilationOptions &  co,
const size_t  index 
)
overridevirtual

Implements JoinHashTableInterface.

Definition at line 969 of file BaselineJoinHashTable.cpp.

References CHECK, JoinHashTable::codegenHashTableLoad(), codegenKey(), JoinHashTable::codegenMatchingSet(), entry_count_, executor_, get_int_type(), getComponentBufferSize(), getKeyComponentCount(), getKeyComponentWidth(), layout_, LL_BUILDER, LL_CONTEXT, LL_INT, offsetBufferOff(), JoinHashTableInterface::OneToMany, and to_string().

971  {
972  const auto key_component_width = getKeyComponentWidth();
973  CHECK(key_component_width == 4 || key_component_width == 8);
974  auto key_buff_lv = codegenKey(co);
975  CHECK(layout_ == JoinHashTableInterface::HashType::OneToMany);
976  auto hash_ptr = JoinHashTable::codegenHashTableLoad(index, executor_);
977  const auto composite_dict_ptr_type =
978  llvm::Type::getIntNPtrTy(LL_CONTEXT, key_component_width * 8);
979  const auto composite_key_dict =
980  hash_ptr->getType()->isPointerTy()
981  ? LL_BUILDER.CreatePointerCast(hash_ptr, composite_dict_ptr_type)
982  : LL_BUILDER.CreateIntToPtr(hash_ptr, composite_dict_ptr_type);
983  const auto key_component_count = getKeyComponentCount();
984  const auto key = executor_->cgen_state_->emitExternalCall(
985  "get_composite_key_index_" + std::to_string(key_component_width * 8),
986      get_int_type(64, LL_CONTEXT),
987      {key_buff_lv,
988  LL_INT(key_component_count),
989  composite_key_dict,
990  LL_INT(entry_count_)});
991  auto one_to_many_ptr = hash_ptr;
992  if (one_to_many_ptr->getType()->isPointerTy()) {
993  one_to_many_ptr =
994  LL_BUILDER.CreatePtrToInt(hash_ptr, llvm::Type::getInt64Ty(LL_CONTEXT));
995  } else {
996  CHECK(one_to_many_ptr->getType()->isIntegerTy(64));
997  }
998  const auto composite_key_dict_size = offsetBufferOff();
999  one_to_many_ptr =
1000  LL_BUILDER.CreateAdd(one_to_many_ptr, LL_INT(composite_key_dict_size));
1001  return JoinHashTable::codegenMatchingSet(
1002      {one_to_many_ptr, key, LL_INT(int64_t(0)), LL_INT(entry_count_ - 1)},
1003      false,
1004      false,
1005      false,
1006      getComponentBufferSize(),
1007      executor_);
1008 }
virtual size_t getKeyComponentCount() const
size_t getComponentBufferSize() const noexcept
JoinHashTableInterface::HashType layout_
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
virtual llvm::Value * codegenKey(const CompilationOptions &)
#define LL_INT(v)
#define LL_CONTEXT
#define LL_BUILDER
static llvm::Value * codegenHashTableLoad(const size_t table_idx, Executor *executor)
virtual size_t getKeyComponentWidth() const
size_t offsetBufferOff() const noexcept override
HashJoinMatchingSet codegenMatchingSet(const CompilationOptions &, const size_t) override
#define CHECK(condition)
Definition: Logger.h:187
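Read together with offsetBufferOff(), countBufferOff(), payloadBufferOff() and getComponentBufferSize(), the code above implies the one-to-many buffer layout: the composite key dictionary first, then entry_count_ int32 offsets, then entry_count_ int32 counts, then the row-id payload. A hedged sketch of the lookup the emitted code performs at runtime; probe_dict() is a linear-scan stand-in for the hashed get_composite_key_index_* runtime function, and a successful probe is assumed:

#include <cstddef>
#include <cstdint>

// Stand-in for get_composite_key_index_64 (the real call probes by hash).
int64_t probe_dict(const int64_t* dict,
                   const int64_t* key,
                   const size_t key_component_count,
                   const size_t entry_count) {
  for (size_t e = 0; e < entry_count; ++e) {
    bool match = true;
    for (size_t c = 0; c < key_component_count; ++c) {
      if (dict[e * key_component_count + c] != key[c]) {
        match = false;
        break;
      }
    }
    if (match) {
      return static_cast<int64_t>(e);
    }
  }
  return -1;
}

struct Matches {
  const int32_t* row_ids;  // payload slice for this key
  int32_t count;
};

Matches lookup(const int8_t* hash_table,  // as returned by getJoinHashBuffer()
               const int64_t* key,
               const size_t key_component_count,
               const size_t entry_count,
               const size_t dict_size) {  // offsetBufferOff()
  const auto* dict = reinterpret_cast<const int64_t*>(hash_table);
  const auto idx = probe_dict(dict, key, key_component_count, entry_count);
  const auto* offsets = reinterpret_cast<const int32_t*>(hash_table + dict_size);
  const auto* counts = offsets + entry_count;  // i.e. countBufferOff()
  const auto* payload = counts + entry_count;  // i.e. payloadBufferOff()
  return {payload + offsets[idx], counts[idx]};
}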

◆ codegenSlot()

llvm::Value * BaselineJoinHashTable::codegenSlot ( const CompilationOptions &  co,
const size_t  index 
)
overridevirtual

Implements JoinHashTableInterface.

Definition at line 953 of file BaselineJoinHashTable.cpp.

References CHECK, codegenKey(), entry_count_, executor_, get_int_type(), getHashType(), getKeyComponentCount(), getKeyComponentWidth(), hashPtr(), LL_BUILDER, LL_CONTEXT, LL_INT, JoinHashTableInterface::OneToOne, and to_string().

954  {
955  CHECK(getHashType() == JoinHashTableInterface::HashType::OneToOne);
956  const auto key_component_width = getKeyComponentWidth();
957  CHECK(key_component_width == 4 || key_component_width == 8);
958  auto key_buff_lv = codegenKey(co);
959  const auto hash_ptr = hashPtr(index);
960  const auto key_ptr_lv =
961  LL_BUILDER.CreatePointerCast(key_buff_lv, llvm::Type::getInt8PtrTy(LL_CONTEXT));
962  const auto key_size_lv = LL_INT(getKeyComponentCount() * key_component_width);
963  return executor_->cgen_state_->emitExternalCall(
964  "baseline_hash_join_idx_" + std::to_string(key_component_width * 8),
965      get_int_type(64, LL_CONTEXT),
966      {hash_ptr, key_ptr_lv, key_size_lv, LL_INT(entry_count_)});
967 }
virtual size_t getKeyComponentCount() const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
virtual llvm::Value * codegenKey(const CompilationOptions &)
#define LL_INT(v)
#define LL_CONTEXT
#define LL_BUILDER
virtual size_t getKeyComponentWidth() const
llvm::Value * hashPtr(const size_t index)
#define CHECK(condition)
Definition: Logger.h:187
JoinHashTableInterface::HashType getHashType() const noexcept override
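For the one-to-one layout, initHashTableOnCpu() below sizes each entry as (key_component_count + 1) * key_component_width bytes: the key components followed by a single value slot. What the emitted baseline_hash_join_idx_* call resolves is therefore roughly the following, sketched with a linear scan in place of the real hashed probe:

#include <cstddef>
#include <cstdint>
#include <cstring>

// One-to-one entry: key_component_count key slots plus one value slot,
// each key_component_width (here 8) bytes; empty slots hold -1.
int64_t baseline_join_idx(const int8_t* hash_buff,
                          const int8_t* key,       // built by codegenKey()
                          const size_t key_bytes,  // component count * width
                          const size_t entry_count) {
  const size_t entry_size = key_bytes + sizeof(int64_t);
  for (size_t e = 0; e < entry_count; ++e) {
    const int8_t* entry = hash_buff + e * entry_size;
    if (std::memcmp(entry, key, key_bytes) == 0) {
      int64_t row_id;
      std::memcpy(&row_id, entry + key_bytes, sizeof(row_id));
      return row_id;
    }
  }
  return -1;  // no match
}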

◆ countBufferOff()

size_t BaselineJoinHashTable::countBufferOff ( ) const
overridevirtualnoexcept

Implements JoinHashTableInterface.

Reimplemented in OverlapsJoinHashTable.

Definition at line 1018 of file BaselineJoinHashTable.cpp.

References CHECK, getComponentBufferSize(), layout_, offsetBufferOff(), and JoinHashTableInterface::OneToMany.

Referenced by payloadBufferOff().

1018  {
1019  CHECK(layout_ == JoinHashTableInterface::HashType::OneToMany);
1020  return offsetBufferOff() + getComponentBufferSize();
1021 }
size_t getComponentBufferSize() const noexcept
JoinHashTableInterface::HashType layout_
size_t offsetBufferOff() const noexcept override
#define CHECK(condition)
Definition: Logger.h:187

◆ fetchColumn()

JoinColumn BaselineJoinHashTable::fetchColumn ( const Analyzer::ColumnVar *  inner_col,
const Data_Namespace::MemoryLevel &  effective_memory_level,
const std::deque< Fragmenter_Namespace::FragmentInfo > &  fragments,
std::vector< std::shared_ptr< Chunk_NS::Chunk >> &  chunks_owner,
const int  device_id 
)
protected

Definition at line 375 of file BaselineJoinHashTable.cpp.

References ThrustAllocator::allocate(), catalog_, CHECK, CHECK_NE, column_cache_, copy_to_gpu(), executor_, SQLTypeInfoCore< TYPE_FACET_PACK >::get_size(), Analyzer::Expr::get_type_info(), getAllColumnFragments(), Catalog_Namespace::Catalog::getDataMgr(), ColumnFetcher::getOneColumnFragment(), and Data_Namespace::GPU_LEVEL.

Referenced by OverlapsJoinHashTable::fetchColumnsForDevice(), and fetchColumnsForDevice().

380  {
381  static std::mutex fragment_fetch_mutex;
382  const bool has_multi_frag = fragments.size() > 1;
383  const auto& first_frag = fragments.front();
384 
385  const int8_t* col_buff = nullptr;
386  size_t elem_count = 0;
387  const size_t elem_width = inner_col->get_type_info().get_size();
388  auto& data_mgr = catalog_->getDataMgr();
389  ThrustAllocator dev_buff_owner(&data_mgr, device_id);
390  if (has_multi_frag) {
391  try {
392  std::tie(col_buff, elem_count) =
393  getAllColumnFragments(*inner_col, fragments, chunks_owner);
394  } catch (...) {
395  throw FailedToFetchColumn();
396  }
397  }
398  {
399  std::lock_guard<std::mutex> fragment_fetch_lock(fragment_fetch_mutex);
400  if (has_multi_frag) {
401  if (effective_memory_level == Data_Namespace::GPU_LEVEL) {
402  CHECK(col_buff != nullptr);
403  CHECK_NE(elem_count, size_t(0));
404  int8_t* dev_col_buff = nullptr;
405  dev_col_buff = dev_buff_owner.allocate(elem_count * elem_width);
406  copy_to_gpu(&data_mgr,
407  reinterpret_cast<CUdeviceptr>(dev_col_buff),
408  col_buff,
409  elem_count * elem_width,
410  device_id);
411  col_buff = dev_col_buff;
412  }
413  } else {
414  try {
415  std::tie(col_buff, elem_count) =
416      ColumnFetcher::getOneColumnFragment(executor_,
417          *inner_col,
418  first_frag,
419  effective_memory_level,
420  device_id,
421  chunks_owner,
422  column_cache_);
423  } catch (...) {
424  throw FailedToFetchColumn();
425  }
426  }
427  }
428  return {col_buff, elem_count};
429 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:329
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
std::pair< const int8_t *, size_t > getAllColumnFragments(const Analyzer::ColumnVar &hash_col, const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner)
void copy_to_gpu(Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
Definition: GpuMemUtils.cpp:31
#define CHECK_NE(x, y)
Definition: Logger.h:196
ColumnCacheMap & column_cache_
static std::pair< const int8_t *, size_t > getOneColumnFragment(Executor *executor, const Analyzer::ColumnVar &hash_col, const Fragmenter_Namespace::FragmentInfo &fragment, const Data_Namespace::MemoryLevel effective_mem_lvl, const int device_id, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, ColumnCacheMap &column_cache)
const Catalog_Namespace::Catalog * catalog_
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:77
#define CHECK(condition)
Definition: Logger.h:187

◆ fetchColumnsForDevice()

BaselineJoinHashTable::ColumnsForDevice BaselineJoinHashTable::fetchColumnsForDevice ( const std::deque< Fragmenter_Namespace::FragmentInfo > &  fragments,
const int  device_id 
)
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 431 of file BaselineJoinHashTable.cpp.

References catalog_, fetchColumn(), get_column_descriptor_maybe(), get_join_column_type_kind(), getEffectiveMemoryLevel(), inline_fixed_encoding_null_val(), inner_outer_pairs_, and isBitwiseEq().

Referenced by reifyWithLayout().

433  {
434  const auto effective_memory_level = getEffectiveMemoryLevel(inner_outer_pairs_);
435 
436  std::vector<JoinColumn> join_columns;
437  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
438  std::vector<JoinColumnTypeInfo> join_column_types;
439  std::vector<JoinBucketInfo> join_bucket_info;
440  for (const auto& inner_outer_pair : inner_outer_pairs_) {
441  const auto inner_col = inner_outer_pair.first;
442  const auto inner_cd = get_column_descriptor_maybe(
443  inner_col->get_column_id(), inner_col->get_table_id(), *catalog_);
444  if (inner_cd && inner_cd->isVirtualCol) {
445  throw FailedToJoinOnVirtualColumn();
446  }
447  join_columns.emplace_back(fetchColumn(
448  inner_col, effective_memory_level, fragments, chunks_owner, device_id));
449  const auto& ti = inner_col->get_type_info();
450  join_column_types.emplace_back(JoinColumnTypeInfo{static_cast<size_t>(ti.get_size()),
451  0,
452  0,
453      inline_fixed_encoding_null_val(ti),
454      isBitwiseEq(),
455      0,
456      get_join_column_type_kind(ti)});
457  }
458  return {join_columns, join_column_types, chunks_owner, join_bucket_info};
459 }
std::vector< InnerOuter > inner_outer_pairs_
const ColumnDescriptor * get_column_descriptor_maybe(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:168
JoinColumn fetchColumn(const Analyzer::ColumnVar *inner_col, const Data_Namespace::MemoryLevel &effective_memory_level, const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, const int device_id)
const Catalog_Namespace::Catalog * catalog_
ColumnType get_join_column_type_kind(const SQLTypeInfo &ti)
Data_Namespace::MemoryLevel getEffectiveMemoryLevel(const std::vector< InnerOuter > &inner_outer_pairs) const
int64_t inline_fixed_encoding_null_val(const SQL_TYPE_INFO &ti)

◆ findHashTableOnCpuInCache()

const BaselineJoinHashTable::HashTableCacheValue * BaselineJoinHashTable::findHashTableOnCpuInCache ( const HashTableCacheKey &  key)
protected

Definition at line 1118 of file BaselineJoinHashTable.cpp.

References hash_table_cache_, and hash_table_cache_mutex_.

1118  {
1119  std::lock_guard<std::mutex> hash_table_cache_lock(hash_table_cache_mutex_);
1120  for (const auto& kv : hash_table_cache_) {
1121  if (kv.first == key) {
1122  return &kv.second;
1123  }
1124  }
1125  return nullptr;
1126 }
static std::mutex hash_table_cache_mutex_
static std::vector< std::pair< HashTableCacheKey, HashTableCacheValue > > hash_table_cache_

◆ freeHashBufferCpuMemory()

void BaselineJoinHashTable::freeHashBufferCpuMemory ( )
protected

Definition at line 1190 of file BaselineJoinHashTable.cpp.

References cpu_hash_table_buff_, HashTypeCache::hash_type_cache_, and HashTypeCache::hash_type_cache_mutex_.

Referenced by freeHashBufferMemory(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1190  {
1191  cpu_hash_table_buff_.reset();
1192 }
std::shared_ptr< std::vector< int8_t > > cpu_hash_table_buff_

◆ freeHashBufferGpuMemory()

void BaselineJoinHashTable::freeHashBufferGpuMemory ( )
protected

Definition at line 1176 of file BaselineJoinHashTable.cpp.

References catalog_, CHECK, CudaAllocator::freeGpuAbstractBuffer(), and Catalog_Namespace::Catalog::getDataMgr().

Referenced by freeHashBufferMemory(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1176  {
1177 #ifdef HAVE_CUDA
1178  auto& data_mgr = catalog_->getDataMgr();
1179  for (auto& buf : gpu_hash_table_buff_) {
1180  if (buf) {
1181  CudaAllocator::freeGpuAbstractBuffer(&data_mgr, buf);
1182  buf = nullptr;
1183  }
1184  }
1185 #else
1186  CHECK(false);
1187 #endif // HAVE_CUDA
1188 }
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
static void freeGpuAbstractBuffer(Data_Namespace::DataMgr *data_mgr, Data_Namespace::AbstractBuffer *ab)
const Catalog_Namespace::Catalog * catalog_
#define CHECK(condition)
Definition: Logger.h:187

◆ freeHashBufferMemory()

void BaselineJoinHashTable::freeHashBufferMemory ( )
protected

Definition at line 1169 of file BaselineJoinHashTable.cpp.

References freeHashBufferCpuMemory(), and freeHashBufferGpuMemory().

Referenced by BaselineJoinHashTable::HashTableCacheKey::operator<(), and reify().

1169  {
1170 #ifdef HAVE_CUDA
1171   freeHashBufferGpuMemory();
1172 #endif
1173   freeHashBufferCpuMemory();
1174 }

◆ getAllColumnFragments()

std::pair< const int8_t *, size_t > BaselineJoinHashTable::getAllColumnFragments ( const Analyzer::ColumnVar &  hash_col,
const std::deque< Fragmenter_Namespace::FragmentInfo > &  fragments,
std::vector< std::shared_ptr< Chunk_NS::Chunk >> &  chunks_owner 
)
protected

Definition at line 485 of file BaselineJoinHashTable.cpp.

References RowSetMemoryOwner::addColBuffer(), CHECK, column_cache_, executor_, Analyzer::ColumnVar::get_column_id(), Analyzer::ColumnVar::get_table_id(), ColumnFetcher::getAllColumnFragments(), linearized_multifrag_column_mutex_, linearized_multifrag_column_owner_, linearized_multifrag_columns_, and shardCount().

Referenced by fetchColumn().

488  {
489  std::lock_guard<std::mutex> linearized_multifrag_column_lock(
490      linearized_multifrag_column_mutex_);
491  auto linearized_column_cache_key =
492  std::make_pair(hash_col.get_table_id(), hash_col.get_column_id());
493  const auto cache_it = linearized_multifrag_columns_.find(linearized_column_cache_key);
494  if (cache_it != linearized_multifrag_columns_.end()) {
495  return cache_it->second;
496  }
497  const int8_t* col_buff;
498  size_t total_elem_count;
499  std::tie(col_buff, total_elem_count) = ColumnFetcher::getAllColumnFragments(
500  executor_, hash_col, fragments, chunks_owner, column_cache_);
501  linearized_multifrag_column_owner_.addColBuffer(col_buff);
502  const auto shard_count = shardCount();
503  if (!shard_count) {
504  const auto it_ok = linearized_multifrag_columns_.emplace(
505  linearized_column_cache_key, LinearizedColumn{col_buff, total_elem_count});
506  CHECK(it_ok.second);
507  }
508  return {col_buff, total_elem_count};
509 }
std::pair< const int8_t *, size_t > LinearizedColumn
int get_column_id() const
Definition: Analyzer.h:194
RowSetMemoryOwner linearized_multifrag_column_owner_
ColumnCacheMap & column_cache_
int get_table_id() const
Definition: Analyzer.h:193
std::map< LinearizedColumnCacheKey, LinearizedColumn > linearized_multifrag_columns_
std::mutex linearized_multifrag_column_mutex_
#define CHECK(condition)
Definition: Logger.h:187
static std::pair< const int8_t *, size_t > getAllColumnFragments(Executor *executor, const Analyzer::ColumnVar &hash_col, const std::deque< Fragmenter_Namespace::FragmentInfo > &fragments, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, ColumnCacheMap &column_cache)
void addColBuffer(const void *col_buffer)

◆ getApproximateTupleCountFromCache()

std::pair< ssize_t, size_t > BaselineJoinHashTable::getApproximateTupleCountFromCache ( const HashTableCacheKey &  key) const
protected

Definition at line 1154 of file BaselineJoinHashTable.cpp.

References hash_table_cache_, and hash_table_cache_mutex_.

Referenced by OverlapsJoinHashTable::approximateTupleCount(), approximateTupleCount(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1155  {
1156  std::lock_guard<std::mutex> hash_table_cache_lock(hash_table_cache_mutex_);
1157  for (const auto& kv : hash_table_cache_) {
1158  if (kv.first == key) {
1159  return std::make_pair(kv.second.entry_count / 2, kv.second.emitted_keys_count);
1160  }
1161  }
1162  return std::make_pair(-1, 0);
1163 }
static std::mutex hash_table_cache_mutex_
static std::vector< std::pair< HashTableCacheKey, HashTableCacheValue > > hash_table_cache_

◆ getComponentBufferSize()

size_t BaselineJoinHashTable::getComponentBufferSize ( ) const
privatenoexcept

Definition at line 1028 of file BaselineJoinHashTable.cpp.

References entry_count_.

Referenced by codegenMatchingSet(), countBufferOff(), payloadBufferOff(), and ~BaselineJoinHashTable().

1028  {
1029  return entry_count_ * sizeof(int32_t);
1030 }

◆ getCompositeKeyInfo()

BaselineJoinHashTable::CompositeKeyInfo BaselineJoinHashTable::getCompositeKeyInfo ( ) const
protected

Definition at line 149 of file BaselineJoinHashTable.cpp.

References catalog_, CHECK, Catalog_Namespace::DBMetadata::dbId, executor_, Catalog_Namespace::Catalog::getCurrentDB(), inner_outer_pairs_, and kENCODING_DICT.

Referenced by OverlapsJoinHashTable::approximateTupleCount(), approximateTupleCount(), OverlapsJoinHashTable::initHashTableOnCpu(), initHashTableOnCpu(), reify(), and OverlapsJoinHashTable::reifyWithLayout().

150  {
151  std::vector<const void*> sd_inner_proxy_per_key;
152  std::vector<const void*> sd_outer_proxy_per_key;
153  std::vector<ChunkKey> cache_key_chunks; // used for the cache key
154  for (const auto& inner_outer_pair : inner_outer_pairs_) {
155  const auto inner_col = inner_outer_pair.first;
156  const auto outer_col = inner_outer_pair.second;
157  const auto& inner_ti = inner_col->get_type_info();
158  const auto& outer_ti = outer_col->get_type_info();
159  ChunkKey cache_key_chunks_for_column{catalog_->getCurrentDB().dbId,
160  inner_col->get_table_id(),
161  inner_col->get_column_id()};
162  if (inner_ti.is_string()) {
163  CHECK(outer_ti.is_string());
164  CHECK(inner_ti.get_compression() == kENCODING_DICT &&
165  outer_ti.get_compression() == kENCODING_DICT);
166  const auto sd_inner_proxy = executor_->getStringDictionaryProxy(
167  inner_ti.get_comp_param(), executor_->getRowSetMemoryOwner(), true);
168  const auto sd_outer_proxy = executor_->getStringDictionaryProxy(
169  outer_ti.get_comp_param(), executor_->getRowSetMemoryOwner(), true);
170  CHECK(sd_inner_proxy && sd_outer_proxy);
171  sd_inner_proxy_per_key.push_back(sd_inner_proxy);
172  sd_outer_proxy_per_key.push_back(sd_outer_proxy);
173  cache_key_chunks_for_column.push_back(sd_outer_proxy->getGeneration());
174  } else {
175  sd_inner_proxy_per_key.emplace_back();
176  sd_outer_proxy_per_key.emplace_back();
177  }
178  cache_key_chunks.push_back(cache_key_chunks_for_column);
179  }
180  return {sd_inner_proxy_per_key, sd_outer_proxy_per_key, cache_key_chunks};
181 }
std::vector< InnerOuter > inner_outer_pairs_
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:176
const Catalog_Namespace::Catalog * catalog_
#define CHECK(condition)
Definition: Logger.h:187
std::vector< int > ChunkKey
Definition: types.h:35
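Each element of cache_key_chunks is a ChunkKey, i.e. a std::vector<int> holding {db_id, table_id, column_id}; for dictionary-encoded string columns the outer proxy's generation is appended so that a stale dictionary cannot produce a cache hit. With hypothetical ids:

#include <vector>

using ChunkKey = std::vector<int>;  // as defined in types.h

// Hypothetical ids: database 1, inner table 5, columns 3 and 4.
const ChunkKey numeric_col_key{1, 5, 3};     // plain column
const ChunkKey string_col_key{1, 5, 4, 42};  // dict-encoded; 42 = generation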

◆ getEffectiveMemoryLevel()

Data_Namespace::MemoryLevel BaselineJoinHashTable::getEffectiveMemoryLevel ( const std::vector< InnerOuter > &  inner_outer_pairs) const
protected

Definition at line 535 of file BaselineJoinHashTable.cpp.

References Data_Namespace::CPU_LEVEL, executor_, memory_level_, and needs_dictionary_translation().

Referenced by OverlapsJoinHashTable::approximateTupleCount(), approximateTupleCount(), OverlapsJoinHashTable::computeBucketSizes(), OverlapsJoinHashTable::fetchColumnsForDevice(), fetchColumnsForDevice(), and reifyForDevice().

536  {
537  for (const auto& inner_outer_pair : inner_outer_pairs) {
538  if (needs_dictionary_translation(
539          inner_outer_pair.first, inner_outer_pair.second, executor_)) {
540      return Data_Namespace::CPU_LEVEL;
541  }
542  }
543  return memory_level_;
544 }
bool needs_dictionary_translation(const Analyzer::ColumnVar *inner_col, const Analyzer::Expr *outer_col_expr, const Executor *executor)
const Data_Namespace::MemoryLevel memory_level_

◆ getHashType()

JoinHashTableInterface::HashType BaselineJoinHashTable::getHashType ( ) const
overridevirtualnoexcept

Implements JoinHashTableInterface.

Definition at line 1092 of file BaselineJoinHashTable.cpp.

References layout_.

Referenced by codegenSlot().

1092  {
1093  return layout_;
1094 }
JoinHashTableInterface::HashType layout_

◆ getInnerTableId() [1/2]

int BaselineJoinHashTable::getInnerTableId ( ) const
overridevirtualnoexcept

Implements JoinHashTableInterface.

Definition at line 1077 of file BaselineJoinHashTable.cpp.

References CHECK, and inner_outer_pairs_.

Referenced by OverlapsJoinHashTable::getInstance(), getInstance(), OverlapsJoinHashTable::initHashTableOnCpu(), initHashTableOnCpu(), OverlapsJoinHashTable::reifyWithLayout(), reifyWithLayout(), and ~BaselineJoinHashTable().

1077  {
1078  try {
1079  return getInnerTableId(inner_outer_pairs_);
1080  } catch (...) {
1081  CHECK(false);
1082  }
1083  return 0;
1084 }
std::vector< InnerOuter > inner_outer_pairs_
int getInnerTableId() const noexcept override
#define CHECK(condition)
Definition: Logger.h:187

◆ getInnerTableId() [2/2]

int BaselineJoinHashTable::getInnerTableId ( const std::vector< InnerOuter > &  inner_outer_pairs)
staticprotected

Definition at line 1096 of file BaselineJoinHashTable.cpp.

References CHECK.

1097  {
1098  CHECK(!inner_outer_pairs.empty());
1099  const auto first_inner_col = inner_outer_pairs.front().first;
1100  return first_inner_col->get_table_id();
1101 }
#define CHECK(condition)
Definition: Logger.h:187

◆ getInnerTableRteIdx()

int BaselineJoinHashTable::getInnerTableRteIdx ( ) const
overridevirtualnoexcept

Implements JoinHashTableInterface.

Definition at line 1086 of file BaselineJoinHashTable.cpp.

References CHECK, and inner_outer_pairs_.

1086  {
1087  CHECK(!inner_outer_pairs_.empty());
1088  const auto first_inner_col = inner_outer_pairs_.front().first;
1089  return first_inner_col->get_rte_idx();
1090 }
std::vector< InnerOuter > inner_outer_pairs_
#define CHECK(condition)
Definition: Logger.h:187

◆ getInstance()

std::shared_ptr< BaselineJoinHashTable > BaselineJoinHashTable::getInstance ( const std::shared_ptr< Analyzer::BinOper >  condition,
const std::vector< InputTableInfo > &  query_infos,
const Data_Namespace::MemoryLevel  memory_level,
const HashType  preferred_hash_type,
const int  device_count,
ColumnCacheMap &  column_map,
Executor *  executor
)
static

Definition at line 32 of file BaselineJoinHashTable.cpp.

References BaselineJoinHashTable(), get_entries_per_device(), get_inner_query_info(), getInnerTableId(), Fragmenter_Namespace::TableInfo::getNumTuplesUpperBound(), getShardCountForCondition(), Data_Namespace::GPU_LEVEL, InputTableInfo::info, and normalize_column_pairs().

Referenced by Executor::buildHashTableForQualifier().

39  {
40  auto inner_outer_pairs = normalize_column_pairs(
41  condition.get(), *executor->getCatalog(), executor->getTemporaryTables());
42  const auto& query_info =
43  get_inner_query_info(getInnerTableId(inner_outer_pairs), query_infos).info;
44  const auto total_entries = 2 * query_info.getNumTuplesUpperBound();
45  if (total_entries > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
46  throw TooManyHashEntries();
47  }
48  const auto shard_count = memory_level == Data_Namespace::GPU_LEVEL
49      ? BaselineJoinHashTable::getShardCountForCondition(
50            condition.get(), executor, inner_outer_pairs)
51  : 0;
52  const auto entries_per_device =
53  get_entries_per_device(total_entries, shard_count, device_count, memory_level);
54  auto join_hash_table = std::shared_ptr<BaselineJoinHashTable>(
55  new BaselineJoinHashTable(condition,
56  query_infos,
57  memory_level,
58  preferred_hash_type,
59  entries_per_device,
60  column_cache,
61  executor,
62  inner_outer_pairs));
63  join_hash_table->checkHashJoinReplicationConstraint(getInnerTableId(inner_outer_pairs));
64  try {
65  join_hash_table->reify(device_count);
66  } catch (const TableMustBeReplicated& e) {
67  // Throw a runtime error to abort the query
68  join_hash_table->freeHashBufferMemory();
69  throw std::runtime_error(e.what());
70  } catch (const HashJoinFail& e) {
71  // HashJoinFail exceptions log an error and trigger a retry with a join loop (if
72  // possible)
73  join_hash_table->freeHashBufferMemory();
74  throw HashJoinFail(std::string("Could not build a 1-to-1 correspondence for columns "
75  "involved in equijoin | ") +
76  e.what());
77  } catch (const ColumnarConversionNotSupported& e) {
78  throw HashJoinFail(std::string("Could not build hash tables for equijoin | ") +
79  e.what());
80  } catch (const OutOfMemory& e) {
81  throw HashJoinFail(
82  std::string("Ran out of memory while building hash tables for equijoin | ") +
83  e.what());
84  } catch (const std::exception& e) {
85  throw std::runtime_error(
86  std::string("Fatal error while attempting to build hash tables for join: ") +
87  e.what());
88  }
89  return join_hash_table;
90 }
Fragmenter_Namespace::TableInfo info
Definition: InputMetadata.h:35
std::vector< InnerOuter > normalize_column_pairs(const Analyzer::BinOper *condition, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
BaselineJoinHashTable(const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const size_t entry_count, ColumnCacheMap &column_map, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
size_t get_entries_per_device(const size_t total_entries, const size_t shard_count, const size_t device_count, const Data_Namespace::MemoryLevel memory_level)
int getInnerTableId() const noexcept override
static size_t getShardCountForCondition(const Analyzer::BinOper *condition, const Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
const InputTableInfo & get_inner_query_info(const int inner_table_id, const std::vector< InputTableInfo > &query_infos)
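A hedged usage sketch from the caller's side (Executor::buildHashTableForQualifier); condition, query_infos, column_cache, device_count and executor are assumed to already exist in the surrounding scope:

std::shared_ptr<BaselineJoinHashTable> join_hash_table;
try {
  join_hash_table = BaselineJoinHashTable::getInstance(
      condition,                                   // Analyzer::BinOper for the equijoin
      query_infos,
      Data_Namespace::GPU_LEVEL,                   // or CPU_LEVEL
      JoinHashTableInterface::HashType::OneToOne,  // preferred; may become OneToMany
      device_count,
      column_cache,
      executor);
} catch (const HashJoinFail&) {
  // the caller may fall back to a loop join here
}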

◆ getJoinHashBuffer()

int64_t BaselineJoinHashTable::getJoinHashBuffer ( const ExecutorDeviceType  device_type,
const int  device_id 
)
overridevirtualnoexcept

Implements JoinHashTableInterface.

Definition at line 133 of file BaselineJoinHashTable.cpp.

References CHECK, CHECK_LT, CPU, and cpu_hash_table_buff_.

134  {
135  if (device_type == ExecutorDeviceType::CPU && !cpu_hash_table_buff_) {
136  return 0;
137  }
138 #ifdef HAVE_CUDA
139  CHECK_LT(static_cast<size_t>(device_id), gpu_hash_table_buff_.size());
140  return device_type == ExecutorDeviceType::CPU
141  ? reinterpret_cast<int64_t>(&(*cpu_hash_table_buff_)[0])
142  : reinterpret_cast<int64_t>(gpu_hash_table_buff_[device_id]->getMemoryPtr());
143 #else
144  CHECK(device_type == ExecutorDeviceType::CPU);
145  return reinterpret_cast<int64_t>(&(*cpu_hash_table_buff_)[0]);
146 #endif
147 }
std::shared_ptr< std::vector< int8_t > > cpu_hash_table_buff_
#define CHECK_LT(x, y)
Definition: Logger.h:197
#define CHECK(condition)
Definition: Logger.h:187

◆ getKeyComponentCount()

size_t BaselineJoinHashTable::getKeyComponentCount ( ) const
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 531 of file BaselineJoinHashTable.cpp.

References inner_outer_pairs_.

Referenced by codegenKey(), codegenMatchingSet(), codegenSlot(), initHashTableForDevice(), initHashTableOnCpu(), and offsetBufferOff().

531  {
532  return inner_outer_pairs_.size();
533 }
std::vector< InnerOuter > inner_outer_pairs_

◆ getKeyComponentWidth()

size_t BaselineJoinHashTable::getKeyComponentWidth ( ) const
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 519 of file BaselineJoinHashTable.cpp.

References CHECK_EQ, and inner_outer_pairs_.

Referenced by codegenKey(), codegenMatchingSet(), codegenSlot(), initHashTableForDevice(), initHashTableOnCpu(), and offsetBufferOff().

519  {
520  for (const auto& inner_outer_pair : inner_outer_pairs_) {
521  const auto inner_col = inner_outer_pair.first;
522  const auto& inner_col_ti = inner_col->get_type_info();
523  if (inner_col_ti.get_logical_size() > 4) {
524  CHECK_EQ(8, inner_col_ti.get_logical_size());
525  return 8;
526  }
527  }
528  return 4;
529 }
#define CHECK_EQ(x, y)
Definition: Logger.h:195
std::vector< InnerOuter > inner_outer_pairs_

◆ getShardCountForCondition()

size_t BaselineJoinHashTable::getShardCountForCondition ( const Analyzer::BinOper *  condition,
const Executor *  executor,
const std::vector< InnerOuter > &  inner_outer_pairs 
)
static

Definition at line 120 of file BaselineJoinHashTable.cpp.

References get_shard_count().

Referenced by OverlapsJoinHashTable::getInstance(), getInstance(), shardCount(), and Executor::skipFragmentPair().

123  {
124  for (const auto& inner_outer_pair : inner_outer_pairs) {
125  const auto pair_shard_count = get_shard_count(inner_outer_pair, executor);
126  if (pair_shard_count) {
127  return pair_shard_count;
128  }
129  }
130  return 0;
131 }
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)

◆ hashPtr()

llvm::Value * BaselineJoinHashTable::hashPtr ( const size_t  index)
protected

Definition at line 1064 of file BaselineJoinHashTable.cpp.

References JoinHashTable::codegenHashTableLoad(), executor_, LL_BUILDER, and LL_CONTEXT.

Referenced by codegenSlot().

1064  {
1065  auto hash_ptr = JoinHashTable::codegenHashTableLoad(index, executor_);
1066  const auto pi8_type = llvm::Type::getInt8PtrTy(LL_CONTEXT);
1067  return hash_ptr->getType()->isPointerTy()
1068  ? LL_BUILDER.CreatePointerCast(hash_ptr, pi8_type)
1069  : LL_BUILDER.CreateIntToPtr(hash_ptr, pi8_type);
1070 }
#define LL_CONTEXT
#define LL_BUILDER
static llvm::Value * codegenHashTableLoad(const size_t table_idx, Executor *executor)

◆ initHashTableForDevice()

int BaselineJoinHashTable::initHashTableForDevice ( const std::vector< JoinColumn > &  join_columns,
const std::vector< JoinColumnTypeInfo > &  join_column_types,
const std::vector< JoinBucketInfo > &  join_buckets,
const JoinHashTableInterface::HashType  layout,
const Data_Namespace::MemoryLevel  effective_memory_level,
const int  device_id 
)
protected

Definition at line 877 of file BaselineJoinHashTable.cpp.

References CudaAllocator::allocGpuAbstractBuffer(), catalog_, CHECK, CHECK_EQ, copy_to_gpu(), cpu_hash_table_buff_, cpu_hash_table_buff_mutex_, Data_Namespace::CPU_LEVEL, emitted_keys_count_, entry_count_, Catalog_Namespace::Catalog::getDataMgr(), getKeyComponentCount(), getKeyComponentWidth(), Data_Namespace::GPU_LEVEL, initHashTableOnCpu(), initHashTableOnGpu(), memory_level_, JoinHashTableInterface::OneToMany, JoinHashTableInterface::OneToOne, and VLOG.

Referenced by reifyForDevice().

883  {
884  const auto key_component_width = getKeyComponentWidth();
885  const auto key_component_count = getKeyComponentCount();
886  int err = 0;
887 #ifdef HAVE_CUDA
888  auto& data_mgr = catalog_->getDataMgr();
889  if (memory_level_ == Data_Namespace::GPU_LEVEL) {
890  const auto entry_size =
891  (key_component_count +
892  (layout == JoinHashTableInterface::HashType::OneToOne ? 1 : 0)) *
893  key_component_width;
894  const auto keys_for_all_rows = emitted_keys_count_;
895  const size_t one_to_many_hash_entries =
896      layout == JoinHashTableInterface::HashType::OneToMany
897          ? 2 * entry_count_ + keys_for_all_rows
898  : 0;
899  const size_t hash_table_size =
900  entry_size * entry_count_ + one_to_many_hash_entries * sizeof(int32_t);
901 
902  // We can't allocate more than 2GB contiguous memory on GPU and each entry is 4 bytes.
903  if (hash_table_size > std::numeric_limits<int32_t>::max()) {
904  throw TooManyHashEntries();
905  }
906 
907  VLOG(1) << "Initializing GPU Hash Table for device " << device_id << " with "
908  << entry_count_ << " hash entries and " << one_to_many_hash_entries
909  << " entries in the one to many buffer";
910  VLOG(1) << "Total hash table size: " << hash_table_size << " Bytes";
911  gpu_hash_table_buff_[device_id] =
912  CudaAllocator::allocGpuAbstractBuffer(&data_mgr, hash_table_size, device_id);
913  }
914 #else
915  CHECK_EQ(Data_Namespace::CPU_LEVEL, effective_memory_level);
916 #endif
917  if (effective_memory_level == Data_Namespace::CPU_LEVEL) {
918  std::lock_guard<std::mutex> cpu_hash_table_buff_lock(cpu_hash_table_buff_mutex_);
919  err = initHashTableOnCpu(join_columns, join_column_types, join_bucket_info, layout);
920  // Transfer the hash table on the GPU if we've only built it on CPU
921  // but the query runs on GPU (join on dictionary encoded columns).
922  // Don't transfer the buffer if there was an error since we'll bail anyway.
923  if (memory_level_ == Data_Namespace::GPU_LEVEL && !err) {
924 #ifdef HAVE_CUDA
925  copy_to_gpu(
926  &data_mgr,
927  reinterpret_cast<CUdeviceptr>(gpu_hash_table_buff_[device_id]->getMemoryPtr()),
928  &(*cpu_hash_table_buff_)[0],
929  cpu_hash_table_buff_->size() * sizeof((*cpu_hash_table_buff_)[0]),
930  device_id);
931 #else
932  CHECK(false);
933 #endif
934  }
935  } else {
936  err = initHashTableOnGpu(join_columns,
937  join_column_types,
938  join_bucket_info,
939  layout,
940  key_component_width,
941  key_component_count,
942  device_id);
943  }
944  return err;
945 }
#define CHECK_EQ(x, y)
Definition: Logger.h:195
virtual int initHashTableOnGpu(const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const JoinHashTableInterface::HashType layout, const size_t key_component_width, const size_t key_component_count, const int device_id)
virtual size_t getKeyComponentCount() const
std::shared_ptr< std::vector< int8_t > > cpu_hash_table_buff_
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
void copy_to_gpu(Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
Definition: GpuMemUtils.cpp:31
const Catalog_Namespace::Catalog * catalog_
virtual size_t getKeyComponentWidth() const
static Data_Namespace::AbstractBuffer * allocGpuAbstractBuffer(Data_Namespace::DataMgr *data_mgr, const size_t num_bytes, const int device_id)
const Data_Namespace::MemoryLevel memory_level_
virtual int initHashTableOnCpu(const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_bucket_info, const JoinHashTableInterface::HashType layout)
#define CHECK(condition)
Definition: Logger.h:187
#define VLOG(n)
Definition: Logger.h:277
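The size computed above is hash_table_size = entry_size * entry_count_ + one_to_many_hash_entries * sizeof(int32_t), with entry_size = (key_component_count + (OneToOne ? 1 : 0)) * key_component_width. A worked example under assumed inputs:

#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  // Assumed inputs: two 8-byte key components, OneToMany layout,
  // one million hash entries, three million emitted keys.
  const size_t key_component_count = 2;
  const size_t key_component_width = 8;
  const bool one_to_one = false;
  const size_t entry_count = 1000000;
  const size_t emitted_keys_count = 3000000;

  const size_t entry_size =
      (key_component_count + (one_to_one ? 1 : 0)) * key_component_width;
  const size_t one_to_many_hash_entries =
      one_to_one ? 0 : 2 * entry_count + emitted_keys_count;
  const size_t hash_table_size =
      entry_size * entry_count + one_to_many_hash_entries * sizeof(int32_t);

  // 16 * 1e6 + 5e6 * 4 = 36,000,000 bytes, comfortably under the 2GB cap.
  std::cout << hash_table_size << " bytes\n";
  return 0;
}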

◆ initHashTableOnCpu()

int BaselineJoinHashTable::initHashTableOnCpu ( const std::vector< JoinColumn > &  join_columns,
const std::vector< JoinColumnTypeInfo > &  join_column_types,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const JoinHashTableInterface::HashType  layout 
)
protectedvirtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 546 of file BaselineJoinHashTable.cpp.

References CHECK, condition_, cpu_hash_table_buff_, cpu_threads(), entry_count_, fill_baseline_hash_join_buff_32(), fill_baseline_hash_join_buff_64(), fill_one_to_many_baseline_hash_table_32(), fill_one_to_many_baseline_hash_table_64(), getCompositeKeyInfo(), getInnerTableId(), getKeyComponentCount(), getKeyComponentWidth(), init_baseline_hash_join_buff_32(), init_baseline_hash_join_buff_64(), init_hash_join_buff(), initHashTableOnCpuFromCache(), JoinHashTableInterface::OneToMany, JoinHashTableInterface::OneToOne, putHashTableOnCpuToCache(), and VLOG.

Referenced by initHashTableForDevice().

550  {
551  const auto composite_key_info = getCompositeKeyInfo();
552  CHECK(!join_columns.empty());
553  HashTableCacheKey cache_key{join_columns.front().num_elems,
554  composite_key_info.cache_key_chunks,
555  condition_->get_optype()};
556  initHashTableOnCpuFromCache(cache_key);
557  if (cpu_hash_table_buff_) {
558  return 0;
559  }
560  const auto key_component_width = getKeyComponentWidth();
561  const auto key_component_count = getKeyComponentCount();
562  const auto entry_size =
563  (key_component_count +
564  (layout == JoinHashTableInterface::HashType::OneToOne ? 1 : 0)) *
565  key_component_width;
566  const auto keys_for_all_rows = join_columns.front().num_elems;
567  const size_t one_to_many_hash_entries =
568      layout == JoinHashTableInterface::HashType::OneToMany
569          ? 2 * entry_count_ + keys_for_all_rows
570  : 0;
571  const size_t hash_table_size =
572  entry_size * entry_count_ + one_to_many_hash_entries * sizeof(int32_t);
573 
574  // We can't allocate more than 2GB contiguous memory on GPU and each entry is 4 bytes.
575  if (hash_table_size > std::numeric_limits<int32_t>::max()) {
576  throw TooManyHashEntries();
577  }
578 
579  VLOG(1) << "Initializing CPU Join Hash Table with " << entry_count_
580  << " hash entries and " << one_to_many_hash_entries
581  << " entries in the one to many buffer";
582  VLOG(1) << "Total hash table size: " << hash_table_size << " Bytes";
583 
584  cpu_hash_table_buff_.reset(new std::vector<int8_t>(hash_table_size));
585  int thread_count = cpu_threads();
586  std::vector<std::future<void>> init_cpu_buff_threads;
587  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
588  init_cpu_buff_threads.emplace_back(
589  std::async(std::launch::async,
590  [this,
591  key_component_count,
592  key_component_width,
593  thread_idx,
594  thread_count,
595  layout] {
596  switch (key_component_width) {
597  case 4:
598  init_baseline_hash_join_buff_32(
599  &(*cpu_hash_table_buff_)[0],
600  entry_count_,
601  key_component_count,
602  layout == JoinHashTableInterface::HashType::OneToOne,
603  -1,
604  thread_idx,
605  thread_count);
606  break;
607  case 8:
608  init_baseline_hash_join_buff_64(
609  &(*cpu_hash_table_buff_)[0],
610  entry_count_,
611  key_component_count,
612  layout == JoinHashTableInterface::HashType::OneToOne,
613  -1,
614  thread_idx,
615  thread_count);
616  break;
617  default:
618  CHECK(false);
619  }
620  }));
621  }
622  for (auto& child : init_cpu_buff_threads) {
623  child.get();
624  }
625  std::vector<std::future<int>> fill_cpu_buff_threads;
626  for (int thread_idx = 0; thread_idx < thread_count; ++thread_idx) {
627  fill_cpu_buff_threads.emplace_back(std::async(
628  std::launch::async,
629  [this,
630  &composite_key_info,
631  &join_columns,
632  &join_column_types,
633  key_component_count,
634  key_component_width,
635  layout,
636  thread_idx,
637  thread_count] {
638  switch (key_component_width) {
639  case 4: {
640  const auto key_handler =
641  GenericKeyHandler(key_component_count,
642  true,
643  &join_columns[0],
644  &join_column_types[0],
645  &composite_key_info.sd_inner_proxy_per_key[0],
646  &composite_key_info.sd_outer_proxy_per_key[0]);
647  return fill_baseline_hash_join_buff_32(
648  &(*cpu_hash_table_buff_)[0],
649  entry_count_,
650  -1,
651  key_component_count,
652  layout == JoinHashTableInterface::HashType::OneToOne,
653  &key_handler,
654  join_columns[0].num_elems,
655  thread_idx,
656  thread_count);
657  break;
658  }
659  case 8: {
660  const auto key_handler =
661  GenericKeyHandler(key_component_count,
662  true,
663  &join_columns[0],
664  &join_column_types[0],
665  &composite_key_info.sd_inner_proxy_per_key[0],
666  &composite_key_info.sd_outer_proxy_per_key[0]);
667  return fill_baseline_hash_join_buff_64(
668  &(*cpu_hash_table_buff_)[0],
669  entry_count_,
670  -1,
671  key_component_count,
672  layout == JoinHashTableInterface::HashType::OneToOne,
673  &key_handler,
674  join_columns[0].num_elems,
675  thread_idx,
676  thread_count);
677  break;
678  }
679  default:
680  CHECK(false);
681  }
682  return -1;
683  }));
684  }
685  int err = 0;
686  for (auto& child : fill_cpu_buff_threads) {
687  int partial_err = child.get();
688  if (partial_err) {
689  err = partial_err;
690  }
691  }
692  if (err) {
693  cpu_hash_table_buff_.reset();
694  return err;
695  }
696  if (layout == JoinHashTableInterface::HashType::OneToMany) {
697  auto one_to_many_buff = reinterpret_cast<int32_t*>(&(*cpu_hash_table_buff_)[0] +
698  entry_count_ * entry_size);
699  init_hash_join_buff(one_to_many_buff, entry_count_, -1, 0, 1);
700  switch (key_component_width) {
701  case 4: {
702  const auto composite_key_dict =
703  reinterpret_cast<int32_t*>(&(*cpu_hash_table_buff_)[0]);
704  fill_one_to_many_baseline_hash_table_32(one_to_many_buff,
705  composite_key_dict,
706  entry_count_,
707  -1,
708  key_component_count,
709  join_columns,
710  join_column_types,
711  join_bucket_info,
712  composite_key_info.sd_inner_proxy_per_key,
713  composite_key_info.sd_outer_proxy_per_key,
714  thread_count);
715  break;
716  }
717  case 8: {
718  const auto composite_key_dict =
719  reinterpret_cast<int64_t*>(&(*cpu_hash_table_buff_)[0]);
720  fill_one_to_many_baseline_hash_table_64(one_to_many_buff,
721  composite_key_dict,
722  entry_count_,
723  -1,
724  key_component_count,
725  join_columns,
726  join_column_types,
727  join_bucket_info,
728  composite_key_info.sd_inner_proxy_per_key,
729  composite_key_info.sd_outer_proxy_per_key,
730  thread_count);
731  break;
732  }
733  default:
734  CHECK(false);
735  }
736  }
737  if (!err && getInnerTableId() > 0) {
738  putHashTableOnCpuToCache(cache_key);
739  }
740  return err;
741 }
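
The sizing arithmetic near the top of this listing is worth restating on its own. Below is a minimal sketch of it as a standalone helper; baseline_hash_table_bytes is a hypothetical free function (not part of the class), assuming the layout rules stated in the listing: a OneToOne entry stores its key components plus one value slot, while a OneToMany table appends 2 * entry_count int32 offset/count slots plus one int32 payload slot per input row after the composite key dictionary.

#include <cstddef>
#include <cstdint>

// Hypothetical helper mirroring the sizing arithmetic in the listing above.
std::size_t baseline_hash_table_bytes(const std::size_t entry_count,
                                      const std::size_t keys_for_all_rows,
                                      const std::size_t key_component_count,
                                      const std::size_t key_component_width,
                                      const bool one_to_one) {
  // OneToOne entries carry an extra value slot alongside the key components.
  const std::size_t entry_size =
      (key_component_count + (one_to_one ? 1 : 0)) * key_component_width;
  // OneToMany layouts add offset + count slots plus one payload slot per row.
  const std::size_t one_to_many_hash_entries =
      one_to_one ? 0 : 2 * entry_count + keys_for_all_rows;
  return entry_size * entry_count +
         one_to_many_hash_entries * sizeof(std::int32_t);
}

For example, one million entries with two 8-byte key components in a OneToOne layout come to (2 + 1) * 8 * 1,000,000 = 24,000,000 bytes, comfortably under the int32 ceiling the listing checks against.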
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ initHashTableOnCpuFromCache()

void BaselineJoinHashTable::initHashTableOnCpuFromCache ( const HashTableCacheKey &  key)
protected

Definition at line 1128 of file BaselineJoinHashTable.cpp.

References cpu_hash_table_buff_, emitted_keys_count_, entry_count_, hash_table_cache_, hash_table_cache_mutex_, and layout_.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), initHashTableOnCpu(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1128  {
1129  std::lock_guard<std::mutex> hash_table_cache_lock(hash_table_cache_mutex_);
1130  for (const auto& kv : hash_table_cache_) {
1131  if (kv.first == key) {
1132  cpu_hash_table_buff_ = kv.second.buffer;
1133  layout_ = kv.second.type;
1134  entry_count_ = kv.second.entry_count;
1135  emitted_keys_count_ = kv.second.emitted_keys_count;
1136  break;
1137  }
1138  }
1139 }
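
The cache is a static, mutex-guarded vector of key/value pairs scanned linearly; on a hit, the cached buffer pointer and metadata are copied into the instance. A generic sketch of that lookup pattern follows, where Key and Value are hypothetical stand-ins for HashTableCacheKey and HashTableCacheValue:

#include <mutex>
#include <optional>
#include <utility>
#include <vector>

// Generic sketch of the guarded linear-scan lookup used above. The value is
// copied out while the lock is held, so the caller never retains a reference
// into the shared vector.
template <typename Key, typename Value>
std::optional<Value> find_cached(const std::vector<std::pair<Key, Value>>& cache,
                                 std::mutex& cache_mutex,
                                 const Key& key) {
  std::lock_guard<std::mutex> lock(cache_mutex);
  for (const auto& kv : cache) {
    if (kv.first == key) {
      return kv.second;  // hit: copy the cached buffer pointer and metadata
    }
  }
  return std::nullopt;  // miss: the caller builds the table from scratch
}

A linear scan is O(n), but the cache holds few entries in practice, and because the cached buffer is a shared_ptr, a hit shares the underlying hash table rather than duplicating it.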
+ Here is the caller graph for this function:

◆ initHashTableOnGpu()

int BaselineJoinHashTable::initHashTableOnGpu ( const std::vector< JoinColumn > &  join_columns,
const std::vector< JoinColumnTypeInfo > &  join_column_types,
const std::vector< JoinBucketInfo > &  join_bucket_info,
const JoinHashTableInterface::HashType  layout,
const size_t  key_component_width,
const size_t  key_component_count,
const int  device_id 
)
protected, virtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 743 of file BaselineJoinHashTable.cpp.

References ThrustAllocator::allocateScopedBuffer(), catalog_, copy_from_gpu(), copy_to_gpu(), entry_count_, fill_baseline_hash_join_buff_on_device_32(), fill_baseline_hash_join_buff_on_device_64(), fill_one_to_many_baseline_hash_table_on_device_32(), fill_one_to_many_baseline_hash_table_on_device_64(), Catalog_Namespace::Catalog::getDataMgr(), init_baseline_hash_join_buff_on_device_32(), init_baseline_hash_join_buff_on_device_64(), init_hash_join_buff_on_device(), JoinHashTableInterface::OneToMany, JoinHashTableInterface::OneToOne, transfer_object_to_gpu(), transfer_pod_vector_to_gpu(), and UNREACHABLE.

Referenced by initHashTableForDevice().

750  {
751  int err = 0;
752 #ifdef HAVE_CUDA
753  auto& data_mgr = catalog_->getDataMgr();
754  ThrustAllocator allocator(&data_mgr, device_id);
755  auto dev_err_buff =
756  reinterpret_cast<CUdeviceptr>(allocator.allocateScopedBuffer(sizeof(int)));
757  copy_to_gpu(&data_mgr, dev_err_buff, &err, sizeof(err), device_id);
758  switch (key_component_width) {
759  case 4:
760  init_baseline_hash_join_buff_on_device_32(
761  reinterpret_cast<int8_t*>(gpu_hash_table_buff_[device_id]->getMemoryPtr()),
762  entry_count_,
763  key_component_count,
764  layout == JoinHashTableInterface::HashType::OneToOne,
765  -1,
766  block_size_,
767  grid_size_);
768  break;
769  case 8:
770  init_baseline_hash_join_buff_on_device_64(
771  reinterpret_cast<int8_t*>(gpu_hash_table_buff_[device_id]->getMemoryPtr()),
772  entry_count_,
773  key_component_count,
774  layout == JoinHashTableInterface::HashType::OneToOne,
775  -1,
776  block_size_,
777  grid_size_);
778  break;
779  default:
780  UNREACHABLE();
781  }
782  auto join_columns_gpu = transfer_pod_vector_to_gpu(join_columns, allocator);
783  auto hash_buff =
784  reinterpret_cast<int8_t*>(gpu_hash_table_buff_[device_id]->getMemoryPtr());
785  auto join_column_types_gpu = transfer_pod_vector_to_gpu(join_column_types, allocator);
786 
787  const auto key_handler = GenericKeyHandler(key_component_count,
788  true,
789  join_columns_gpu,
790  join_column_types_gpu,
791  nullptr,
792  nullptr);
793  const auto key_handler_gpu = transfer_object_to_gpu(key_handler, allocator);
794  switch (key_component_width) {
795  case 4: {
796  fill_baseline_hash_join_buff_on_device_32(
797  hash_buff,
798  entry_count_,
799  -1,
800  key_component_count,
801  layout == JoinHashTableInterface::HashType::OneToOne,
802  reinterpret_cast<int*>(dev_err_buff),
803  key_handler_gpu,
804  join_columns.front().num_elems,
805  block_size_,
806  grid_size_);
807  copy_from_gpu(&data_mgr, &err, dev_err_buff, sizeof(err), device_id);
808  break;
809  }
810  case 8: {
811  fill_baseline_hash_join_buff_on_device_64(
812  hash_buff,
813  entry_count_,
814  -1,
815  key_component_count,
816  layout == JoinHashTableInterface::HashType::OneToOne,
817  reinterpret_cast<int*>(dev_err_buff),
818  key_handler_gpu,
819  join_columns.front().num_elems,
820  block_size_,
821  grid_size_);
822  copy_from_gpu(&data_mgr, &err, dev_err_buff, sizeof(err), device_id);
823  break;
824  }
825  default:
826  UNREACHABLE();
827  }
828  if (err) {
829  return err;
830  }
831  if (layout == JoinHashTableInterface::HashType::OneToMany) {
832  const auto entry_size = key_component_count * key_component_width;
833  auto one_to_many_buff = reinterpret_cast<int32_t*>(
834  gpu_hash_table_buff_[device_id]->getMemoryPtr() + entry_count_ * entry_size);
835  switch (key_component_width) {
836  case 4: {
837  const auto composite_key_dict =
838  reinterpret_cast<int32_t*>(gpu_hash_table_buff_[device_id]->getMemoryPtr());
839  init_hash_join_buff_on_device(
840  one_to_many_buff, entry_count_, -1, block_size_, grid_size_);
841  fill_one_to_many_baseline_hash_table_on_device_32(one_to_many_buff,
842  composite_key_dict,
843  entry_count_,
844  -1,
845  key_component_count,
846  key_handler_gpu,
847  join_columns.front().num_elems,
848  block_size_,
849  grid_size_);
850  break;
851  }
852  case 8: {
853  const auto composite_key_dict =
854  reinterpret_cast<int64_t*>(gpu_hash_table_buff_[device_id]->getMemoryPtr());
855  init_hash_join_buff_on_device(
856  one_to_many_buff, entry_count_, -1, block_size_, grid_size_);
857  fill_one_to_many_baseline_hash_table_on_device_64(one_to_many_buff,
858  composite_key_dict,
859  entry_count_,
860  -1,
861  key_handler_gpu,
862  join_columns.front().num_elems,
863  block_size_,
864  grid_size_);
865  break;
866  }
867  default:
868  UNREACHABLE();
869  }
870  }
871 #else
872  UNREACHABLE();
873 #endif
874  return err;
875 }
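
A notable detail above is how GPU-side failures reach the host: an int error slot is zeroed, copied to the device, written by the fill kernel on failure, then copied back. The sketch below models that round-trip with a byte vector standing in for device memory so the control flow is runnable anywhere; all names here are hypothetical stand-ins for copy_to_gpu, copy_from_gpu, and the fill kernels.

#include <cstddef>
#include <cstring>
#include <vector>

namespace {
std::vector<char> device_mem(sizeof(int));  // stand-in for GPU memory

void to_device(const void* src, std::size_t n) {
  std::memcpy(device_mem.data(), src, n);
}
void from_device(void* dst, std::size_t n) {
  std::memcpy(dst, device_mem.data(), n);
}
// Stand-in for a fill kernel: writes a nonzero code into the error slot
// on failure (e.g. when a one-to-one slot is already taken).
void fill_kernel(int* dev_err, bool duplicate_key_seen) {
  if (duplicate_key_seen) {
    *dev_err = -1;
  }
}
}  // namespace

int run_fill(bool duplicate_key_seen) {
  int err = 0;
  to_device(&err, sizeof(err));  // zero the device-side error slot
  fill_kernel(reinterpret_cast<int*>(device_mem.data()), duplicate_key_seen);
  from_device(&err, sizeof(err));  // read the verdict back to the host
  return err;  // nonzero propagates out of the init routine
}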
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ isBitwiseEq()

bool BaselineJoinHashTable::isBitwiseEq ( ) const
protected

Definition at line 1165 of file BaselineJoinHashTable.cpp.

References condition_, and kBW_EQ.

Referenced by OverlapsJoinHashTable::fetchColumnsForDevice(), fetchColumnsForDevice(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1165  {
1166  return condition_->get_optype() == kBW_EQ;
1167 }
+ Here is the caller graph for this function:

◆ offsetBufferOff()

size_t BaselineJoinHashTable::offsetBufferOff ( ) const
override, virtual, noexcept

Implements JoinHashTableInterface.

Definition at line 1010 of file BaselineJoinHashTable.cpp.

References CHECK, entry_count_, getKeyComponentCount(), getKeyComponentWidth(), layout_, and JoinHashTableInterface::OneToMany.

Referenced by codegenMatchingSet(), and countBufferOff().

1010  {
1011  CHECK(layout_ == JoinHashTableInterface::HashType::OneToMany);
1012  const auto key_component_width = getKeyComponentWidth();
1013  CHECK(key_component_width == 4 || key_component_width == 8);
1014  const auto key_component_count = getKeyComponentCount();
1015  return entry_count_ * key_component_count * key_component_width;
1016 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ payloadBufferOff()

size_t BaselineJoinHashTable::payloadBufferOff ( ) const
override, virtual, noexcept

Implements JoinHashTableInterface.

Reimplemented in OverlapsJoinHashTable.

Definition at line 1023 of file BaselineJoinHashTable.cpp.

References CHECK, countBufferOff(), getComponentBufferSize(), layout_, and JoinHashTableInterface::OneToMany.

1023  {
1024  CHECK(layout_ == JoinHashTableInterface::HashType::OneToMany);
1025  return countBufferOff() + getComponentBufferSize();
1026  }
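
Taken together with offsetBufferOff() and countBufferOff(), this describes how the three one-to-many sub-buffers line up after the composite key dictionary. The sketch below assumes, as the cross-references suggest, that getComponentBufferSize() equals entry_count * sizeof(int32_t); layout_offsets is a hypothetical free function, not the class API.

#include <cstddef>
#include <cstdint>

struct OneToManyOffsets {
  std::size_t offsets;  // offsetBufferOff(): end of the composite key dictionary
  std::size_t counts;   // countBufferOff(): after one int32 offset slot per entry
  std::size_t payload;  // payloadBufferOff(): after one int32 count slot per entry
};

OneToManyOffsets layout_offsets(const std::size_t entry_count,
                                const std::size_t key_component_count,
                                const std::size_t key_component_width) {
  // Assumed size of each component buffer: one int32 per hash entry.
  const std::size_t component_size = entry_count * sizeof(std::int32_t);
  OneToManyOffsets out;
  out.offsets = entry_count * key_component_count * key_component_width;
  out.counts = out.offsets + component_size;
  out.payload = out.counts + component_size;
  return out;
}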
+ Here is the call graph for this function:

◆ putHashTableOnCpuToCache()

void BaselineJoinHashTable::putHashTableOnCpuToCache ( const HashTableCacheKey &  key)
protected

Definition at line 1141 of file BaselineJoinHashTable.cpp.

References cpu_hash_table_buff_, emitted_keys_count_, entry_count_, hash_table_cache_, hash_table_cache_mutex_, and layout_.

Referenced by OverlapsJoinHashTable::initHashTableOnCpu(), initHashTableOnCpu(), and BaselineJoinHashTable::HashTableCacheKey::operator<().

1141  {
1142  std::lock_guard<std::mutex> hash_table_cache_lock(hash_table_cache_mutex_);
1143  for (const auto& kv : hash_table_cache_) {
1144  if (std::get<0>(kv) == key) {
1145  return;
1146  }
1147  }
1148  hash_table_cache_.emplace_back(
1149  key,
1150  HashTableCacheValue{
1151  cpu_hash_table_buff_, layout_, entry_count_, emitted_keys_count_});
1152  }
+ Here is the caller graph for this function:

◆ reify()

void BaselineJoinHashTable::reify ( const int  device_count)
protected

Definition at line 183 of file BaselineJoinHashTable.cpp.

References CHECK_LT, condition_, freeHashBufferMemory(), HashTypeCache::get(), getCompositeKeyInfo(), layout_, JoinHashTableInterface::OneToMany, reifyWithLayout(), HashTypeCache::set(), and VLOG.

183  {
184  CHECK_LT(0, device_count);
185 #ifdef HAVE_CUDA
186  gpu_hash_table_buff_.resize(device_count);
187 #endif // HAVE_CUDA
188  const auto composite_key_info = getCompositeKeyInfo();
189  const auto type_and_found = HashTypeCache::get(composite_key_info.cache_key_chunks);
190  const auto layout = type_and_found.second ? type_and_found.first : layout_;
191 
192  if (condition_->is_overlaps_oper()) {
193  try {
194  reifyWithLayout(device_count, JoinHashTableInterface::HashType::OneToMany);
195  return;
196  } catch (const std::exception& e) {
197  VLOG(1) << "Caught exception while building overlaps baseline hash table: "
198  << e.what();
199  throw;
200  }
201  }
202 
203  try {
204  reifyWithLayout(device_count, layout);
205  } catch (const std::exception& e) {
206  VLOG(1) << "Caught exception while building baseline hash table: " << e.what();
207  freeHashBufferMemory();
208  HashTypeCache::set(composite_key_info.cache_key_chunks,
209  JoinHashTableInterface::HashType::OneToMany);
210  reifyWithLayout(device_count, JoinHashTableInterface::HashType::OneToMany);
211  }
212 }
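
The catch block encodes a layout-fallback policy: try the preferred layout, and if construction throws, release any partial buffers, pin the key to OneToMany in the layout cache so later builds skip the failed layout, and rebuild. A generic sketch of that pattern, with hypothetical names throughout:

#include <functional>
#include <stdexcept>

enum class Layout { OneToOne, OneToMany };

// Hypothetical stand-in for the fallback logic in reify(); the callbacks
// model reifyWithLayout, freeHashBufferMemory, and HashTypeCache::set.
void build_with_fallback(const std::function<void(Layout)>& build,
                         const std::function<void()>& free_buffers,
                         const std::function<void(Layout)>& remember,
                         const Layout preferred) {
  try {
    build(preferred);
  } catch (const std::exception&) {
    free_buffers();                      // drop any partially built table
    remember(Layout::OneToMany);         // skip the failed layout next time
    build(Layout::OneToMany);            // one-to-many can represent any join
  }
}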
+ Here is the call graph for this function:

◆ reifyForDevice()

void BaselineJoinHashTable::reifyForDevice ( const ColumnsForDevice &  columns_for_device,
const JoinHashTableInterface::HashType  layout,
const int  device_id 
)
protected

Definition at line 461 of file BaselineJoinHashTable.cpp.

References ERR_FAILED_TO_FETCH_COLUMN, ERR_FAILED_TO_JOIN_ON_VIRTUAL_COLUMN, getEffectiveMemoryLevel(), initHashTableForDevice(), inner_outer_pairs_, BaselineJoinHashTable::ColumnsForDevice::join_buckets, BaselineJoinHashTable::ColumnsForDevice::join_column_types, BaselineJoinHashTable::ColumnsForDevice::join_columns, and to_string().

Referenced by OverlapsJoinHashTable::reifyWithLayout(), and reifyWithLayout().

463  {
464  const auto effective_memory_level = getEffectiveMemoryLevel(inner_outer_pairs_);
465  const auto err = initHashTableForDevice(columns_for_device.join_columns,
466  columns_for_device.join_column_types,
467  columns_for_device.join_buckets,
468  layout,
469  effective_memory_level,
470  device_id);
471  if (err) {
472  switch (err) {
473  case ERR_FAILED_TO_FETCH_COLUMN:
474  throw FailedToFetchColumn();
475  case ERR_FAILED_TO_JOIN_ON_VIRTUAL_COLUMN:
476  throw FailedToJoinOnVirtualColumn();
477  default:
478  throw HashJoinFail(
479  std::string("Unrecognized error when initializing baseline hash table (") +
480  std::to_string(err) + std::string(")"));
481  }
482  }
483 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ reifyWithLayout()

void BaselineJoinHashTable::reifyWithLayout ( const int  device_count,
const JoinHashTableInterface::HashType  layout 
)
protected, virtual

Reimplemented in OverlapsJoinHashTable.

Definition at line 214 of file BaselineJoinHashTable.cpp.

References approximateTupleCount(), CHECK, emitted_keys_count_, entry_count_, fetchColumnsForDevice(), get_entries_per_device(), get_inner_query_info(), getInnerTableId(), InputTableInfo::info, layout_, memory_level_, JoinHashTableInterface::OneToMany, only_shards_for_device(), query_infos_, reifyForDevice(), and shardCount().

Referenced by reify(), and ~BaselineJoinHashTable().

216  {
217  layout_ = layout;
218  const auto& query_info = get_inner_query_info(getInnerTableId(), query_infos_).info;
219  if (query_info.fragments.empty()) {
220  return;
221  }
222  std::vector<BaselineJoinHashTable::ColumnsForDevice> columns_per_device;
223  const auto shard_count = shardCount();
224  for (int device_id = 0; device_id < device_count; ++device_id) {
225  const auto fragments =
226  shard_count
227  ? only_shards_for_device(query_info.fragments, device_id, device_count)
228  : query_info.fragments;
229  const auto columns_for_device = fetchColumnsForDevice(fragments, device_id);
230  columns_per_device.push_back(columns_for_device);
231  }
232  if (layout == JoinHashTableInterface::HashType::OneToMany) {
233  CHECK(!columns_per_device.front().join_columns.empty());
234  emitted_keys_count_ = columns_per_device.front().join_columns.front().num_elems;
235  size_t tuple_count;
236  std::tie(tuple_count, std::ignore) = approximateTupleCount(columns_per_device);
237  const auto entry_count = 2 * std::max(tuple_count, size_t(1));
238 
239  entry_count_ =
240  get_entries_per_device(entry_count, shard_count, device_count, memory_level_);
241  }
242  std::vector<std::future<void>> init_threads;
243  for (int device_id = 0; device_id < device_count; ++device_id) {
244  const auto fragments =
245  shard_count
246  ? only_shards_for_device(query_info.fragments, device_id, device_count)
247  : query_info.fragments;
248  init_threads.push_back(std::async(std::launch::async,
249  &BaselineJoinHashTable::reifyForDevice,
250  this,
251  columns_per_device[device_id],
252  layout,
253  device_id));
254  }
255  for (auto& init_thread : init_threads) {
256  init_thread.wait();
257  }
258  for (auto& init_thread : init_threads) {
259  init_thread.get();
260  }
261 }
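
The sizing step for one-to-many layouts doubles the approximate distinct-tuple count (a 50% fill target, with a floor of one entry) before dividing the work across shards or devices. The sketch below restates that rule; it only loosely mirrors get_entries_per_device, so treat the partitioning (and the assumption that device_count is at least 1) as a hypothetical simplification.

#include <algorithm>
#include <cstddef>

std::size_t entries_for_device(const std::size_t approx_tuple_count,
                               const std::size_t shard_count,
                               const std::size_t device_count) {
  // Double the estimate so the table stays at roughly 50% occupancy.
  const std::size_t total = 2 * std::max<std::size_t>(approx_tuple_count, 1);
  // Sharded tables split by shard; otherwise each device gets a slice.
  const std::size_t partitions = shard_count ? shard_count : device_count;
  return (total + partitions - 1) / partitions;  // ceiling division
}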
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ shardCount()

size_t BaselineJoinHashTable::shardCount ( ) const
protected

Definition at line 511 of file BaselineJoinHashTable.cpp.

References condition_, executor_, getShardCountForCondition(), Data_Namespace::GPU_LEVEL, inner_outer_pairs_, and memory_level_.

Referenced by checkHashJoinReplicationConstraint(), getAllColumnFragments(), OverlapsJoinHashTable::reifyWithLayout(), and reifyWithLayout().

511  {
512  if (memory_level_ != Data_Namespace::GPU_LEVEL) {
513  return 0;
514  }
515  return getShardCountForCondition(
516  condition_.get(), executor_, inner_outer_pairs_);
517  }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ yieldCacheInvalidator()

static auto BaselineJoinHashTable::yieldCacheInvalidator ( ) -> std::function<void()>
inline, static

Definition at line 79 of file BaselineJoinHashTable.h.

References hash_table_cache_, and hash_table_cache_mutex_.

79  {
80  return []() -> void {
81  std::lock_guard<std::mutex> guard(hash_table_cache_mutex_);
82  hash_table_cache_.clear();
83  };
84  }
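
Because the invalidator is returned as a plain std::function<void()>, cache-management code can hold and run it without depending on any hash table internals. A brief usage sketch, where drop_cached_join_tables is a hypothetical caller:

#include "BaselineJoinHashTable.h"

void drop_cached_join_tables() {
  // Obtain the closure once; it captures nothing and is safe to reuse.
  static const auto invalidate = BaselineJoinHashTable::yieldCacheInvalidator();
  invalidate();  // locks hash_table_cache_mutex_ and clears hash_table_cache_
}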

Member Data Documentation

◆ catalog_

const Catalog_Namespace::Catalog* BaselineJoinHashTable::catalog_
protected

◆ column_cache_

ColumnCacheMap& BaselineJoinHashTable::column_cache_
protected

Definition at line 234 of file BaselineJoinHashTable.h.

Referenced by fetchColumn(), and getAllColumnFragments().

◆ condition_

const std::shared_ptr< Analyzer::BinOper > BaselineJoinHashTable::condition_
protected

◆ cpu_hash_table_buff_

std::shared_ptr<std::vector<int8_t> > BaselineJoinHashTable::cpu_hash_table_buff_
protected

◆ cpu_hash_table_buff_mutex_

std::mutex BaselineJoinHashTable::cpu_hash_table_buff_mutex_
protected

Definition at line 236 of file BaselineJoinHashTable.h.

Referenced by initHashTableForDevice().

◆ emitted_keys_count_

◆ entry_count_

◆ ERR_FAILED_TO_FETCH_COLUMN

const int BaselineJoinHashTable::ERR_FAILED_TO_FETCH_COLUMN {-3}
static, protected

Definition at line 264 of file BaselineJoinHashTable.h.

Referenced by reifyForDevice().

◆ ERR_FAILED_TO_JOIN_ON_VIRTUAL_COLUMN

const int BaselineJoinHashTable::ERR_FAILED_TO_JOIN_ON_VIRTUAL_COLUMN {-4}
static, protected

Definition at line 265 of file BaselineJoinHashTable.h.

Referenced by reifyForDevice().

◆ executor_

◆ hash_table_cache_

std::vector< std::pair< HashTableCacheKey, HashTableCacheValue > > BaselineJoinHashTable::hash_table_cache_
static, protected

◆ hash_table_cache_mutex_

std::mutex BaselineJoinHashTable::hash_table_cache_mutex_
static, protected

◆ inner_outer_pairs_

std::vector< InnerOuter > BaselineJoinHashTable::inner_outer_pairs_
protected

◆ layout_

JoinHashTableInterface::HashType BaselineJoinHashTable::layout_
protected

◆ linearized_multifrag_column_mutex_

std::mutex BaselineJoinHashTable::linearized_multifrag_column_mutex_
protected

Definition at line 243 of file BaselineJoinHashTable.h.

Referenced by getAllColumnFragments().

◆ linearized_multifrag_column_owner_

RowSetMemoryOwner BaselineJoinHashTable::linearized_multifrag_column_owner_
protected

Definition at line 244 of file BaselineJoinHashTable.h.

Referenced by getAllColumnFragments().

◆ linearized_multifrag_columns_

std::map<LinearizedColumnCacheKey, LinearizedColumn> BaselineJoinHashTable::linearized_multifrag_columns_
protected

Definition at line 242 of file BaselineJoinHashTable.h.

Referenced by getAllColumnFragments().

◆ memory_level_

const Data_Namespace::MemoryLevel BaselineJoinHashTable::memory_level_
protected

◆ query_infos_

const std::vector<InputTableInfo>& BaselineJoinHashTable::query_infos_
protected

The documentation for this class was generated from the following files:

BaselineJoinHashTable.h
BaselineJoinHashTable.cpp