OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BaselineJoinHashTable.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifdef HAVE_CUDA
20 #include <cuda.h>
21 #endif
22 #include <cstdint>
23 #include <map>
24 #include <mutex>
25 #include <thread>
26 #include <unordered_set>
27 #include <vector>
28 
29 #include "Analyzer/Analyzer.h"
30 #include "DataMgr/MemoryLevel.h"
39 
40 class Executor;
41 
42 // Representation for a hash table using the baseline layout: an open-addressing
43 // hash with a fill rate of 50%. It is used for equi-joins on multiple columns and
44 // on single sparse columns (with very wide range), typically big integer. As of
45 // now, such tuples must be unique within the inner table.
47  public:
49  static std::shared_ptr<BaselineJoinHashTable> getInstance(
50  const std::shared_ptr<Analyzer::BinOper> condition,
51  const std::vector<InputTableInfo>& query_infos,
52  const Data_Namespace::MemoryLevel memory_level,
53  const JoinType join_type,
54  const HashType preferred_hash_type,
55  const int device_count,
56  ColumnCacheMap& column_cache,
57  Executor* executor,
58  const HashTableBuildDagMap& hashtable_build_dag_map,
59  const TableIdToNodeMap& table_id_to_node_map);
60 
61  static size_t getShardCountForCondition(
62  const Analyzer::BinOper* condition,
63  const Executor* executor,
64  const std::vector<InnerOuter>& inner_outer_pairs);
65 
66  std::string toString(const ExecutorDeviceType device_type,
67  const int device_id = 0,
68  bool raw = false) const override;
69 
70  std::set<DecodedJoinHashBufferEntry> toSet(const ExecutorDeviceType device_type,
71  const int device_id) const override;
72 
73  llvm::Value* codegenSlot(const CompilationOptions&, const size_t) override;
74 
76  const size_t) override;
77 
78  int getInnerTableId() const noexcept override;
79 
80  int getInnerTableRteIdx() const noexcept override;
81 
82  HashType getHashType() const noexcept override;
83 
85  return memory_level_;
86  };
87 
88  int getDeviceCount() const noexcept override { return device_count_; };  // Accessor: returns the device_count_ member unchanged; the ';' after the body is redundant.
89 
90  size_t offsetBufferOff() const noexcept override;
91 
92  size_t countBufferOff() const noexcept override;
93 
94  size_t payloadBufferOff() const noexcept override;
95 
96  std::string getHashJoinType() const final { return "Baseline"; }  // Always returns the fixed label "Baseline"; marked final, so derived classes cannot override it.
97 
// Builds and returns a std::function<void()> that invalidates both static caches
// of this class. The returned lambda captures nothing; it dereferences the static
// pointers hash_table_layout_cache_ and hash_table_cache_ at call time, so both
// must be non-null when the callable is eventually invoked.
// NOTE(review): the listing numbers jump from 98 to 101, so two source lines are
// missing from this view -- confirm against the real header what they contain.
98  static auto getCacheInvalidator() -> std::function<void()> {
101  return []() -> void {
// First ask the layout (hashing scheme) cache for its invalidator and run it.
102  auto layout_cache_invalidator = hash_table_layout_cache_->getCacheInvalidator();
103  layout_cache_invalidator();
104 
// Then do the same for the main hash table cache.
105  auto main_cache_invalidator = hash_table_cache_->getCacheInvalidator();
106  main_cache_invalidator();
107  };
108  }
109 
112  return hash_table_cache_.get();
113  }
116  return hash_table_layout_cache_.get();
117  }
118 
120 
121  protected:
122  BaselineJoinHashTable(const std::shared_ptr<Analyzer::BinOper> condition,
123  const JoinType join_type,
124  const std::vector<InputTableInfo>& query_infos,
125  const Data_Namespace::MemoryLevel memory_level,
126  ColumnCacheMap& column_cache,
127  Executor* executor,
128  const std::vector<InnerOuter>& inner_outer_pairs,
129  const int device_count,
130  QueryPlanHash hashtable_cache_key,
131  HashtableCacheMetaInfo hashtable_cache_meta_info,
132  const TableIdToNodeMap& table_id_to_node_map);
133 
134  size_t getComponentBufferSize() const noexcept override;
135 
136  size_t getKeyBufferSize() const noexcept;
137 
138  static int getInnerTableId(const std::vector<InnerOuter>& inner_outer_pairs);
139 
140  virtual void reifyWithLayout(const HashType layout);
141 
143  const std::vector<Fragmenter_Namespace::FragmentInfo>& fragments,
144  const int device_id,
145  DeviceAllocator* dev_buff_owner);
146 
147  virtual std::pair<size_t, size_t> approximateTupleCount(
148  const std::vector<ColumnsForDevice>&,
149  QueryPlanHash key,
150  CacheItemType item_type,
151  DeviceIdentifier device_identifier) const;
152 
153  virtual size_t getKeyComponentWidth() const;
154 
155  virtual size_t getKeyComponentCount() const;
156 
157  virtual llvm::Value* codegenKey(const CompilationOptions&);
158 
159  size_t shardCount() const;
160 
161  Data_Namespace::MemoryLevel getEffectiveMemoryLevel(
162  const std::vector<InnerOuter>& inner_outer_pairs) const;
163 
164  void reify(const HashType preferred_layout);
165 
166  virtual void reifyForDevice(const ColumnsForDevice& columns_for_device,
167  const HashType layout,
168  const int device_id,
169  const size_t entry_count,
170  const size_t emitted_keys_count,
171  const logger::ThreadId parent_thread_id);
172 
173  virtual int initHashTableForDevice(
174  const std::vector<JoinColumn>& join_columns,
175  const std::vector<JoinColumnTypeInfo>& join_column_types,
176  const std::vector<JoinBucketInfo>& join_buckets,
177  const HashType layout,
178  const Data_Namespace::MemoryLevel effective_memory_level,
179  const size_t entry_count,
180  const size_t emitted_keys_count,
181  const int device_id);
182 
183  llvm::Value* hashPtr(const size_t index);
184 
185  std::shared_ptr<HashTable> initHashTableOnCpuFromCache(
186  QueryPlanHash key,
187  CacheItemType item_type,
188  DeviceIdentifier device_identifier);
189 
191  CacheItemType item_type,
192  std::shared_ptr<HashTable> hashtable_ptr,
193  DeviceIdentifier device_identifier,
194  size_t hashtable_building_time);
195 
196  std::pair<std::optional<size_t>, size_t> getApproximateTupleCountFromCache(
197  QueryPlanHash key,
198  CacheItemType item_type,
199  DeviceIdentifier device_identifier) const;
200 
201  bool isBitwiseEq() const;
202 
204  std::vector<InnerOuter> inner_outer_pairs;
205  const size_t num_elements;
206  const SQLOps optype;
208  };
209 
212  auto hash = boost::hash_value(::toString(info.optype));
213  for (InnerOuter inner_outer : info.inner_outer_pairs) {
214  auto inner_col = inner_outer.first;
215  auto rhs_col_var = dynamic_cast<const Analyzer::ColumnVar*>(inner_outer.second);
216  auto outer_col = rhs_col_var ? rhs_col_var : inner_col;
217  boost::hash_combine(hash, inner_col->toString());
218  if (inner_col->get_type_info().is_string()) {
219  boost::hash_combine(hash, outer_col->toString());
220  }
221  }
222  boost::hash_combine(hash, info.num_elements);
223  boost::hash_combine(hash, ::toString(info.join_type));
224  return hash;
225  }
226 
227  const std::shared_ptr<Analyzer::BinOper> condition_;
229  const std::vector<InputTableInfo>& query_infos_;
231  Executor* executor_;
234 
235  std::vector<InnerOuter> inner_outer_pairs_;
237  const int device_count_;
239  std::optional<HashType>
240  layout_override_; // allows us to use a 1:many hash table for many:many
241 
245 
246  static std::unique_ptr<HashtableRecycler> hash_table_cache_;
247  static std::unique_ptr<HashingSchemeRecycler> hash_table_layout_cache_;
248 };
Defines data structures for the semantic analysis phase of query processing.
size_t offsetBufferOff() const noexcept override
std::set< DecodedJoinHashBufferEntry > toSet(const ExecutorDeviceType device_type, const int device_id) const override
virtual std::pair< size_t, size_t > approximateTupleCount(const std::vector< ColumnsForDevice > &, QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier) const
void putHashTableOnCpuToCache(QueryPlanHash key, CacheItemType item_type, std::shared_ptr< HashTable > hashtable_ptr, DeviceIdentifier device_identifier, size_t hashtable_building_time)
size_t DeviceIdentifier
Definition: DataRecycler.h:111
JoinType
Definition: sqldefs.h:108
std::string toString(const ExecutorDeviceType device_type, const int device_id=0, bool raw=false) const override
class for a per-database catalog. also includes metadata for the current database and the current use...
Definition: Catalog.h:113
std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > InnerOuter
Definition: HashJoin.h:77
Data_Namespace::MemoryLevel getEffectiveMemoryLevel(const std::vector< InnerOuter > &inner_outer_pairs) const
ExecutorDeviceType
#define const
HashJoinMatchingSet codegenMatchingSet(const CompilationOptions &, const size_t) override
SQLOps
Definition: sqldefs.h:29
size_t getKeyBufferSize() const noexcept
std::pair< std::optional< size_t >, size_t > getApproximateTupleCountFromCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier) const
const TableIdToNodeMap table_id_to_node_map_
size_t getComponentBufferSize() const noexcept override
int getInnerTableRteIdx() const noexcept override
virtual void reifyForDevice(const ColumnsForDevice &columns_for_device, const HashType layout, const int device_id, const size_t entry_count, const size_t emitted_keys_count, const logger::ThreadId parent_thread_id)
virtual ColumnsForDevice fetchColumnsForDevice(const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, DeviceAllocator *dev_buff_owner)
const std::vector< InputTableInfo > & query_infos_
virtual llvm::Value * codegenKey(const CompilationOptions &)
std::shared_ptr< HashTable > initHashTableOnCpuFromCache(QueryPlanHash key, CacheItemType item_type, DeviceIdentifier device_identifier)
size_t payloadBufferOff() const noexcept override
std::vector< InnerOuter > inner_outer_pairs_
void reify(const HashType preferred_layout)
HashType getHashType() const noexcept override
static QueryPlanHash getAlternativeCacheKey(AlternativeCacheKeyForBaselineHashJoin &info)
CacheItemType
Definition: DataRecycler.h:36
static std::unique_ptr< HashtableRecycler > hash_table_cache_
ColumnCacheMap & column_cache_
std::unordered_map< int, const RelAlgNode * > TableIdToNodeMap
std::string getHashJoinType() const final
Data_Namespace::MemoryLevel getMemoryLevel() const noexcept override
BaselineJoinHashTable(const std::shared_ptr< Analyzer::BinOper > condition, const JoinType join_type, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, ColumnCacheMap &column_cache, Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs, const int device_count, QueryPlanHash hashtable_cache_key, HashtableCacheMetaInfo hashtable_cache_meta_info, const TableIdToNodeMap &table_id_to_node_map)
HashtableCacheMetaInfo hashtable_cache_meta_info_
virtual void reifyWithLayout(const HashType layout)
std::unordered_map< int, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
Executor(const ExecutorId id, Data_Namespace::DataMgr *data_mgr, const size_t block_size_x, const size_t grid_size_x, const size_t max_gpu_slab_size, const std::string &debug_dir, const std::string &debug_file)
Definition: Execute.cpp:155
static std::shared_ptr< BaselineJoinHashTable > getInstance(const std::shared_ptr< Analyzer::BinOper > condition, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const TableIdToNodeMap &table_id_to_node_map)
Make hash table from an in-flight SQL query's parse tree etc.
int getInnerTableId() const noexcept override
static HashingSchemeRecycler * getHashingSchemeCache()
std::optional< HashType > layout_override_
const Catalog_Namespace::Catalog * catalog_
uint64_t ThreadId
Definition: Logger.h:345
size_t QueryPlanHash
const Data_Namespace::MemoryLevel memory_level_
llvm::Value * hashPtr(const size_t index)
virtual int initHashTableForDevice(const std::vector< JoinColumn > &join_columns, const std::vector< JoinColumnTypeInfo > &join_column_types, const std::vector< JoinBucketInfo > &join_buckets, const HashType layout, const Data_Namespace::MemoryLevel effective_memory_level, const size_t entry_count, const size_t emitted_keys_count, const int device_id)
llvm::Value * codegenSlot(const CompilationOptions &, const size_t) override
#define CHECK(condition)
Definition: Logger.h:209
static HashtableRecycler * getHashTableCache()
virtual size_t getKeyComponentCount() const
virtual size_t getKeyComponentWidth() const
static size_t getShardCountForCondition(const Analyzer::BinOper *condition, const Executor *executor, const std::vector< InnerOuter > &inner_outer_pairs)
static std::unique_ptr< HashingSchemeRecycler > hash_table_layout_cache_
int getDeviceCount() const noexcept override
static auto getCacheInvalidator() -> std::function< void()>
HashType
Definition: HashTable.h:19
const std::shared_ptr< Analyzer::BinOper > condition_
size_t countBufferOff() const noexcept override
std::unordered_map< JoinColumnsInfo, HashTableBuildDag > HashTableBuildDagMap