OmniSciDB  16c4e035a1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HashJoin.h
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <llvm/IR/Value.h>
20 #include <cstdint>
21 #include <set>
22 #include <string>
23 
24 #include "Analyzer/Analyzer.h"
33 
34 class TooManyHashEntries : public std::runtime_error {
35  public:
37  : std::runtime_error("Hash tables with more than 2B entries not supported yet") {}
38 
39  TooManyHashEntries(const std::string& reason) : std::runtime_error(reason) {}
40 };
41 
42 class TableMustBeReplicated : public std::runtime_error {
43  public:
44  TableMustBeReplicated(const std::string& table_name)
45  : std::runtime_error("Hash join failed: Table '" + table_name +
46  "' must be replicated.") {}
47 };
48 
49 class HashJoinFail : public std::runtime_error {
50  public:
51  HashJoinFail(const std::string& reason) : std::runtime_error(reason) {}
52 };
53 
55  public:
56  NeedsOneToManyHash() : HashJoinFail("Needs one to many hash") {}
57 };
58 
60  public:
62  : HashJoinFail("Not enough memory for columns involved in join") {}
63 };
64 
66  public:
67  FailedToJoinOnVirtualColumn() : HashJoinFail("Cannot join on rowid") {}
68 };
69 
71  public:
72  OverlapsHashTableTooBig(const size_t overlaps_hash_table_max_bytes)
73  : HashJoinFail(
74  "Could not create overlaps hash table with less than max allowed size of " +
75  std::to_string(overlaps_hash_table_max_bytes) + " bytes") {}
76 };
77 
78 class HashingTypeHintDetected : public std::runtime_error {
79  public:
81  : std::runtime_error("Detect hashing scheme query hint: " +
82  HashTablePropertyRecycler::getHashingString(hashing_type)) {
83  hashing_type_ = hashing_type;
84  }
85 
87 };
88 
89 using InnerOuter = std::pair<const Analyzer::ColumnVar*, const Analyzer::Expr*>;
90 
92  const std::vector<JoinColumn> join_columns;
93  const std::vector<JoinColumnTypeInfo> join_column_types;
94  const std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
95  std::vector<JoinBucketInfo> join_buckets;
96  const std::vector<std::shared_ptr<void>> malloc_owner;
97 
98  void setBucketInfo(const std::vector<double>& bucket_sizes_for_dimension,
99  const std::vector<InnerOuter> inner_outer_pairs);
100 };
101 
103  llvm::Value* elements;
104  llvm::Value* count;
105  llvm::Value* slot;
106 };
107 
109  std::vector<const void*> sd_inner_proxy_per_key;
110  std::vector<const void*> sd_outer_proxy_per_key;
111  std::vector<ChunkKey> cache_key_chunks; // used for the cache key
112 };
113 
114 class DeviceAllocator;
115 
116 class HashJoin {
117  public:
118  virtual std::string toString(const ExecutorDeviceType device_type,
119  const int device_id = 0,
120  bool raw = false) const = 0;
121 
122  virtual std::string toStringFlat64(const ExecutorDeviceType device_type,
123  const int device_id) const;
124 
125  virtual std::string toStringFlat32(const ExecutorDeviceType device_type,
126  const int device_id) const;
127 
128  virtual DecodedJoinHashBufferSet toSet(const ExecutorDeviceType device_type,
129  const int device_id) const = 0;
130 
131  virtual llvm::Value* codegenSlot(const CompilationOptions&, const size_t) = 0;
132 
134  const size_t) = 0;
135 
136  virtual int getInnerTableId() const noexcept = 0;
137 
138  virtual int getInnerTableRteIdx() const noexcept = 0;
139 
140  virtual HashType getHashType() const noexcept = 0;
141 
142  virtual const RegisteredQueryHint& getRegisteredQueryHint() = 0;
143 
144  virtual void registerQueryHint(const RegisteredQueryHint& query_hint) = 0;
145 
146  static bool layoutRequiresAdditionalBuffers(HashType layout) noexcept {
147  return (layout == HashType::ManyToMany || layout == HashType::OneToMany);
148  }
149 
150  static std::string getHashTypeString(HashType ht) noexcept {
151  const char* HashTypeStrings[3] = {"OneToOne", "OneToMany", "ManyToMany"};
152  return HashTypeStrings[static_cast<int>(ht)];
153  };
154 
156  const std::vector<llvm::Value*>& hash_join_idx_args_in,
157  const bool is_sharded,
158  const bool col_is_nullable,
159  const bool is_bw_eq,
160  const int64_t sub_buff_size,
161  Executor* executor,
162  const bool is_bucketized = false);
163 
164  static llvm::Value* codegenHashTableLoad(const size_t table_idx, Executor* executor);
165 
166  virtual Data_Namespace::MemoryLevel getMemoryLevel() const noexcept = 0;
167 
168  virtual int getDeviceCount() const noexcept = 0;
169 
170  virtual size_t offsetBufferOff() const noexcept = 0;
171 
172  virtual size_t countBufferOff() const noexcept = 0;
173 
174  virtual size_t payloadBufferOff() const noexcept = 0;
175 
176  virtual std::string getHashJoinType() const = 0;
177 
178  virtual bool isBitwiseEq() const = 0;
179 
181  const Analyzer::ColumnVar* hash_col,
182  const std::vector<Fragmenter_Namespace::FragmentInfo>& fragment_info,
183  const Data_Namespace::MemoryLevel effective_memory_level,
184  const int device_id,
185  std::vector<std::shared_ptr<Chunk_NS::Chunk>>& chunks_owner,
186  DeviceAllocator* dev_buff_owner,
187  std::vector<std::shared_ptr<void>>& malloc_owner,
188  Executor* executor,
189  ColumnCacheMap* column_cache);
190 
192  static std::shared_ptr<HashJoin> getInstance(
193  const std::shared_ptr<Analyzer::BinOper> qual_bin_oper,
194  const std::vector<InputTableInfo>& query_infos,
195  const Data_Namespace::MemoryLevel memory_level,
196  const JoinType join_type,
197  const HashType preferred_hash_type,
198  const int device_count,
199  ColumnCacheMap& column_cache,
200  Executor* executor,
201  const HashTableBuildDagMap& hashtable_build_dag_map,
202  const RegisteredQueryHint& query_hint,
203  const TableIdToNodeMap& table_id_to_node_map);
204 
206  static std::shared_ptr<HashJoin> getSyntheticInstance(
207  std::string_view table1,
208  std::string_view column1,
209  std::string_view table2,
210  std::string_view column2,
211  const Data_Namespace::MemoryLevel memory_level,
212  const HashType preferred_hash_type,
213  const int device_count,
214  ColumnCacheMap& column_cache,
215  Executor* executor);
216 
218  static std::shared_ptr<HashJoin> getSyntheticInstance(
219  const std::shared_ptr<Analyzer::BinOper> qual_bin_oper,
220  const Data_Namespace::MemoryLevel memory_level,
221  const HashType preferred_hash_type,
222  const int device_count,
223  ColumnCacheMap& column_cache,
224  Executor* executor);
225 
226  static std::pair<std::string, std::shared_ptr<HashJoin>> getSyntheticInstance(
227  std::vector<std::shared_ptr<Analyzer::BinOper>>,
228  const Data_Namespace::MemoryLevel memory_level,
229  const HashType preferred_hash_type,
230  const int device_count,
231  ColumnCacheMap& column_cache,
232  Executor* executor);
233 
234  static int getInnerTableId(const std::vector<InnerOuter>& inner_outer_pairs) {
235  CHECK(!inner_outer_pairs.empty());
236  const auto first_inner_col = inner_outer_pairs.front().first;
237  return first_inner_col->get_table_id();
238  }
239 
240  static void checkHashJoinReplicationConstraint(const int table_id,
241  const size_t shard_count,
242  const Executor* executor);
243 
244  // Swap the columns if needed and make the inner column the first component.
246  const Analyzer::Expr* rhs,
248  const TemporaryTables* temporary_tables,
249  const bool is_overlaps_join = false);
250 
251  // Normalize each expression tuple
252  static std::vector<InnerOuter> normalizeColumnPairs(
253  const Analyzer::BinOper* condition,
255  const TemporaryTables* temporary_tables);
256 
257  static void invalidateCache() {
259  hash_table_property_cache_->clearCache();
260  }
261 
262  static void markCachedItemAsDirty(size_t table_key) {
264  auto candidate_table_keys =
265  hash_table_property_cache_->getMappedQueryPlanDagsWithTableKey(table_key);
266  if (candidate_table_keys.has_value()) {
267  hash_table_property_cache_->markCachedItemAsDirty(
268  table_key,
269  *candidate_table_keys,
272  }
273  }
274 
275  HashTable* getHashTableForDevice(const size_t device_id) const {
276  CHECK_LT(device_id, hash_tables_for_device_.size());
277  return hash_tables_for_device_[device_id].get();
278  }
279 
280  size_t getJoinHashBufferSize(const ExecutorDeviceType device_type) {
281  CHECK(device_type == ExecutorDeviceType::CPU);
282  return getJoinHashBufferSize(device_type, 0);
283  }
284 
285  size_t getJoinHashBufferSize(const ExecutorDeviceType device_type,
286  const int device_id) const {
287  auto hash_table = getHashTableForDevice(device_id);
288  if (!hash_table) {
289  return 0;
290  }
291  return hash_table->getHashTableBufferSize(device_type);
292  }
293 
294  int8_t* getJoinHashBuffer(const ExecutorDeviceType device_type,
295  const int device_id) const {
296  // TODO: just make device_id a size_t
297  CHECK_LT(size_t(device_id), hash_tables_for_device_.size());
298  if (!hash_tables_for_device_[device_id]) {
299  return nullptr;
300  }
301  CHECK(hash_tables_for_device_[device_id]);
302  auto hash_table = hash_tables_for_device_[device_id].get();
303 #ifdef HAVE_CUDA
304  if (device_type == ExecutorDeviceType::CPU) {
305  return hash_table->getCpuBuffer();
306  } else {
307  CHECK(hash_table);
308  const auto gpu_buff = hash_table->getGpuBuffer();
309  return gpu_buff;
310  }
311 #else
312  CHECK(device_type == ExecutorDeviceType::CPU);
313  return hash_table->getCpuBuffer();
314 #endif
315  }
316 
318  auto empty_hash_tables =
320  hash_tables_for_device_.swap(empty_hash_tables);
321  }
322 
324  const std::vector<InnerOuter>& inner_outer_pairs,
325  const Executor* executor);
326 
329  return hash_table_property_cache_.get();
330  }
331 
332  protected:
333  virtual size_t getComponentBufferSize() const noexcept = 0;
334 
335  std::vector<std::shared_ptr<HashTable>> hash_tables_for_device_;
337 };
338 
339 std::ostream& operator<<(std::ostream& os, const DecodedJoinHashBufferEntry& e);
340 
341 std::ostream& operator<<(std::ostream& os, const DecodedJoinHashBufferSet& s);
342 
343 std::shared_ptr<Analyzer::ColumnVar> getSyntheticColumnVar(std::string_view table,
344  std::string_view column,
345  int rte_idx,
346  Executor* executor);
347 
348 size_t get_shard_count(const Analyzer::BinOper* join_condition, const Executor* executor);
349 
350 size_t get_shard_count(
351  std::pair<const Analyzer::ColumnVar*, const Analyzer::Expr*> equi_pair,
352  const Executor* executor);
Defines data structures for the semantic analysis phase of query processing.
virtual int getInnerTableRteIdx() const noexcept=0
virtual size_t payloadBufferOff() const noexcept=0
virtual std::string getHashJoinType() const =0
virtual HashJoinMatchingSet codegenMatchingSet(const CompilationOptions &, const size_t)=0
JoinType
Definition: sqldefs.h:108
std::string cat(Ts &&...args)
static llvm::Value * codegenHashTableLoad(const size_t table_idx, Executor *executor)
Definition: HashJoin.cpp:219
class for a per-database catalog. also includes metadata for the current database and the current use...
Definition: Catalog.h:114
std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > InnerOuter
Definition: HashJoin.h:89
virtual HashType getHashType() const noexcept=0
ExecutorDeviceType
std::vector< ChunkKey > cache_key_chunks
Definition: HashJoin.h:111
std::vector< const void * > sd_inner_proxy_per_key
Definition: HashJoin.h:109
virtual int getDeviceCount() const noexcept=0
virtual std::string toStringFlat64(const ExecutorDeviceType device_type, const int device_id) const
Definition: HashJoin.cpp:114
#define const
void setBucketInfo(const std::vector< double > &bucket_sizes_for_dimension, const std::vector< InnerOuter > inner_outer_pairs)
Definition: HashJoin.cpp:34
JoinColumn fetchJoinColumn(const Analyzer::ColumnVar *hash_col, const std::vector< Fragmenter_Namespace::FragmentInfo > &fragment_info, const Data_Namespace::MemoryLevel effective_memory_level, const int device_id, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, DeviceAllocator *dev_buff_owner, std::vector< std::shared_ptr< void >> &malloc_owner, Executor *executor, ColumnCacheMap *column_cache)
Definition: HashJoin.cpp:56
llvm::Value * elements
Definition: HashJoin.h:103
llvm::Value * count
Definition: HashJoin.h:104
virtual Data_Namespace::MemoryLevel getMemoryLevel() const noexcept=0
std::vector< std::shared_ptr< HashTable > > hash_tables_for_device_
Definition: HashJoin.h:335
static HashTablePropertyRecycler * getHashTablePropertyCache()
Definition: HashJoin.h:327
Definition: HashTable.h:21
HashTableHashingType hashing_type_
Definition: HashJoin.h:86
OverlapsHashTableTooBig(const size_t overlaps_hash_table_max_bytes)
Definition: HashJoin.h:72
virtual llvm::Value * codegenSlot(const CompilationOptions &, const size_t)=0
TableMustBeReplicated(const std::string &table_name)
Definition: HashJoin.h:44
void freeHashBufferMemory()
Definition: HashJoin.h:317
virtual void registerQueryHint(const RegisteredQueryHint &query_hint)=0
virtual size_t offsetBufferOff() const noexcept=0
HashingTypeHintDetected(HashTableHashingType hashing_type)
Definition: HashJoin.h:80
virtual std::string toStringFlat32(const ExecutorDeviceType device_type, const int device_id) const
Definition: HashJoin.cpp:119
std::string to_string(char const *&&v)
virtual size_t countBufferOff() const noexcept=0
std::unordered_map< int, const ResultSetPtr & > TemporaryTables
Definition: InputMetadata.h:31
const std::vector< JoinColumnTypeInfo > join_column_types
Definition: HashJoin.h:93
HashJoinFail(const std::string &reason)
Definition: HashJoin.h:51
static void invalidateCache()
Definition: HashJoin.h:257
std::vector< const void * > sd_outer_proxy_per_key
Definition: HashJoin.h:110
std::unordered_map< int, const RelAlgNode * > TableIdToNodeMap
static std::unique_ptr< HashTablePropertyRecycler > hash_table_property_cache_
Definition: HashJoin.h:336
int8_t * getJoinHashBuffer(const ExecutorDeviceType device_type, const int device_id) const
Definition: HashJoin.h:294
virtual int getInnerTableId() const noexcept=0
size_t getJoinHashBufferSize(const ExecutorDeviceType device_type, const int device_id) const
Definition: HashJoin.h:285
virtual size_t getComponentBufferSize() const noexcept=0
static void checkHashJoinReplicationConstraint(const int table_id, const size_t shard_count, const Executor *executor)
Definition: HashJoin.cpp:645
const std::vector< std::shared_ptr< Chunk_NS::Chunk > > chunks_owner
Definition: HashJoin.h:94
HashTableHashingType
Definition: QueryHint.h:45
std::unordered_map< int, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
HashTable * getHashTableForDevice(const size_t device_id) const
Definition: HashJoin.h:275
static CompositeKeyInfo getCompositeKeyInfo(const std::vector< InnerOuter > &inner_outer_pairs, const Executor *executor)
Definition: HashJoin.cpp:392
#define CHECK_LT(x, y)
Definition: Logger.h:221
TooManyHashEntries(const std::string &reason)
Definition: HashJoin.h:39
static std::string getHashTypeString(HashType ht) noexcept
Definition: HashJoin.h:150
size_t getJoinHashBufferSize(const ExecutorDeviceType device_type)
Definition: HashJoin.h:280
std::set< DecodedJoinHashBufferEntry > DecodedJoinHashBufferSet
Definition: HashTable.h:34
virtual const RegisteredQueryHint & getRegisteredQueryHint()=0
static InnerOuter normalizeColumnPair(const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join=false)
Definition: HashJoin.cpp:661
static std::vector< InnerOuter > normalizeColumnPairs(const Analyzer::BinOper *condition, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
Definition: HashJoin.cpp:802
#define CHECK(condition)
Definition: Logger.h:211
llvm::Value * slot
Definition: HashJoin.h:105
FileBuffer Chunk
A Chunk is the fundamental unit of execution in Map-D.
Definition: FileMgr.h:74
const std::vector< std::shared_ptr< void > > malloc_owner
Definition: HashJoin.h:96
static std::shared_ptr< HashJoin > getSyntheticInstance(std::string_view table1, std::string_view column1, std::string_view table2, std::string_view column2, const Data_Namespace::MemoryLevel memory_level, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor)
Make hash table from named tables and columns (such as for testing).
Definition: HashJoin.cpp:541
virtual DecodedJoinHashBufferSet toSet(const ExecutorDeviceType device_type, const int device_id) const =0
std::vector< JoinBucketInfo > join_buckets
Definition: HashJoin.h:95
static constexpr DeviceIdentifier CPU_DEVICE_IDENTIFIER
Definition: DataRecycler.h:133
static void markCachedItemAsDirty(size_t table_key)
Definition: HashJoin.h:262
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)
Definition: HashJoin.cpp:848
static std::shared_ptr< HashJoin > getInstance(const std::shared_ptr< Analyzer::BinOper > qual_bin_oper, const std::vector< InputTableInfo > &query_infos, const Data_Namespace::MemoryLevel memory_level, const JoinType join_type, const HashType preferred_hash_type, const int device_count, ColumnCacheMap &column_cache, Executor *executor, const HashTableBuildDagMap &hashtable_build_dag_map, const RegisteredQueryHint &query_hint, const TableIdToNodeMap &table_id_to_node_map)
Make hash table from an in-flight SQL query&#39;s parse tree etc.
Definition: HashJoin.cpp:245
virtual std::string toString(const ExecutorDeviceType device_type, const int device_id=0, bool raw=false) const =0
std::shared_ptr< Analyzer::ColumnVar > getSyntheticColumnVar(std::string_view table, std::string_view column, int rte_idx, Executor *executor)
Definition: HashJoin.cpp:429
HashType
Definition: HashTable.h:19
const std::vector< JoinColumn > join_columns
Definition: HashJoin.h:92
static bool layoutRequiresAdditionalBuffers(HashType layout) noexcept
Definition: HashJoin.h:146
virtual bool isBitwiseEq() const =0
std::unordered_map< JoinColumnsInfo, HashTableBuildDag > HashTableBuildDagMap