OmniSciDB  95562058bd
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
JoinHashTable.h File Reference
#include "Analyzer/Analyzer.h"
#include "Catalog/Catalog.h"
#include "DataMgr/Allocators/ThrustAllocator.h"
#include "DataMgr/Chunk/Chunk.h"
#include "QueryEngine/ColumnarResults.h"
#include "QueryEngine/Descriptors/InputDescriptors.h"
#include "QueryEngine/Descriptors/RowSetMemoryOwner.h"
#include "QueryEngine/ExpressionRange.h"
#include "QueryEngine/InputMetadata.h"
#include "QueryEngine/JoinHashTable/JoinHashTableInterface.h"
#include <llvm/IR/Value.h>
#include <functional>
#include <memory>
#include <mutex>
#include <stdexcept>
+ Include dependency graph for JoinHashTable.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  JoinHashTable
 
struct  JoinHashTable::JoinHashTableCacheKey
 

Functions

size_t get_shard_count (const Analyzer::BinOper *join_condition, const Executor *executor)
 
size_t get_shard_count (std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > equi_pair, const Executor *executor)
 
bool needs_dictionary_translation (const Analyzer::ColumnVar *inner_col, const Analyzer::Expr *outer_col, const Executor *executor)
 
InnerOuter normalize_column_pair (const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join=false)
 
std::vector< InnerOuternormalize_column_pairs (const Analyzer::BinOper *condition, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
 
std::vector
< Fragmenter_Namespace::FragmentInfo
only_shards_for_device (const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, const int device_count)
 
const InputTableInfoget_inner_query_info (const int inner_table_id, const std::vector< InputTableInfo > &query_infos)
 
size_t get_entries_per_device (const size_t total_entries, const size_t shard_count, const size_t device_count, const Data_Namespace::MemoryLevel memory_level)
 

Function Documentation

size_t get_entries_per_device ( const size_t  total_entries,
const size_t  shard_count,
const size_t  device_count,
const Data_Namespace::MemoryLevel  memory_level 
)

Definition at line 1607 of file JoinHashTable.cpp.

References CHECK_GT, and Data_Namespace::GPU_LEVEL.

Referenced by OverlapsJoinHashTable::calculateCounts(), BaselineJoinHashTable::getInstance(), OverlapsJoinHashTable::getInstance(), and BaselineJoinHashTable::reifyWithLayout().

1610  {
1611  const auto entries_per_shard =
1612  shard_count ? (total_entries + shard_count - 1) / shard_count : total_entries;
1613  size_t entries_per_device = entries_per_shard;
1614  if (memory_level == Data_Namespace::GPU_LEVEL && shard_count) {
1615  const auto shards_per_device = (shard_count + device_count - 1) / device_count;
1616  CHECK_GT(shards_per_device, 0u);
1617  entries_per_device = entries_per_shard * shards_per_device;
1618  }
1619  return entries_per_device;
1620 }
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the caller graph for this function:

const InputTableInfo& get_inner_query_info ( const int  inner_table_id,
const std::vector< InputTableInfo > &  query_infos 
)

Definition at line 1593 of file JoinHashTable.cpp.

References CHECK.

Referenced by JoinHashTable::getInnerQueryInfo(), OverlapsJoinHashTable::getInstance(), BaselineJoinHashTable::getInstance(), OverlapsJoinHashTable::reifyWithLayout(), and BaselineJoinHashTable::reifyWithLayout().

1595  {
1596  std::optional<size_t> ti_idx;
1597  for (size_t i = 0; i < query_infos.size(); ++i) {
1598  if (inner_table_id == query_infos[i].table_id) {
1599  ti_idx = i;
1600  break;
1601  }
1602  }
1603  CHECK(ti_idx);
1604  return query_infos[*ti_idx];
1605 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

size_t get_shard_count ( const Analyzer::BinOper join_condition,
const Executor executor 
)

Definition at line 264 of file JoinHashTable.cpp.

References anonymous_namespace{JoinHashTable.cpp}::get_cols(), and get_shard_count().

Referenced by JoinHashTable::checkHashJoinReplicationConstraint(), get_shard_count(), BaselineJoinHashTable::getShardCountForCondition(), JoinHashTable::initOneToManyHashTable(), JoinHashTable::shardCount(), and Executor::skipFragmentPair().

265  {
266  const Analyzer::ColumnVar* inner_col{nullptr};
267  const Analyzer::Expr* outer_col{nullptr};
268  std::shared_ptr<Analyzer::BinOper> redirected_bin_oper;
269  try {
270  std::tie(inner_col, outer_col) =
271  get_cols(join_condition, *executor->getCatalog(), executor->getTemporaryTables());
272  } catch (...) {
273  return 0;
274  }
275  if (!inner_col || !outer_col) {
276  return 0;
277  }
278  return get_shard_count({inner_col, outer_col}, executor);
279 }
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)
std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > get_cols(const Analyzer::BinOper *qual_bin_oper, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t get_shard_count ( std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * >  equi_pair,
const Executor executor 
)

Definition at line 302 of file JoinHashTable.cpp.

References CHECK, and anonymous_namespace{JoinHashTable.cpp}::shard_count_less_or_equal_device_count().

304  {
305  const auto inner_col = equi_pair.first;
306  const auto outer_col = dynamic_cast<const Analyzer::ColumnVar*>(equi_pair.second);
307  if (!outer_col || inner_col->get_table_id() < 0 || outer_col->get_table_id() < 0) {
308  return 0;
309  }
310  if (outer_col->get_rte_idx()) {
311  return 0;
312  }
313  if (inner_col->get_type_info() != outer_col->get_type_info()) {
314  return 0;
315  }
316  const auto catalog = executor->getCatalog();
317  const auto inner_td = catalog->getMetadataForTable(inner_col->get_table_id());
318  CHECK(inner_td);
319  const auto outer_td = catalog->getMetadataForTable(outer_col->get_table_id());
320  CHECK(outer_td);
321  if (inner_td->shardedColumnId == 0 || outer_td->shardedColumnId == 0 ||
322  inner_td->nShards != outer_td->nShards) {
323  return 0;
324  }
325  if (!shard_count_less_or_equal_device_count(inner_td->tableId, executor)) {
326  return 0;
327  }
328  // The two columns involved must be the ones on which the tables have been sharded on.
329  return (inner_td->shardedColumnId == inner_col->get_column_id() &&
330  outer_td->shardedColumnId == outer_col->get_column_id()) ||
331  (outer_td->shardedColumnId == inner_col->get_column_id() &&
332  inner_td->shardedColumnId == inner_col->get_column_id())
333  ? inner_td->nShards
334  : 0;
335 }
bool shard_count_less_or_equal_device_count(const int inner_table_id, const Executor *executor)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

bool needs_dictionary_translation ( const Analyzer::ColumnVar inner_col,
const Analyzer::Expr outer_col,
const Executor executor 
)

Definition at line 447 of file JoinHashTable.cpp.

References CHECK, CHECK_EQ, get_column_descriptor_maybe(), Analyzer::ColumnVar::get_column_id(), get_column_type(), and Analyzer::ColumnVar::get_table_id().

Referenced by BaselineJoinHashTable::getEffectiveMemoryLevel(), JoinHashTable::reifyOneToManyForDevice(), and JoinHashTable::reifyOneToOneForDevice().

449  {
450  const auto catalog = executor->getCatalog();
451  CHECK(catalog);
452  const auto inner_cd = get_column_descriptor_maybe(
453  inner_col->get_column_id(), inner_col->get_table_id(), *catalog);
454  const auto& inner_ti = get_column_type(inner_col->get_column_id(),
455  inner_col->get_table_id(),
456  inner_cd,
457  executor->getTemporaryTables());
458  // Only strings may need dictionary translation.
459  if (!inner_ti.is_string()) {
460  return false;
461  }
462  const auto outer_col = dynamic_cast<const Analyzer::ColumnVar*>(outer_col_expr);
463  CHECK(outer_col);
464  const auto outer_cd = get_column_descriptor_maybe(
465  outer_col->get_column_id(), outer_col->get_table_id(), *catalog);
466  // Don't want to deal with temporary tables for now, require translation.
467  if (!inner_cd || !outer_cd) {
468  return true;
469  }
470  const auto& outer_ti = get_column_type(outer_col->get_column_id(),
471  outer_col->get_table_id(),
472  outer_cd,
473  executor->getTemporaryTables());
474  CHECK_EQ(inner_ti.is_string(), outer_ti.is_string());
475  // If the two columns don't share the dictionary, translation is needed.
476  return outer_ti.get_comp_param() != inner_ti.get_comp_param();
477 }
int get_table_id() const
Definition: Analyzer.h:194
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const SQLTypeInfo get_column_type(const int col_id, const int table_id, const ColumnDescriptor *cd, const TemporaryTables *temporary_tables)
Definition: Execute.h:199
const ColumnDescriptor * get_column_descriptor_maybe(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:183
#define CHECK(condition)
Definition: Logger.h:197
int get_column_id() const
Definition: Analyzer.h:195

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

InnerOuter normalize_column_pair ( const Analyzer::Expr lhs,
const Analyzer::Expr rhs,
const Catalog_Namespace::Catalog cat,
const TemporaryTables temporary_tables,
const bool  is_overlaps_join = false 
)

Definition at line 41 of file JoinHashTable.cpp.

References cat(), get_column_descriptor_maybe(), get_column_type(), Analyzer::Expr::get_type_info(), kCAST, kENCODING_DICT, kPOINT, and ScalarExprVisitor< T >::visit().

Referenced by anonymous_namespace{JoinHashTable.cpp}::get_cols(), and normalize_column_pairs().

45  {
46  const auto& lhs_ti = lhs->get_type_info();
47  const auto& rhs_ti = rhs->get_type_info();
48  if (!is_overlaps_join) {
49  if (lhs_ti.get_type() != rhs_ti.get_type()) {
50  throw HashJoinFail("Equijoin types must be identical, found: " +
51  lhs_ti.get_type_name() + ", " + rhs_ti.get_type_name());
52  }
53  if (!lhs_ti.is_integer() && !lhs_ti.is_time() && !lhs_ti.is_string() &&
54  !lhs_ti.is_decimal()) {
55  throw HashJoinFail("Cannot apply hash join to inner column type " +
56  lhs_ti.get_type_name());
57  }
58  // Decimal types should be identical.
59  if (lhs_ti.is_decimal() && (lhs_ti.get_scale() != rhs_ti.get_scale() ||
60  lhs_ti.get_precision() != rhs_ti.get_precision())) {
61  throw HashJoinFail("Equijoin with different decimal types");
62  }
63  }
64 
65  const auto lhs_cast = dynamic_cast<const Analyzer::UOper*>(lhs);
66  const auto rhs_cast = dynamic_cast<const Analyzer::UOper*>(rhs);
67  if (lhs_ti.is_string() && (static_cast<bool>(lhs_cast) != static_cast<bool>(rhs_cast) ||
68  (lhs_cast && lhs_cast->get_optype() != kCAST) ||
69  (rhs_cast && rhs_cast->get_optype() != kCAST))) {
70  throw HashJoinFail("Cannot use hash join for given expression");
71  }
72  // Casts to decimal are not suported.
73  if (lhs_ti.is_decimal() && (lhs_cast || rhs_cast)) {
74  throw HashJoinFail("Cannot use hash join for given expression");
75  }
76  const auto lhs_col =
77  lhs_cast ? dynamic_cast<const Analyzer::ColumnVar*>(lhs_cast->get_operand())
78  : dynamic_cast<const Analyzer::ColumnVar*>(lhs);
79  const auto rhs_col =
80  rhs_cast ? dynamic_cast<const Analyzer::ColumnVar*>(rhs_cast->get_operand())
81  : dynamic_cast<const Analyzer::ColumnVar*>(rhs);
82  if (!lhs_col && !rhs_col) {
83  throw HashJoinFail("Cannot use hash join for given expression");
84  }
85  const Analyzer::ColumnVar* inner_col{nullptr};
86  const Analyzer::ColumnVar* outer_col{nullptr};
87  auto outer_ti = lhs_ti;
88  auto inner_ti = rhs_ti;
89  const Analyzer::Expr* outer_expr{lhs};
90  if ((!lhs_col || (rhs_col && lhs_col->get_rte_idx() < rhs_col->get_rte_idx())) &&
91  (!rhs_col || (!lhs_col || lhs_col->get_rte_idx() < rhs_col->get_rte_idx()))) {
92  inner_col = rhs_col;
93  outer_col = lhs_col;
94  } else {
95  if (lhs_col && lhs_col->get_rte_idx() == 0) {
96  throw HashJoinFail("Cannot use hash join for given expression");
97  }
98  inner_col = lhs_col;
99  outer_col = rhs_col;
100  std::swap(outer_ti, inner_ti);
101  outer_expr = rhs;
102  }
103  if (!inner_col) {
104  throw HashJoinFail("Cannot use hash join for given expression");
105  }
106  if (!outer_col) {
107  MaxRangeTableIndexVisitor rte_idx_visitor;
108  int outer_rte_idx = rte_idx_visitor.visit(outer_expr);
109  // The inner column candidate is not actually inner; the outer
110  // expression contains columns which are at least as deep.
111  if (inner_col->get_rte_idx() <= outer_rte_idx) {
112  throw HashJoinFail("Cannot use hash join for given expression");
113  }
114  }
115  // We need to fetch the actual type information from the catalog since Analyzer
116  // always reports nullable as true for inner table columns in left joins.
117  const auto inner_col_cd = get_column_descriptor_maybe(
118  inner_col->get_column_id(), inner_col->get_table_id(), cat);
119  const auto inner_col_real_ti = get_column_type(inner_col->get_column_id(),
120  inner_col->get_table_id(),
121  inner_col_cd,
122  temporary_tables);
123  const auto& outer_col_ti =
124  !(dynamic_cast<const Analyzer::FunctionOper*>(lhs)) && outer_col
125  ? outer_col->get_type_info()
126  : outer_ti;
127  // Casts from decimal are not supported.
128  if ((inner_col_real_ti.is_decimal() || outer_col_ti.is_decimal()) &&
129  (lhs_cast || rhs_cast)) {
130  throw HashJoinFail("Cannot use hash join for given expression");
131  }
132  if (is_overlaps_join) {
133  if (!inner_col_real_ti.is_array()) {
134  throw HashJoinFail(
135  "Overlaps join only supported for inner columns with array type");
136  }
137  auto is_bounds_array = [](const auto ti) {
138  return ti.is_fixlen_array() && ti.get_size() == 32;
139  };
140  if (!is_bounds_array(inner_col_real_ti)) {
141  throw HashJoinFail(
142  "Overlaps join only supported for 4-element double fixed length arrays");
143  }
144  if (!(outer_col_ti.get_type() == kPOINT || is_bounds_array(outer_col_ti))) {
145  throw HashJoinFail(
146  "Overlaps join only supported for geometry outer columns of type point or "
147  "geometry columns with bounds");
148  }
149  } else {
150  if (!(inner_col_real_ti.is_integer() || inner_col_real_ti.is_time() ||
151  inner_col_real_ti.is_decimal() ||
152  (inner_col_real_ti.is_string() &&
153  inner_col_real_ti.get_compression() == kENCODING_DICT))) {
154  throw HashJoinFail(
155  "Can only apply hash join to integer-like types and dictionary encoded "
156  "strings");
157  }
158  }
159 
160  auto normalized_inner_col = inner_col;
161  auto normalized_outer_col = outer_col ? outer_col : outer_expr;
162 
163  const auto& normalized_inner_ti = normalized_inner_col->get_type_info();
164  const auto& normalized_outer_ti = normalized_outer_col->get_type_info();
165 
166  if (normalized_inner_ti.is_string() != normalized_outer_ti.is_string()) {
167  throw HashJoinFail(std::string("Could not build hash tables for incompatible types " +
168  normalized_inner_ti.get_type_name() + " and " +
169  normalized_outer_ti.get_type_name()));
170  }
171 
172  return {normalized_inner_col, normalized_outer_col};
173 }
std::string cat(Ts &&...args)
const SQLTypeInfo get_column_type(const int col_id, const int table_id, const ColumnDescriptor *cd, const TemporaryTables *temporary_tables)
Definition: Execute.h:199
Definition: sqldefs.h:49
T visit(const Analyzer::Expr *expr) const
const ColumnDescriptor * get_column_descriptor_maybe(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:183
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<InnerOuter> normalize_column_pairs ( const Analyzer::BinOper condition,
const Catalog_Namespace::Catalog cat,
const TemporaryTables temporary_tables 
)

Definition at line 175 of file JoinHashTable.cpp.

References cat(), CHECK, CHECK_EQ, Analyzer::BinOper::get_left_operand(), Analyzer::BinOper::get_right_operand(), Analyzer::BinOper::is_overlaps_oper(), normalize_column_pair(), and run_benchmark_import::result.

Referenced by anonymous_namespace{FromTableReordering.cpp}::get_join_qual_cost(), OverlapsJoinHashTable::getInstance(), BaselineJoinHashTable::getInstance(), and Executor::skipFragmentPair().

177  {
178  std::vector<InnerOuter> result;
179  const auto lhs_tuple_expr =
180  dynamic_cast<const Analyzer::ExpressionTuple*>(condition->get_left_operand());
181  const auto rhs_tuple_expr =
182  dynamic_cast<const Analyzer::ExpressionTuple*>(condition->get_right_operand());
183 
184  CHECK_EQ(static_cast<bool>(lhs_tuple_expr), static_cast<bool>(rhs_tuple_expr));
185  if (lhs_tuple_expr) {
186  const auto& lhs_tuple = lhs_tuple_expr->getTuple();
187  const auto& rhs_tuple = rhs_tuple_expr->getTuple();
188  CHECK_EQ(lhs_tuple.size(), rhs_tuple.size());
189  for (size_t i = 0; i < lhs_tuple.size(); ++i) {
190  result.push_back(normalize_column_pair(lhs_tuple[i].get(),
191  rhs_tuple[i].get(),
192  cat,
193  temporary_tables,
194  condition->is_overlaps_oper()));
195  }
196  } else {
197  CHECK(!lhs_tuple_expr && !rhs_tuple_expr);
198  result.push_back(normalize_column_pair(condition->get_left_operand(),
199  condition->get_right_operand(),
200  cat,
201  temporary_tables,
202  condition->is_overlaps_oper()));
203  }
204 
205  return result;
206 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::string cat(Ts &&...args)
const Expr * get_right_operand() const
Definition: Analyzer.h:443
#define CHECK(condition)
Definition: Logger.h:197
const Expr * get_left_operand() const
Definition: Analyzer.h:442
bool is_overlaps_oper() const
Definition: Analyzer.h:440
InnerOuter normalize_column_pair(const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<Fragmenter_Namespace::FragmentInfo> only_shards_for_device ( const std::vector< Fragmenter_Namespace::FragmentInfo > &  fragments,
const int  device_id,
const int  device_count 
)

Definition at line 479 of file JoinHashTable.cpp.

References CHECK_GE.

Referenced by JoinHashTable::reify(), OverlapsJoinHashTable::reifyWithLayout(), and BaselineJoinHashTable::reifyWithLayout().

482  {
483  std::vector<Fragmenter_Namespace::FragmentInfo> shards_for_device;
484  for (const auto& fragment : fragments) {
485  CHECK_GE(fragment.shard, 0);
486  if (fragment.shard % device_count == device_id) {
487  shards_for_device.push_back(fragment);
488  }
489  }
490  return shards_for_device;
491 }
#define CHECK_GE(x, y)
Definition: Logger.h:210

+ Here is the caller graph for this function: