OmniSciDB  340b00dbf6
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
JoinHashTable.cpp File Reference
#include "QueryEngine/JoinHashTable/JoinHashTable.h"
#include <atomic>
#include <future>
#include <numeric>
#include <thread>
#include "Logger/Logger.h"
#include "QueryEngine/CodeGenerator.h"
#include "QueryEngine/ColumnFetcher.h"
#include "QueryEngine/Execute.h"
#include "QueryEngine/ExpressionRewrite.h"
#include "QueryEngine/JoinHashTable/HashJoinRuntime.h"
#include "QueryEngine/RangeTableIndexVisitor.h"
#include "QueryEngine/RuntimeFunctions.h"
+ Include dependency graph for JoinHashTable.cpp:

Go to the source code of this file.

Classes

class  anonymous_namespace{JoinHashTable.cpp}::NeedsOneToManyHash
 

Namespaces

 anonymous_namespace{JoinHashTable.cpp}
 

Functions

InnerOuter normalize_column_pair (const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join)
 
std::vector< InnerOuternormalize_column_pairs (const Analyzer::BinOper *condition, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
 
std::pair< const
Analyzer::ColumnVar *, const
Analyzer::Expr * > 
anonymous_namespace{JoinHashTable.cpp}::get_cols (const Analyzer::BinOper *qual_bin_oper, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)
 
HashEntryInfo anonymous_namespace{JoinHashTable.cpp}::get_bucketized_hash_entry_info (SQLTypeInfo const &context_ti, ExpressionRange const &col_range, bool const is_bw_eq)
 
size_t anonymous_namespace{JoinHashTable.cpp}::get_hash_entry_count (const ExpressionRange &col_range, const bool is_bw_eq)
 
size_t get_shard_count (const Analyzer::BinOper *join_condition, const Executor *executor)
 
bool anonymous_namespace{JoinHashTable.cpp}::shard_count_less_or_equal_device_count (const int inner_table_id, const Executor *executor)
 
size_t get_shard_count (std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > equi_pair, const Executor *executor)
 
bool needs_dictionary_translation (const Analyzer::ColumnVar *inner_col, const Analyzer::Expr *outer_col_expr, const Executor *executor)
 
std::vector
< Fragmenter_Namespace::FragmentInfo
only_shards_for_device (const std::vector< Fragmenter_Namespace::FragmentInfo > &fragments, const int device_id, const int device_count)
 
const InputTableInfoget_inner_query_info (const int inner_table_id, const std::vector< InputTableInfo > &query_infos)
 
size_t get_entries_per_device (const size_t total_entries, const size_t shard_count, const size_t device_count, const Data_Namespace::MemoryLevel memory_level)
 

Function Documentation

size_t get_entries_per_device ( const size_t  total_entries,
const size_t  shard_count,
const size_t  device_count,
const Data_Namespace::MemoryLevel  memory_level 
)

Definition at line 1609 of file JoinHashTable.cpp.

References CHECK_GT, and Data_Namespace::GPU_LEVEL.

Referenced by OverlapsJoinHashTable::calculateCounts(), OverlapsJoinHashTable::getInstance(), BaselineJoinHashTable::getInstance(), and BaselineJoinHashTable::reifyWithLayout().

1612  {
1613  const auto entries_per_shard =
1614  shard_count ? (total_entries + shard_count - 1) / shard_count : total_entries;
1615  size_t entries_per_device = entries_per_shard;
1616  if (memory_level == Data_Namespace::GPU_LEVEL && shard_count) {
1617  const auto shards_per_device = (shard_count + device_count - 1) / device_count;
1618  CHECK_GT(shards_per_device, 0u);
1619  entries_per_device = entries_per_shard * shards_per_device;
1620  }
1621  return entries_per_device;
1622 }
#define CHECK_GT(x, y)
Definition: Logger.h:209

+ Here is the caller graph for this function:

const InputTableInfo& get_inner_query_info ( const int  inner_table_id,
const std::vector< InputTableInfo > &  query_infos 
)

Definition at line 1595 of file JoinHashTable.cpp.

References CHECK.

Referenced by JoinHashTable::getInnerQueryInfo(), OverlapsJoinHashTable::getInstance(), BaselineJoinHashTable::getInstance(), OverlapsJoinHashTable::reifyWithLayout(), and BaselineJoinHashTable::reifyWithLayout().

1597  {
1598  std::optional<size_t> ti_idx;
1599  for (size_t i = 0; i < query_infos.size(); ++i) {
1600  if (inner_table_id == query_infos[i].table_id) {
1601  ti_idx = i;
1602  break;
1603  }
1604  }
1605  CHECK(ti_idx);
1606  return query_infos[*ti_idx];
1607 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

size_t get_shard_count ( const Analyzer::BinOper join_condition,
const Executor executor 
)

Definition at line 265 of file JoinHashTable.cpp.

References anonymous_namespace{JoinHashTable.cpp}::get_cols(), and get_shard_count().

Referenced by JoinHashTable::checkHashJoinReplicationConstraint(), get_shard_count(), BaselineJoinHashTable::getShardCountForCondition(), JoinHashTable::initOneToManyHashTable(), JoinHashTable::shardCount(), and Executor::skipFragmentPair().

266  {
267  const Analyzer::ColumnVar* inner_col{nullptr};
268  const Analyzer::Expr* outer_col{nullptr};
269  std::shared_ptr<Analyzer::BinOper> redirected_bin_oper;
270  try {
271  std::tie(inner_col, outer_col) =
272  get_cols(join_condition, *executor->getCatalog(), executor->getTemporaryTables());
273  } catch (...) {
274  return 0;
275  }
276  if (!inner_col || !outer_col) {
277  return 0;
278  }
279  return get_shard_count({inner_col, outer_col}, executor);
280 }
size_t get_shard_count(const Analyzer::BinOper *join_condition, const Executor *executor)
std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * > get_cols(const Analyzer::BinOper *qual_bin_oper, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t get_shard_count ( std::pair< const Analyzer::ColumnVar *, const Analyzer::Expr * >  equi_pair,
const Executor executor 
)

Definition at line 303 of file JoinHashTable.cpp.

References CHECK, and anonymous_namespace{JoinHashTable.cpp}::shard_count_less_or_equal_device_count().

305  {
306  const auto inner_col = equi_pair.first;
307  const auto outer_col = dynamic_cast<const Analyzer::ColumnVar*>(equi_pair.second);
308  if (!outer_col || inner_col->get_table_id() < 0 || outer_col->get_table_id() < 0) {
309  return 0;
310  }
311  if (outer_col->get_rte_idx()) {
312  return 0;
313  }
314  if (inner_col->get_type_info() != outer_col->get_type_info()) {
315  return 0;
316  }
317  const auto catalog = executor->getCatalog();
318  const auto inner_td = catalog->getMetadataForTable(inner_col->get_table_id());
319  CHECK(inner_td);
320  const auto outer_td = catalog->getMetadataForTable(outer_col->get_table_id());
321  CHECK(outer_td);
322  if (inner_td->shardedColumnId == 0 || outer_td->shardedColumnId == 0 ||
323  inner_td->nShards != outer_td->nShards) {
324  return 0;
325  }
326  if (!shard_count_less_or_equal_device_count(inner_td->tableId, executor)) {
327  return 0;
328  }
329  // The two columns involved must be the ones on which the tables have been sharded on.
330  return (inner_td->shardedColumnId == inner_col->get_column_id() &&
331  outer_td->shardedColumnId == outer_col->get_column_id()) ||
332  (outer_td->shardedColumnId == inner_col->get_column_id() &&
333  inner_td->shardedColumnId == inner_col->get_column_id())
334  ? inner_td->nShards
335  : 0;
336 }
bool shard_count_less_or_equal_device_count(const int inner_table_id, const Executor *executor)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

bool needs_dictionary_translation ( const Analyzer::ColumnVar inner_col,
const Analyzer::Expr outer_col_expr,
const Executor executor 
)

Definition at line 448 of file JoinHashTable.cpp.

References CHECK, CHECK_EQ, get_column_descriptor_maybe(), Analyzer::ColumnVar::get_column_id(), get_column_type(), and Analyzer::ColumnVar::get_table_id().

Referenced by BaselineJoinHashTable::getEffectiveMemoryLevel(), JoinHashTable::reifyOneToManyForDevice(), and JoinHashTable::reifyOneToOneForDevice().

450  {
451  const auto catalog = executor->getCatalog();
452  CHECK(catalog);
453  const auto inner_cd = get_column_descriptor_maybe(
454  inner_col->get_column_id(), inner_col->get_table_id(), *catalog);
455  const auto& inner_ti = get_column_type(inner_col->get_column_id(),
456  inner_col->get_table_id(),
457  inner_cd,
458  executor->getTemporaryTables());
459  // Only strings may need dictionary translation.
460  if (!inner_ti.is_string()) {
461  return false;
462  }
463  const auto outer_col = dynamic_cast<const Analyzer::ColumnVar*>(outer_col_expr);
464  CHECK(outer_col);
465  const auto outer_cd = get_column_descriptor_maybe(
466  outer_col->get_column_id(), outer_col->get_table_id(), *catalog);
467  // Don't want to deal with temporary tables for now, require translation.
468  if (!inner_cd || !outer_cd) {
469  return true;
470  }
471  const auto& outer_ti = get_column_type(outer_col->get_column_id(),
472  outer_col->get_table_id(),
473  outer_cd,
474  executor->getTemporaryTables());
475  CHECK_EQ(inner_ti.is_string(), outer_ti.is_string());
476  // If the two columns don't share the dictionary, translation is needed.
477  return outer_ti.get_comp_param() != inner_ti.get_comp_param();
478 }
int get_table_id() const
Definition: Analyzer.h:194
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const SQLTypeInfo get_column_type(const int col_id, const int table_id, const ColumnDescriptor *cd, const TemporaryTables *temporary_tables)
Definition: Execute.h:199
const ColumnDescriptor * get_column_descriptor_maybe(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:183
#define CHECK(condition)
Definition: Logger.h:197
int get_column_id() const
Definition: Analyzer.h:195

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

InnerOuter normalize_column_pair ( const Analyzer::Expr lhs,
const Analyzer::Expr rhs,
const Catalog_Namespace::Catalog cat,
const TemporaryTables temporary_tables,
const bool  is_overlaps_join 
)

Definition at line 42 of file JoinHashTable.cpp.

References cat(), get_column_descriptor_maybe(), get_column_type(), Analyzer::Expr::get_type_info(), kCAST, kENCODING_DICT, kPOINT, and ScalarExprVisitor< T >::visit().

Referenced by anonymous_namespace{JoinHashTable.cpp}::get_cols(), and normalize_column_pairs().

46  {
47  const auto& lhs_ti = lhs->get_type_info();
48  const auto& rhs_ti = rhs->get_type_info();
49  if (!is_overlaps_join) {
50  if (lhs_ti.get_type() != rhs_ti.get_type()) {
51  throw HashJoinFail("Equijoin types must be identical, found: " +
52  lhs_ti.get_type_name() + ", " + rhs_ti.get_type_name());
53  }
54  if (!lhs_ti.is_integer() && !lhs_ti.is_time() && !lhs_ti.is_string() &&
55  !lhs_ti.is_decimal()) {
56  throw HashJoinFail("Cannot apply hash join to inner column type " +
57  lhs_ti.get_type_name());
58  }
59  // Decimal types should be identical.
60  if (lhs_ti.is_decimal() && (lhs_ti.get_scale() != rhs_ti.get_scale() ||
61  lhs_ti.get_precision() != rhs_ti.get_precision())) {
62  throw HashJoinFail("Equijoin with different decimal types");
63  }
64  }
65 
66  const auto lhs_cast = dynamic_cast<const Analyzer::UOper*>(lhs);
67  const auto rhs_cast = dynamic_cast<const Analyzer::UOper*>(rhs);
68  if (lhs_ti.is_string() && (static_cast<bool>(lhs_cast) != static_cast<bool>(rhs_cast) ||
69  (lhs_cast && lhs_cast->get_optype() != kCAST) ||
70  (rhs_cast && rhs_cast->get_optype() != kCAST))) {
71  throw HashJoinFail("Cannot use hash join for given expression");
72  }
73  // Casts to decimal are not suported.
74  if (lhs_ti.is_decimal() && (lhs_cast || rhs_cast)) {
75  throw HashJoinFail("Cannot use hash join for given expression");
76  }
77  const auto lhs_col =
78  lhs_cast ? dynamic_cast<const Analyzer::ColumnVar*>(lhs_cast->get_operand())
79  : dynamic_cast<const Analyzer::ColumnVar*>(lhs);
80  const auto rhs_col =
81  rhs_cast ? dynamic_cast<const Analyzer::ColumnVar*>(rhs_cast->get_operand())
82  : dynamic_cast<const Analyzer::ColumnVar*>(rhs);
83  if (!lhs_col && !rhs_col) {
84  throw HashJoinFail("Cannot use hash join for given expression");
85  }
86  const Analyzer::ColumnVar* inner_col{nullptr};
87  const Analyzer::ColumnVar* outer_col{nullptr};
88  auto outer_ti = lhs_ti;
89  auto inner_ti = rhs_ti;
90  const Analyzer::Expr* outer_expr{lhs};
91  if ((!lhs_col || (rhs_col && lhs_col->get_rte_idx() < rhs_col->get_rte_idx())) &&
92  (!rhs_col || (!lhs_col || lhs_col->get_rte_idx() < rhs_col->get_rte_idx()))) {
93  inner_col = rhs_col;
94  outer_col = lhs_col;
95  } else {
96  if (lhs_col && lhs_col->get_rte_idx() == 0) {
97  throw HashJoinFail("Cannot use hash join for given expression");
98  }
99  inner_col = lhs_col;
100  outer_col = rhs_col;
101  std::swap(outer_ti, inner_ti);
102  outer_expr = rhs;
103  }
104  if (!inner_col) {
105  throw HashJoinFail("Cannot use hash join for given expression");
106  }
107  if (!outer_col) {
108  MaxRangeTableIndexVisitor rte_idx_visitor;
109  int outer_rte_idx = rte_idx_visitor.visit(outer_expr);
110  // The inner column candidate is not actually inner; the outer
111  // expression contains columns which are at least as deep.
112  if (inner_col->get_rte_idx() <= outer_rte_idx) {
113  throw HashJoinFail("Cannot use hash join for given expression");
114  }
115  }
116  // We need to fetch the actual type information from the catalog since Analyzer
117  // always reports nullable as true for inner table columns in left joins.
118  const auto inner_col_cd = get_column_descriptor_maybe(
119  inner_col->get_column_id(), inner_col->get_table_id(), cat);
120  const auto inner_col_real_ti = get_column_type(inner_col->get_column_id(),
121  inner_col->get_table_id(),
122  inner_col_cd,
123  temporary_tables);
124  const auto& outer_col_ti =
125  !(dynamic_cast<const Analyzer::FunctionOper*>(lhs)) && outer_col
126  ? outer_col->get_type_info()
127  : outer_ti;
128  // Casts from decimal are not supported.
129  if ((inner_col_real_ti.is_decimal() || outer_col_ti.is_decimal()) &&
130  (lhs_cast || rhs_cast)) {
131  throw HashJoinFail("Cannot use hash join for given expression");
132  }
133  if (is_overlaps_join) {
134  if (!inner_col_real_ti.is_array()) {
135  throw HashJoinFail(
136  "Overlaps join only supported for inner columns with array type");
137  }
138  auto is_bounds_array = [](const auto ti) {
139  return ti.is_fixlen_array() && ti.get_size() == 32;
140  };
141  if (!is_bounds_array(inner_col_real_ti)) {
142  throw HashJoinFail(
143  "Overlaps join only supported for 4-element double fixed length arrays");
144  }
145  if (!(outer_col_ti.get_type() == kPOINT || is_bounds_array(outer_col_ti))) {
146  throw HashJoinFail(
147  "Overlaps join only supported for geometry outer columns of type point or "
148  "geometry columns with bounds");
149  }
150  } else {
151  if (!(inner_col_real_ti.is_integer() || inner_col_real_ti.is_time() ||
152  inner_col_real_ti.is_decimal() ||
153  (inner_col_real_ti.is_string() &&
154  inner_col_real_ti.get_compression() == kENCODING_DICT))) {
155  throw HashJoinFail(
156  "Can only apply hash join to integer-like types and dictionary encoded "
157  "strings");
158  }
159  }
160 
161  auto normalized_inner_col = inner_col;
162  auto normalized_outer_col = outer_col ? outer_col : outer_expr;
163 
164  const auto& normalized_inner_ti = normalized_inner_col->get_type_info();
165  const auto& normalized_outer_ti = normalized_outer_col->get_type_info();
166 
167  if (normalized_inner_ti.is_string() != normalized_outer_ti.is_string()) {
168  throw HashJoinFail(std::string("Could not build hash tables for incompatible types " +
169  normalized_inner_ti.get_type_name() + " and " +
170  normalized_outer_ti.get_type_name()));
171  }
172 
173  return {normalized_inner_col, normalized_outer_col};
174 }
std::string cat(Ts &&...args)
const SQLTypeInfo get_column_type(const int col_id, const int table_id, const ColumnDescriptor *cd, const TemporaryTables *temporary_tables)
Definition: Execute.h:199
Definition: sqldefs.h:49
T visit(const Analyzer::Expr *expr) const
const ColumnDescriptor * get_column_descriptor_maybe(const int col_id, const int table_id, const Catalog_Namespace::Catalog &cat)
Definition: Execute.h:183
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:78

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<InnerOuter> normalize_column_pairs ( const Analyzer::BinOper condition,
const Catalog_Namespace::Catalog cat,
const TemporaryTables temporary_tables 
)

Definition at line 176 of file JoinHashTable.cpp.

References cat(), CHECK, CHECK_EQ, Analyzer::BinOper::get_left_operand(), Analyzer::BinOper::get_right_operand(), Analyzer::BinOper::is_overlaps_oper(), normalize_column_pair(), and run_benchmark_import::result.

Referenced by anonymous_namespace{FromTableReordering.cpp}::get_join_qual_cost(), OverlapsJoinHashTable::getInstance(), BaselineJoinHashTable::getInstance(), and Executor::skipFragmentPair().

178  {
179  std::vector<InnerOuter> result;
180  const auto lhs_tuple_expr =
181  dynamic_cast<const Analyzer::ExpressionTuple*>(condition->get_left_operand());
182  const auto rhs_tuple_expr =
183  dynamic_cast<const Analyzer::ExpressionTuple*>(condition->get_right_operand());
184 
185  CHECK_EQ(static_cast<bool>(lhs_tuple_expr), static_cast<bool>(rhs_tuple_expr));
186  if (lhs_tuple_expr) {
187  const auto& lhs_tuple = lhs_tuple_expr->getTuple();
188  const auto& rhs_tuple = rhs_tuple_expr->getTuple();
189  CHECK_EQ(lhs_tuple.size(), rhs_tuple.size());
190  for (size_t i = 0; i < lhs_tuple.size(); ++i) {
191  result.push_back(normalize_column_pair(lhs_tuple[i].get(),
192  rhs_tuple[i].get(),
193  cat,
194  temporary_tables,
195  condition->is_overlaps_oper()));
196  }
197  } else {
198  CHECK(!lhs_tuple_expr && !rhs_tuple_expr);
199  result.push_back(normalize_column_pair(condition->get_left_operand(),
200  condition->get_right_operand(),
201  cat,
202  temporary_tables,
203  condition->is_overlaps_oper()));
204  }
205 
206  return result;
207 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::string cat(Ts &&...args)
const Expr * get_right_operand() const
Definition: Analyzer.h:443
#define CHECK(condition)
Definition: Logger.h:197
const Expr * get_left_operand() const
Definition: Analyzer.h:442
bool is_overlaps_oper() const
Definition: Analyzer.h:440
InnerOuter normalize_column_pair(const Analyzer::Expr *lhs, const Analyzer::Expr *rhs, const Catalog_Namespace::Catalog &cat, const TemporaryTables *temporary_tables, const bool is_overlaps_join)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<Fragmenter_Namespace::FragmentInfo> only_shards_for_device ( const std::vector< Fragmenter_Namespace::FragmentInfo > &  fragments,
const int  device_id,
const int  device_count 
)

Definition at line 480 of file JoinHashTable.cpp.

References CHECK_GE.

Referenced by JoinHashTable::reify(), OverlapsJoinHashTable::reifyWithLayout(), and BaselineJoinHashTable::reifyWithLayout().

483  {
484  std::vector<Fragmenter_Namespace::FragmentInfo> shards_for_device;
485  for (const auto& fragment : fragments) {
486  CHECK_GE(fragment.shard, 0);
487  if (fragment.shard % device_count == device_id) {
488  shards_for_device.push_back(fragment);
489  }
490  }
491  return shards_for_device;
492 }
#define CHECK_GE(x, y)
Definition: Logger.h:210

+ Here is the caller graph for this function: