OmniSciDB  72c90bc290
QueryMemoryInitializer.h
/*
 * Copyright 2022 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "GpuMemUtils.h"
#include "ResultSet.h"

#include "ThirdParty/robin_hood/robin_hood.h"

#include <memory>
#include <optional>
#include <vector>

#ifdef HAVE_CUDA
#include <cuda.h>
#else
#include <Shared/nocuda.h>
#endif

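// QueryMemoryInitializer allocates and initializes the query output (group-by)
// buffers for one device, on CPU or GPU, together with the auxiliary buffers
// needed by COUNT DISTINCT, MODE and t-digest (approximate quantile) aggregate
// targets, and owns the ResultSets built on top of those buffers.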
class QueryMemoryInitializer {
 public:
  using ModeIndexSet = robin_hood::unordered_set<size_t>;
  using QuantileParam = std::optional<double>;

  struct TargetAggOpsMetadata {
    bool has_count_distinct{false};
    bool has_mode{false};
    bool has_tdigest{false};
    std::vector<int64_t> count_distinct_buf_size;
    ModeIndexSet mode_index_set;
    std::vector<QuantileParam> quantile_params;
  };
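  // The flags and per-target parameters above are filled from the execution unit
  // (see calculateCountDistinctBufferSize, initializeModeIndexSet and
  // initializeQuantileParams below) and drive which auxiliary buffers the
  // allocate*() helpers create for each aggregate target.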

  // Row-based execution constructor
  QueryMemoryInitializer(const RelAlgExecutionUnit& ra_exe_unit,
                         const QueryMemoryDescriptor& query_mem_desc,
                         const int device_id,
                         const ExecutorDeviceType device_type,
                         const ExecutorDispatchMode dispatch_mode,
                         const bool output_columnar,
                         const bool sort_on_gpu,
                         const shared::TableKey& outer_table_key,
                         const int64_t num_rows,
                         const std::vector<std::vector<const int8_t*>>& col_buffers,
                         const std::vector<std::vector<uint64_t>>& frag_offsets,
                         RenderAllocatorMap* render_allocator_map,
                         RenderInfo* render_info,
                         std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                         DeviceAllocator* gpu_allocator,
                         const size_t thread_idx,
                         const Executor* executor);

  // Table functions execution constructor
  QueryMemoryInitializer(const TableFunctionExecutionUnit& exe_unit,
                         const QueryMemoryDescriptor& query_mem_desc,
                         const int device_id,
                         const ExecutorDeviceType device_type,
                         const int64_t num_rows,
                         const std::vector<std::vector<const int8_t*>>& col_buffers,
                         const std::vector<std::vector<uint64_t>>& frag_offsets,
                         std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                         DeviceAllocator* device_allocator,
                         const Executor* executor);

  const auto getCountDistinctBitmapDevicePtr() const {
    return count_distinct_bitmap_device_mem_ptr_;
  }

  const auto getCountDistinctBitmapHostPtr() const {
    return count_distinct_bitmap_host_mem_ptr_;
  }

  const auto getCountDistinctBitmapBytes() const {
    return count_distinct_bitmap_mem_size_;
  }

  // TODO: lazy init (maybe lazy init count distinct above, too?)
  const auto getVarlenOutputHostPtr() const { return varlen_output_buffer_host_ptr_; }

  const auto getVarlenOutputPtr() const { return varlen_output_buffer_; }

  ResultSet* getResultSet(const size_t index) const {
    CHECK_LT(index, result_sets_.size());
    return result_sets_[index].get();
  }

  std::unique_ptr<ResultSet> getResultSetOwned(const size_t index) {
    CHECK_LT(index, result_sets_.size());
    return std::move(result_sets_[index]);
  }

  void resetResultSet(const size_t index) {
    CHECK_LT(index, result_sets_.size());
    result_sets_[index].reset();
  }

  int64_t getAggInitValForIndex(const size_t index) const {
    CHECK_LT(index, init_agg_vals_.size());
    return init_agg_vals_[index];
  }
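
  // Ownership contract: getResultSet() only borrows the ResultSet, while
  // getResultSetOwned() moves it out and leaves the slot in result_sets_ empty,
  // e.g. in a hypothetical caller:
  //
  //   ResultSet* borrowed = query_buffers.getResultSet(0);  // still owned here
  //   std::unique_ptr<ResultSet> owned = query_buffers.getResultSetOwned(0);
  //   // result_sets_[0] is now null; resetResultSet(0) would be a no-op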

  const auto getGroupByBuffersPtr() {
    return reinterpret_cast<int64_t**>(group_by_buffers_.data());
  }

  const auto getGroupByBuffersSize() const { return group_by_buffers_.size(); }

  const auto getNumBuffers() const {
    CHECK_EQ(num_buffers_, group_by_buffers_.size());
    return num_buffers_;
  }

  GpuGroupByBuffers setupTableFunctionGpuBuffers(
      const QueryMemoryDescriptor& query_mem_desc,
      const int device_id,
      const unsigned block_size_x,
      const unsigned grid_size_x,
      const bool zero_initialize_buffers);

  void copyFromTableFunctionGpuBuffers(Data_Namespace::DataMgr* data_mgr,
                                       const QueryMemoryDescriptor& query_mem_desc,
                                       const size_t entry_count,
                                       const GpuGroupByBuffers& gpu_group_by_buffers,
                                       const int device_id,
                                       const unsigned block_size_x,
                                       const unsigned grid_size_x);

  void copyGroupByBuffersFromGpu(DeviceAllocator& device_allocator,
                                 const QueryMemoryDescriptor& query_mem_desc,
                                 const size_t entry_count,
                                 const GpuGroupByBuffers& gpu_group_by_buffers,
                                 const RelAlgExecutionUnit* ra_exe_unit,
                                 const unsigned block_size_x,
                                 const unsigned grid_size_x,
                                 const int device_id,
                                 const bool prepend_index_buffer) const;

 private:
  void initGroupByBuffer(int64_t* buffer,
                         const RelAlgExecutionUnit& ra_exe_unit,
                         const QueryMemoryDescriptor& query_mem_desc,
                         TargetAggOpsMetadata& agg_expr_metadata,
                         const ExecutorDeviceType device_type,
                         const bool output_columnar,
                         const Executor* executor);

  void initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
                     int64_t* groups_buffer,
                     const std::vector<int64_t>& init_vals,
                     TargetAggOpsMetadata& agg_expr_metadata,
                     const int32_t groups_buffer_entry_count,
                     const size_t warp_size,
                     const Executor* executor,
                     const RelAlgExecutionUnit& ra_exe_unit);

  void initColumnarGroups(const QueryMemoryDescriptor& query_mem_desc,
                          int64_t* groups_buffer,
                          const std::vector<int64_t>& init_vals,
                          const Executor* executor,
                          const RelAlgExecutionUnit& ra_exe_unit);

  void initColumnsPerRow(const QueryMemoryDescriptor& query_mem_desc,
                         int8_t* row_ptr,
                         const std::vector<int64_t>& init_vals,
                         const TargetAggOpsMetadata& agg_op_metadata);

  void allocateCountDistinctGpuMem(const QueryMemoryDescriptor& query_mem_desc);

  std::vector<int64_t> calculateCountDistinctBufferSize(
      const QueryMemoryDescriptor& query_mem_desc,
      const RelAlgExecutionUnit& ra_exe_unit) const;

  void allocateCountDistinctBuffers(const QueryMemoryDescriptor& query_mem_desc,
                                    const RelAlgExecutionUnit& ra_exe_unit);

  int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz);

  int64_t allocateCountDistinctSet();

  ModeIndexSet initializeModeIndexSet(const QueryMemoryDescriptor& query_mem_desc,
                                      const RelAlgExecutionUnit& ra_exe_unit);

  void allocateModeBuffer(const QueryMemoryDescriptor& query_mem_desc,
                          const RelAlgExecutionUnit& ra_exe_unit);

  std::vector<QuantileParam> initializeQuantileParams(
      const QueryMemoryDescriptor& query_mem_desc,
      const RelAlgExecutionUnit& ra_exe_unit);

  void allocateTDigestsBuffer(const QueryMemoryDescriptor& query_mem_desc,
                              const RelAlgExecutionUnit& ra_exe_unit);

  GpuGroupByBuffers prepareTopNHeapsDevBuffer(
      const QueryMemoryDescriptor& query_mem_desc,
      const int8_t* init_agg_vals_dev_ptr,
      const size_t n,
      const int device_id,
      const unsigned block_size_x,
      const unsigned grid_size_x);

  GpuGroupByBuffers createAndInitializeGroupByBufferGpu(
      const RelAlgExecutionUnit& ra_exe_unit,
      const QueryMemoryDescriptor& query_mem_desc,
      const int8_t* init_agg_vals_dev_ptr,
      const int device_id,
      const ExecutorDispatchMode dispatch_mode,
      const unsigned block_size_x,
      const unsigned grid_size_x,
      const int8_t warp_size,
      const bool can_sort_on_gpu,
      const bool output_columnar,
      RenderAllocator* render_allocator);

  size_t computeNumberOfBuffers(const QueryMemoryDescriptor& query_mem_desc,
                                const ExecutorDeviceType device_type,
                                const Executor* executor) const;

  void compactProjectionBuffersCpu(const QueryMemoryDescriptor& query_mem_desc,
                                   const size_t projection_count);

  void compactProjectionBuffersGpu(const QueryMemoryDescriptor& query_mem_desc,
                                   Data_Namespace::DataMgr* data_mgr,
                                   const GpuGroupByBuffers& gpu_group_by_buffers,
                                   const size_t projection_count,
                                   const int device_id);

  void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor& query_mem_desc,
                                   const RelAlgExecutionUnit& ra_exe_unit);

  void applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr* data_mgr,
                                   const QueryMemoryDescriptor& query_mem_desc,
                                   const GpuGroupByBuffers& gpu_group_by_buffers,
                                   const RelAlgExecutionUnit& ra_exe_unit,
                                   const unsigned total_thread_count,
                                   const int device_id);

  std::shared_ptr<VarlenOutputInfo> getVarlenOutputInfo();

  const int64_t num_rows_;
  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner_;
  std::vector<std::unique_ptr<ResultSet>> result_sets_;

  std::vector<int64_t> init_agg_vals_;

  size_t num_buffers_;
  std::vector<int64_t*> group_by_buffers_;
  std::shared_ptr<VarlenOutputInfo> varlen_output_info_;

  CUdeviceptr varlen_output_buffer_;
  int8_t* varlen_output_buffer_host_ptr_;

  CUdeviceptr count_distinct_bitmap_device_mem_ptr_;
  size_t count_distinct_bitmap_mem_size_;
  int8_t* count_distinct_bitmap_host_mem_ptr_;

  DeviceAllocator* device_allocator_;
  std::vector<Data_Namespace::AbstractBuffer*> temporary_buffers_;

  const size_t thread_idx_;

  friend class Executor;  // Accesses result_sets_
  friend class QueryExecutionContext;
};
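
In practice this class is driven by the query execution context rather than used
directly. The following is a minimal, hypothetical sketch of the CPU path, assuming
ra_exe_unit, query_mem_desc, outer_table_key, num_rows, col_buffers, frag_offsets,
row_set_mem_owner and executor are supplied by the surrounding execution code; it
only illustrates the constructor parameter order and the ResultSet hand-off declared
above, not an actual call site from this repository.

  QueryMemoryInitializer query_buffers(ra_exe_unit,
                                       query_mem_desc,
                                       /*device_id=*/0,
                                       ExecutorDeviceType::CPU,
                                       ExecutorDispatchMode::KernelPerFragment,
                                       /*output_columnar=*/false,
                                       /*sort_on_gpu=*/false,
                                       outer_table_key,
                                       num_rows,
                                       col_buffers,
                                       frag_offsets,
                                       /*render_allocator_map=*/nullptr,
                                       /*render_info=*/nullptr,
                                       row_set_mem_owner,
                                       /*gpu_allocator=*/nullptr,
                                       /*thread_idx=*/0,
                                       executor);

  // Once the query kernel has populated the group-by buffers, ownership of the
  // first ResultSet can be transferred to the caller:
  std::unique_ptr<ResultSet> result = query_buffers.getResultSetOwned(0);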