OmniSciDB  21ac014ffc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
QueryMemoryInitializer.h
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
20 
22 #include "GpuMemUtils.h"
23 #include "ResultSet.h"
24 
26 
27 #include <memory>
28 
29 #ifdef HAVE_CUDA
30 #include <cuda.h>
31 #else
32 #include <Shared/nocuda.h>
33 #endif
34 
36  public:
37  // Row-based execution constructor
40  const int device_id,
41  const ExecutorDeviceType device_type,
42  const ExecutorDispatchMode dispatch_mode,
43  const bool output_columnar,
44  const bool sort_on_gpu,
45  const int64_t num_rows,
46  const std::vector<std::vector<const int8_t*>>& col_buffers,
47  const std::vector<std::vector<uint64_t>>& frag_offsets,
48  RenderAllocatorMap* render_allocator_map,
49  RenderInfo* render_info,
50  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
51  DeviceAllocator* gpu_allocator,
52  const size_t thread_idx,
53  const Executor* executor);
54 
55  // Table functions execution constructor
57  const QueryMemoryDescriptor& query_mem_desc,
58  const int device_id,
59  const ExecutorDeviceType device_type,
60  const int64_t num_rows,
61  const std::vector<std::vector<const int8_t*>>& col_buffers,
62  const std::vector<std::vector<uint64_t>>& frag_offsets,
63  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
64  DeviceAllocator* device_allocator,
65  const Executor* executor);
66 
68 
70 
71  const auto getCountDistinctBitmapBytes() const {
73  }
74 
75  // TODO: lazy init (maybe lazy init count distinct above, too?)
77 
// Returns the current value of the varlen_output_buffer_ member (by value, type deduced).
// NOTE(review): the declaration of varlen_output_buffer_ is not visible in this listing,
// so the deduced return type cannot be confirmed from here.
78  const auto getVarlenOutputPtr() const { return varlen_output_buffer_; }
79 
// Returns a non-owning pointer to the ResultSet stored at `index` in result_sets_.
// CHECK_LT asserts index < result_sets_.size() before the slot is touched.
// Ownership stays with this object. The result is nullptr when the slot is empty, which
// is the state left behind by getResultSetOwned() or resetResultSet() on the same index.
80  ResultSet* getResultSet(const size_t index) const {
81  CHECK_LT(index, result_sets_.size());
82  return result_sets_[index].get();
83  }
84 
// Transfers ownership of the ResultSet stored at `index` to the caller.
// CHECK_LT asserts index < result_sets_.size() first.
// The slot itself is not erased: result_sets_ keeps its size and the moved-from
// unique_ptr at `index` is left null, so a later getResultSet(index) yields nullptr.
85  std::unique_ptr<ResultSet> getResultSetOwned(const size_t index) {
86  CHECK_LT(index, result_sets_.size());
87  return std::move(result_sets_[index]);
88  }
89 
// Releases the ResultSet stored at `index`, destroying it if the slot holds one.
// CHECK_LT asserts index < result_sets_.size() first.
// Only the pointee is released; the (now null) slot remains in result_sets_.
90  void resetResultSet(const size_t index) {
91  CHECK_LT(index, result_sets_.size());
92  result_sets_[index].reset();
93  }
94 
// Returns, by value, the entry of init_agg_vals_ at position `index`.
// CHECK_LT asserts index < init_agg_vals_.size() before the element is read.
95  int64_t getAggInitValForIndex(const size_t index) const {
96  CHECK_LT(index, init_agg_vals_.size());
97  return init_agg_vals_[index];
98  }
99 
// Returns a pointer to the contiguous array of buffer pointers held by group_by_buffers_
// (getGroupByBuffersSize() gives the number of entries).
// group_by_buffers_ is a std::vector<int64_t*>, so data() already has type int64_t**; the
// return type is spelled out and no cast is used, so a change to the element type becomes
// a compile error instead of being silently reinterpreted.
// The pointer refers to storage owned by the vector and is invalidated if the vector
// reallocates.
100  int64_t** getGroupByBuffersPtr() {
101  return group_by_buffers_.data();
102  }
103 
// Returns the number of entries in group_by_buffers_, i.e. how many pointers are reachable
// through getGroupByBuffersPtr().
104  const auto getGroupByBuffersSize() const { return group_by_buffers_.size(); }
105 
// Returns the value of num_buffers_.
// NOTE(review): the listing jumps from line 106 to line 108, so one statement between the
// signature and the return is not visible here — confirm against the original header.
106  const auto getNumBuffers() const {
108  return num_buffers_;
109  }
110 
111 #ifdef HAVE_CUDA
112  GpuGroupByBuffers setupTableFunctionGpuBuffers(
113  const QueryMemoryDescriptor& query_mem_desc,
114  const int device_id,
115  const unsigned block_size_x,
116  const unsigned grid_size_x);
117  void copyFromTableFunctionGpuBuffers(Data_Namespace::DataMgr* data_mgr,
118  const QueryMemoryDescriptor& query_mem_desc,
119  const size_t entry_count,
120  const GpuGroupByBuffers& gpu_group_by_buffers,
121  const int device_id,
122  const unsigned block_size_x,
123  const unsigned grid_size_x);
124 #endif
125 
127  const QueryMemoryDescriptor& query_mem_desc,
128  const size_t entry_count,
129  const GpuGroupByBuffers& gpu_group_by_buffers,
130  const RelAlgExecutionUnit* ra_exe_unit,
131  const unsigned block_size_x,
132  const unsigned grid_size_x,
133  const int device_id,
134  const bool prepend_index_buffer) const;
135 
136  private:
137  void initGroupByBuffer(int64_t* buffer,
138  const RelAlgExecutionUnit& ra_exe_unit,
139  const QueryMemoryDescriptor& query_mem_desc,
140  const ExecutorDeviceType device_type,
141  const bool output_columnar,
142  const Executor* executor);
143 
144  void initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
145  int64_t* groups_buffer,
146  const std::vector<int64_t>& init_vals,
147  const int32_t groups_buffer_entry_count,
148  const size_t warp_size,
149  const Executor* executor);
150 
151  void initColumnarGroups(const QueryMemoryDescriptor& query_mem_desc,
152  int64_t* groups_buffer,
153  const std::vector<int64_t>& init_vals,
154  const Executor* executor);
155 
156  using QuantileParam = std::optional<double>;
157  void initColumnsPerRow(const QueryMemoryDescriptor& query_mem_desc,
158  int8_t* row_ptr,
159  const std::vector<int64_t>& init_vals,
160  const std::vector<int64_t>& bitmap_sizes,
161  const std::vector<QuantileParam>& quantile_params);
162 
163  void allocateCountDistinctGpuMem(const QueryMemoryDescriptor& query_mem_desc);
164 
165  std::vector<int64_t> allocateCountDistinctBuffers(
166  const QueryMemoryDescriptor& query_mem_desc,
167  const bool deferred,
168  const Executor* executor);
169 
170  int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz);
171 
172  int64_t allocateCountDistinctSet();
173 
174  std::vector<QuantileParam> allocateTDigests(const QueryMemoryDescriptor& query_mem_desc,
175  const bool deferred,
176  const Executor* executor);
177 
178 #ifdef HAVE_CUDA
179  GpuGroupByBuffers prepareTopNHeapsDevBuffer(const QueryMemoryDescriptor& query_mem_desc,
180  const CUdeviceptr init_agg_vals_dev_ptr,
181  const size_t n,
182  const int device_id,
183  const unsigned block_size_x,
184  const unsigned grid_size_x);
185 
186  GpuGroupByBuffers createAndInitializeGroupByBufferGpu(
187  const RelAlgExecutionUnit& ra_exe_unit,
188  const QueryMemoryDescriptor& query_mem_desc,
189  const CUdeviceptr init_agg_vals_dev_ptr,
190  const int device_id,
191  const ExecutorDispatchMode dispatch_mode,
192  const unsigned block_size_x,
193  const unsigned grid_size_x,
194  const int8_t warp_size,
195  const bool can_sort_on_gpu,
196  const bool output_columnar,
197  RenderAllocator* render_allocator);
198 #endif
199 
200  size_t computeNumberOfBuffers(const QueryMemoryDescriptor& query_mem_desc,
201  const ExecutorDeviceType device_type,
202  const Executor* executor) const;
203 
204  void compactProjectionBuffersCpu(const QueryMemoryDescriptor& query_mem_desc,
205  const size_t projection_count);
206  void compactProjectionBuffersGpu(const QueryMemoryDescriptor& query_mem_desc,
207  Data_Namespace::DataMgr* data_mgr,
208  const GpuGroupByBuffers& gpu_group_by_buffers,
209  const size_t projection_count,
210  const int device_id);
211 
212  void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor& query_mem_desc,
213  const RelAlgExecutionUnit& ra_exe_unit);
214 
216  const QueryMemoryDescriptor& query_mem_desc,
217  const GpuGroupByBuffers& gpu_group_by_buffers,
218  const RelAlgExecutionUnit& ra_exe_unit,
219  const unsigned total_thread_count,
220  const int device_id);
221 
222  std::shared_ptr<VarlenOutputInfo> getVarlenOutputInfo();
223 
// Row count; const, so it is fixed for the lifetime of the object.
224  const int64_t num_rows_;
// Shared handle to the RowSetMemoryOwner.
// NOTE(review): presumably the constructor argument of the same name — confirm in the .cpp.
225  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner_;
// Owned result sets, addressed by index through getResultSet(), getResultSetOwned() and
// resetResultSet(). A slot is null after its result set has been moved out or reset.
226  std::vector<std::unique_ptr<ResultSet>> result_sets_;
227 

// Values served by getAggInitValForIndex().
228  std::vector<int64_t> init_agg_vals_;
229 

// Value reported by getNumBuffers().
230  size_t num_buffers_;
// Buffer pointers exposed through getGroupByBuffersPtr() / getGroupByBuffersSize().
// NOTE(review): stored as raw pointers; who owns the pointed-to memory is not visible here.
231  std::vector<int64_t*> group_by_buffers_;
// Same type that getVarlenOutputInfo() returns.
232  std::shared_ptr<VarlenOutputInfo> varlen_output_info_;
// NOTE(review): listing lines 233-241 are mostly elided in this view; any members declared
// there (e.g. the varlen_output_buffer_ read by getVarlenOutputPtr()) are not shown.
235 
240 

// Raw AbstractBuffer pointers.
// NOTE(review): lifetime and ownership are not visible in this listing.
242  std::vector<Data_Namespace::AbstractBuffer*> temporary_buffers_;
243 

// Thread index; const, so it is fixed for the lifetime of the object.
244  const size_t thread_idx_;
245 

246  friend class Executor; // Accesses result_sets_
// Grants QueryExecutionContext access to the private members above.
247  friend class QueryExecutionContext;
248 };
248 };
#define CHECK_EQ(x, y)
Definition: Logger.h:214
void resetResultSet(const size_t index)
void initColumnsPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< QuantileParam > &quantile_params)
DeviceAllocator * device_allocator_
ExecutorDeviceType
std::vector< int64_t > allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
unsigned long long CUdeviceptr
Definition: nocuda.h:27
QueryMemoryInitializer(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
std::optional< double > QuantileParam
size_t computeNumberOfBuffers(const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
std::vector< QuantileParam > allocateTDigests(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
ExecutorDispatchMode
void compactProjectionBuffersGpu(const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
std::vector< int64_t > init_agg_vals_
std::unique_ptr< ResultSet > getResultSetOwned(const size_t index)
void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
int64_t getAggInitValForIndex(const size_t index) const
const auto getCountDistinctBitmapPtr() const
void initGroupByBuffer(int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
void compactProjectionBuffersCpu(const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
std::vector< int64_t * > group_by_buffers_
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
#define CHECK_LT(x, y)
Definition: Logger.h:216
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo()
const auto getNumBuffers() const
Abstract class for managing device memory allocations.
const auto getCountDistinctBitmapBytes() const
Descriptor for the result set buffer layout.
void copyGroupByBuffersFromGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
const auto getVarlenOutputPtr() const
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
void applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
const auto getGroupByBuffersSize() const
Basic constructors and methods of the row set interface.
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc)
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
ResultSet * getResultSet(const size_t index) const
std::vector< std::unique_ptr< ResultSet > > result_sets_
void allocateCountDistinctGpuMem(const QueryMemoryDescriptor &query_mem_desc)
const auto getVarlenOutputHostPtr() const
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
const auto getCountDistinctHostPtr() const
void initRowGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)