OmniSciDB  04ee39c94c
QueryMemoryInitializer.h
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
20 
22 #include "GpuMemUtils.h"
23 #include "ResultSet.h"
24 
26 
27 #include <memory>
28 
29 #ifdef HAVE_CUDA
30 #include <cuda.h>
31 #else
32 #include <Shared/nocuda.h>
33 #endif
34 
36  public:
38  const QueryMemoryDescriptor& query_mem_desc,
39  const int device_id,
40  const ExecutorDeviceType device_type,
41  const ExecutorDispatchMode dispatch_mode,
42  const bool output_columnar,
43  const bool sort_on_gpu,
44  const int64_t num_rows,
45  const std::vector<std::vector<const int8_t*>>& col_buffers,
46  const std::vector<std::vector<uint64_t>>& frag_offsets,
47  RenderAllocatorMap* render_allocator_map,
48  RenderInfo* render_info,
49  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
50  DeviceAllocator* gpu_allocator,
51  const Executor* executor);
52 
54 
56 
57  const auto getCountDistinctBitmapBytes() const {
59  }
60 
61  ResultSet* getResultSet(const size_t index) const {
62  CHECK_LT(index, result_sets_.size());
63  return result_sets_[index].get();
64  }
65 
66  std::unique_ptr<ResultSet> getResultSetOwned(const size_t index) {
67  CHECK_LT(index, result_sets_.size());
68  return std::move(result_sets_[index]);
69  }
70 
71  void resetResultSet(const size_t index) {
72  CHECK_LT(index, result_sets_.size());
73  result_sets_[index].reset();
74  }
75 
76  int64_t getAggInitValForIndex(const size_t index) const {
77  CHECK_LT(index, init_agg_vals_.size());
78  return init_agg_vals_[index];
79  }
80 
81  const auto getGroupByBuffersPtr() {
82  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
83  }
84 
85  const auto getGroupByBuffersSize() const { return group_by_buffers_.size(); }
86 
87  const auto getNumBuffers() const {
89  return num_buffers_;
90  }
91 
92  private:
// Declaration only; the definition is not part of this listing.
// NOTE(review): from the name and parameters this presumably writes
// `init_vals` into `groups_buffer` for `groups_buffer_entry_count` entries,
// with `warp_size` affecting the layout - confirm against the definition.
93  void initGroups(const QueryMemoryDescriptor& query_mem_desc,
94  int64_t* groups_buffer,
95  const std::vector<int64_t>& init_vals,
96  const int32_t groups_buffer_entry_count,
97  const size_t warp_size,
98  const Executor* executor);
99 
// Declaration only; the definition is not part of this listing.
// Takes the same buffer and initial values as initGroups but no entry count
// or warp size.
// NOTE(review): presumably the counterpart of initGroups for a columnar
// buffer layout - confirm against the definition.
100  void initColumnarGroups(const QueryMemoryDescriptor& query_mem_desc,
101  int64_t* groups_buffer,
102  const std::vector<int64_t>& init_vals,
103  const Executor* executor);
104 
// Declaration only; the definition is not part of this listing.
// `bitmap_sizes` has the same type as the value returned by
// allocateCountDistinctBuffers.
// NOTE(review): presumably initializes the columns of the single row at
// `row_ptr` (entry `bin`) from `init_vals` - confirm against the definition.
105  void initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
106  int8_t* row_ptr,
107  const size_t bin,
108  const std::vector<int64_t>& init_vals,
109  const std::vector<ssize_t>& bitmap_sizes);
110 
// Declaration only; the definition is not part of this listing.
// NOTE(review): presumably reserves the device-side memory used by
// count-distinct aggregates described in `query_mem_desc` - confirm.
111  void allocateCountDistinctGpuMem(const QueryMemoryDescriptor& query_mem_desc);
112 
// Declaration only; the definition is not part of this listing.
// Returns one `ssize_t` per element of a vector; the same vector type is
// accepted by initColumnPerRow as `bitmap_sizes`.
// NOTE(review): the meaning of the values and of `deferred` cannot be read
// from this header - confirm against the definition.
113  std::vector<ssize_t> allocateCountDistinctBuffers(
114  const QueryMemoryDescriptor& query_mem_desc,
115  const bool deferred,
116  const Executor* executor);
117 
// Declaration only; the definition is not part of this listing.
// Takes a size in bytes (`bitmap_byte_sz`) and returns an `int64_t`.
// NOTE(review): the return value is presumably the bitmap's address encoded
// as an integer - confirm against the definition.
118  int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz);
119 
// Declaration only; the definition is not part of this listing.
// NOTE(review): by analogy with allocateCountDistinctBitmap the returned
// `int64_t` presumably encodes the address of a newly created set - confirm.
120  int64_t allocateCountDistinctSet();
121 
122 #ifdef HAVE_CUDA
// Declaration only; available only when HAVE_CUDA is defined (see the
// surrounding #ifdef). The definition is not part of this listing.
// NOTE(review): presumably sets up device buffers holding top-`n` heaps for
// the given launch dimensions - confirm against the definition.
123  GpuGroupByBuffers prepareTopNHeapsDevBuffer(const QueryMemoryDescriptor& query_mem_desc,
124  const CUdeviceptr init_agg_vals_dev_ptr,
125  const size_t n,
126  const int device_id,
127  const unsigned block_size_x,
128  const unsigned grid_size_x);
129 
// Declaration only; available only when HAVE_CUDA is defined (see the
// surrounding #ifdef). The definition is not part of this listing.
// Returns a GpuGroupByBuffers value.
// NOTE(review): presumably allocates the group-by buffer on device
// `device_id` and initializes it; whether `render_allocator` may be null is
// not visible here - confirm against the definition.
130  GpuGroupByBuffers createAndInitializeGroupByBufferGpu(
131  const RelAlgExecutionUnit& ra_exe_unit,
132  const QueryMemoryDescriptor& query_mem_desc,
133  const CUdeviceptr init_agg_vals_dev_ptr,
134  const int device_id,
135  const ExecutorDispatchMode dispatch_mode,
136  const unsigned block_size_x,
137  const unsigned grid_size_x,
138  const int8_t warp_size,
139  const bool can_sort_on_gpu,
140  const bool output_columnar,
141  RenderAllocator* render_allocator);
142 #endif
143 
// Declaration only; the definition is not part of this listing.
// Const member returning a `size_t` count.
// NOTE(review): presumably the value stored in `num_buffers_` - confirm
// against the constructor.
144  size_t computeNumberOfBuffers(const QueryMemoryDescriptor& query_mem_desc,
145  const ExecutorDeviceType device_type,
146  const Executor* executor) const;
147 
// Declaration only; the definition is not part of this listing.
// NOTE(review): presumably compacts the host-side projection output down to
// `projection_count` rows - confirm against the definition.
148  void compactProjectionBuffersCpu(const QueryMemoryDescriptor& query_mem_desc,
149  const size_t projection_count);
// Declaration only; the definition is not part of this listing.
// Unlike compactProjectionBuffersCpu this overload is also given the data
// manager, the GPU group-by buffers and a device id.
// NOTE(review): presumably the device-side variant of the same compaction -
// confirm against the definition.
150  void compactProjectionBuffersGpu(const QueryMemoryDescriptor& query_mem_desc,
151  Data_Namespace::DataMgr* data_mgr,
152  const GpuGroupByBuffers& gpu_group_by_buffers,
153  const size_t projection_count,
154  const int device_id);
155 
157  const QueryMemoryDescriptor& query_mem_desc,
158  const size_t entry_count,
159  const GpuGroupByBuffers& gpu_group_by_buffers,
160  const RelAlgExecutionUnit& ra_exe_unit,
161  const unsigned block_size_x,
162  const unsigned grid_size_x,
163  const int device_id,
164  const bool prepend_index_buffer) const;
165 
// Declaration only; the definition is not part of this listing.
// NOTE(review): presumably post-processes host-side streaming top-N output
// for the query in `ra_exe_unit` - confirm against the definition.
166  void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor& query_mem_desc,
167  const RelAlgExecutionUnit& ra_exe_unit);
168 
170  const QueryMemoryDescriptor& query_mem_desc,
171  const GpuGroupByBuffers& gpu_group_by_buffers,
172  const RelAlgExecutionUnit& ra_exe_unit,
173  const unsigned total_thread_count,
174  const int device_id);
175 
// Immutable after construction (const member).
// NOTE(review): presumably initialized from the constructor's `num_rows`
// argument - confirm against the constructor definition.
176  const int64_t num_rows_;
177 
// Shared handle to the row set memory owner passed to the constructor.
178  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner_;
// Owned result sets, addressed by index through getResultSet,
// getResultSetOwned and resetResultSet. Entries may be null once they have
// been moved out or reset.
179  std::vector<std::unique_ptr<ResultSet>> result_sets_;
180 
// Initial aggregate values; read by index through getAggInitValForIndex.
181  std::vector<int64_t> init_agg_vals_;
182 
// Buffer count fixed at construction (const member); returned by
// getNumBuffers.
183  const size_t num_buffers_;
// Raw pointers to the group-by buffers; exposed through getGroupByBuffersPtr
// and getGroupByBuffersSize.
// NOTE(review): ownership of the pointed-to memory is not visible in this
// header - confirm where the buffers are allocated and released.
184  std::vector<int64_t*> group_by_buffers_;
185 
190 
192 
193  friend class Executor; // Accesses result_sets_
194  friend class QueryExecutionContext;
195 };
#define CHECK_EQ(x, y)
Definition: Logger.h:195
void initGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
std::vector< ssize_t > allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
const auto getNumBuffers() const
void resetResultSet(const size_t index)
const int8_t const int64_t * num_rows
const int64_t const uint32_t const uint32_t const uint32_t const bool const int8_t warp_size
DeviceAllocator * device_allocator_
ExecutorDeviceType
QueryMemoryInitializer(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int64_t num_rows, const std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const Executor *executor)
const auto getCountDistinctBitmapBytes() const
unsigned long long CUdeviceptr
Definition: nocuda.h:27
const auto getCountDistinctBitmapPtr() const
const int64_t const uint32_t groups_buffer_entry_count
ResultSet * getResultSet(const size_t index) const
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
ExecutorDispatchMode
void compactProjectionBuffersGpu(const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
const auto getGroupByBuffersSize() const
std::vector< int64_t > init_agg_vals_
std::unique_ptr< ResultSet > getResultSetOwned(const size_t index)
void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
int64_t getAggInitValForIndex(const size_t index) const
const auto getCountDistinctHostPtr() const
void compactProjectionBuffersCpu(const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
std::vector< int64_t * > group_by_buffers_
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
#define CHECK_LT(x, y)
Definition: Logger.h:197
Abstract class for managing device memory allocations.
Descriptor for the result set buffer layout.
size_t computeNumberOfBuffers(const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
void applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
Basic constructors and methods of the row set interface.
const int64_t * init_vals
void copyGroupByBuffersFromGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
std::vector< std::unique_ptr< ResultSet > > result_sets_
void allocateCountDistinctGpuMem(const QueryMemoryDescriptor &query_mem_desc)
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
void initColumnPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const size_t bin, const std::vector< int64_t > &init_vals, const std::vector< ssize_t > &bitmap_sizes)
void sort_on_gpu(int64_t *val_buff, int32_t *key_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc)