template <typename T>
const int8_t* create_literal_buffer(T literal,
                                    const ExecutorDeviceType device_type,
                                    std::vector<std::unique_ptr<char[]>>& literals_owner,
                                    CudaAllocator* gpu_allocator) {
  switch (device_type) {
    case ExecutorDeviceType::CPU: {
      // Host path: take ownership of an 8-byte buffer and copy the literal in.
      literals_owner.emplace_back(std::make_unique<char[]>(sizeof(int64_t)));
      std::memcpy(literals_owner.back().get(), &literal, sizeof(T));
      return reinterpret_cast<const int8_t*>(literals_owner.back().get());
    }
    case ExecutorDeviceType::GPU: {
      // Device path: allocate an 8-byte device buffer and copy the literal up.
      CHECK(gpu_allocator);
      const auto gpu_literal_buf_ptr = gpu_allocator->alloc(sizeof(int64_t));
      gpu_allocator->copyToDevice(
          gpu_literal_buf_ptr, reinterpret_cast<int8_t*>(&literal), sizeof(T));
      return gpu_literal_buf_ptr;
    }
  }
  UNREACHABLE();
  return nullptr;
}
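// Usage sketch (illustrative, not part of this file): materializing a double
// literal for the CPU path. Variable names here are hypothetical.
//
//   std::vector<std::unique_ptr<char[]>> owned_literals;
//   const int8_t* lit_buf = create_literal_buffer(
//       3.14, ExecutorDeviceType::CPU, owned_literals, /*gpu_allocator=*/nullptr);
//   // lit_buf points into an 8-byte host buffer that stays alive as long as
//   // owned_literals does.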
size_t get_output_row_count(const TableFunctionExecutionUnit& exe_unit,
                            size_t input_element_count) {
  size_t allocated_output_row_count = 0;
  // ... switch on exe_unit.table_func.getOutputRowSizeType(): for the
  // user-specified row-multiplier case, the output buffer is sized as the
  // sizing parameter times the input element count ...
  allocated_output_row_count =
      exe_unit.output_buffer_size_param * input_element_count;
  return allocated_output_row_count;
}
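// Worked example (illustrative): a table function declared with a row
// multiplier of 2 over an input fragment of 1,000 elements gets output
// buffers allocated for 2 * 1,000 = 2,000 rows; the function may still
// report fewer rows at runtime.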
ResultSetPtr TableFunctionExecutionContext::execute(
    const TableFunctionExecutionUnit& exe_unit,
    const std::vector<InputTableInfo>& table_infos,
    const TableFunctionCompilationContext* compilation_context,
    const ColumnFetcher& column_fetcher,
    const ExecutorDeviceType device_type,
    Executor* executor) {
  CHECK(compilation_context);
  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
  std::vector<std::unique_ptr<char[]>> literals_owner;

  const int device_id = 0;  // single-device execution
  std::unique_ptr<CudaAllocator> device_allocator;
  if (device_type == ExecutorDeviceType::GPU) {
    auto& data_mgr = executor->catalog_->getDataMgr();
    device_allocator.reset(new CudaAllocator(&data_mgr, device_id));
  }
  std::vector<const int8_t*> col_buf_ptrs;
  std::vector<int64_t> col_sizes;
  std::optional<size_t> output_column_size;
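  // Each input expression becomes one entry in col_buf_ptrs / col_sizes:
  // column inputs contribute a fragment buffer plus its element count, while
  // literal inputs contribute an owned 8-byte buffer with a recorded size of 0.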
  for (const auto& input_expr : exe_unit.input_exprs) {
    if (auto col_var = dynamic_cast<Analyzer::ColumnVar*>(input_expr)) {
      // Column input: fetch one fragment's buffer on the target device.
      auto table_id = col_var->get_table_id();
      auto table_info_it = std::find_if(
          table_infos.begin(), table_infos.end(), [&table_id](const auto& table_info) {
            return table_info.table_id == table_id;
          });
      CHECK(table_info_it != table_infos.end());
      auto [col_buf, buf_elem_count] = ColumnFetcher::getOneColumnFragment(
          executor,
          *col_var,
          table_info_it->info.fragments.front(),
          device_type == ExecutorDeviceType::CPU
              ? Data_Namespace::MemoryLevel::CPU_LEVEL
              : Data_Namespace::MemoryLevel::GPU_LEVEL,
          device_id,
          device_allocator.get(),
          chunks_owner,
          column_fetcher.columnarized_table_cache_);
      // The element count of the first column drives the output sizing.
      if (!output_column_size) {
        output_column_size = buf_elem_count;
      }
      col_sizes.push_back(buf_elem_count);
      col_buf_ptrs.push_back(col_buf);
    } else if (const auto& constant_val =
                   dynamic_cast<Analyzer::Constant*>(input_expr)) {
      // Literal input: passed as a zero-sized "column" backed by an owned
      // 8-byte buffer (see create_literal_buffer() above).
      col_sizes.push_back(0);
      const auto const_val_datum = constant_val->get_constval();
      const auto& ti = constant_val->get_type_info();
      if (ti.is_fp()) {
        switch (get_bit_width(ti)) {
          case 32:
            col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.floatval,
                                                         device_type,
                                                         literals_owner,
                                                         device_allocator.get()));
            break;
          case 64:
            col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.doubleval,
                                                         device_type,
                                                         literals_owner,
                                                         device_allocator.get()));
            break;
          default:
            UNREACHABLE();
        }
      } else if (ti.is_integer()) {
        // ... the 8-, 16-, 32-, and 64-bit cases follow the same pattern,
        // passing the matching Datum member (tinyintval, smallintval, intval,
        // bigintval) to create_literal_buffer(..., device_type,
        // literals_owner, device_allocator.get()) ...
      } else {
        throw std::runtime_error("Literal value " + constant_val->toString() +
                                 " is not yet supported.");
      }
    }
  }
  CHECK(output_column_size);

  switch (device_type) {
    case ExecutorDeviceType::CPU:
      return launchCpuCode(exe_unit, compilation_context, col_buf_ptrs,
                           col_sizes, *output_column_size, executor);
    case ExecutorDeviceType::GPU:
      return launchGpuCode(exe_unit, compilation_context, col_buf_ptrs,
                           col_sizes, *output_column_size, device_id, executor);
  }
  UNREACHABLE();
  return nullptr;
}
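// For reference: the getFuncPtr() call in launchCpuCode below implies the
// compiled CPU entry point has roughly this shape. This is a sketch inferred
// from the call site; the authoritative typedef is
// TableFunctionCompilationContext::FuncPtr.
//
//   using FuncPtr = int32_t (*)(const int8_t** input_cols,
//                               const int64_t* input_row_counts,
//                               int64_t** output_cols,
//                               int64_t* output_row_count);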
ResultSetPtr TableFunctionExecutionContext::launchCpuCode(
    const TableFunctionExecutionUnit& exe_unit,
    const TableFunctionCompilationContext* compilation_context,
    std::vector<const int8_t*>& col_buf_ptrs,
    std::vector<int64_t>& col_sizes,
    const size_t elem_count,
    Executor* executor) {
  // Set up the inputs: the compiled table function consumes the column
  // buffers as one contiguous array of byte-stream pointers.
  const auto byte_stream_ptr = reinterpret_cast<const int8_t**>(col_buf_ptrs.data());
  CHECK(byte_stream_ptr);

  // Set up the outputs: a columnar projection buffer with one 8-byte slot per
  // output column (query_mem_desc construction elided; see setOutputColumnar()
  // and addColSlotInfo()).
  const auto allocated_output_row_count = get_output_row_count(exe_unit, elem_count);
  const size_t num_out_columns = exe_unit.target_exprs.size();
  for (size_t i = 0; i < num_out_columns; i++) {
    query_mem_desc.addColSlotInfo({std::make_tuple(8, 8)});
  }

  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
      // ... execution unit, descriptor, and device arguments elided ...
      allocated_output_row_count,
      std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
      std::vector<std::vector<uint64_t>>{{0}},
      // ... remaining arguments elided ...
      );

  // The function reports its actual row count through this out-parameter.
  int64_t output_row_count = allocated_output_row_count;
  auto group_by_buffers_ptr = query_buffers->getGroupByBuffersPtr();
  CHECK(group_by_buffers_ptr);

  // Lay out per-column output pointers over the single columnar buffer.
  auto output_buffers_ptr = reinterpret_cast<int64_t*>(group_by_buffers_ptr[0]);
  std::vector<int64_t*> output_col_buf_ptrs;
  for (size_t i = 0; i < num_out_columns; i++) {
    output_col_buf_ptrs.emplace_back(output_buffers_ptr +
                                     i * allocated_output_row_count);
  }

  // Execute the compiled table function.
  const auto err = compilation_context->getFuncPtr()(
      byte_stream_ptr, col_sizes.data(), output_col_buf_ptrs.data(),
      &output_row_count);
  if (err) {
    throw std::runtime_error("Error executing table function: " +
                             std::to_string(err));
  }

  if (exe_unit.table_func.hasNonUserSpecifiedOutputSizeConstant()) {
    if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
      throw std::runtime_error(
          "Table function with constant sizing parameter must return " +
          std::to_string(allocated_output_row_count) + " rows");
    }
  } else if (output_row_count < 0 ||
             static_cast<size_t>(output_row_count) > allocated_output_row_count) {
    output_row_count = allocated_output_row_count;
  }

  // Update the entry count before compacting the output buffers.
  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);

  // Columns were allocated at the maximum row count; slide each column down
  // so the columns are densely packed at the actual row count.
  const size_t column_size = output_row_count * sizeof(int64_t);
  const size_t allocated_column_size = allocated_output_row_count * sizeof(int64_t);
  int8_t* src = reinterpret_cast<int8_t*>(output_buffers_ptr);
  int8_t* dst = reinterpret_cast<int8_t*>(output_buffers_ptr);
  for (size_t i = 0; i < num_out_columns; i++) {
    if (src != dst) {
      std::memmove(dst, src, column_size);
    }
    src += allocated_column_size;
    dst += column_size;
  }

  return query_buffers->getResultSetOwned(0);
}
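// For reference: launchGpuCode below marshals the same logical arguments
// through device pointers collected in kernel_params. A hypothetical
// device-side signature consistent with those slots (the real kernel
// signature is produced during compilation, and the slot order here is a
// guess):
//
//   __global__ void table_func_kernel(int32_t* error_out,
//                                     const int8_t** input_cols,
//                                     const int64_t* input_row_counts,
//                                     int64_t** output_cols,
//                                     int64_t* output_row_count);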
ResultSetPtr TableFunctionExecutionContext::launchGpuCode(
    const TableFunctionExecutionUnit& exe_unit,
    const TableFunctionCompilationContext* compilation_context,
    std::vector<const int8_t*>& col_buf_ptrs,
    std::vector<int64_t>& col_sizes,
    const size_t elem_count,
    const int device_id,
    Executor* executor) {
  auto& data_mgr = executor->catalog_->getDataMgr();
  auto gpu_allocator = std::make_unique<CudaAllocator>(&data_mgr, device_id);
  CHECK(gpu_allocator);

  // Copy the input column pointers to the device.
  auto byte_stream_ptr = gpu_allocator->alloc(col_buf_ptrs.size() * sizeof(int64_t));
  gpu_allocator->copyToDevice(byte_stream_ptr,
                              reinterpret_cast<int8_t*>(col_buf_ptrs.data()),
                              col_buf_ptrs.size() * sizeof(int64_t));

  // Copy the input column sizes to the device.
  auto col_sizes_ptr = gpu_allocator->alloc(col_sizes.size() * sizeof(int64_t));
  gpu_allocator->copyToDevice(col_sizes_ptr,
                              reinterpret_cast<int8_t*>(col_sizes.data()),
                              col_sizes.size() * sizeof(int64_t));

  // Device buffer for the kernel's int32 error code; the kernel_params array
  // declaration is elided here, and ERROR_CODE is an illustrative slot name.
  kernel_params[ERROR_CODE] =
      reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int32_t)));

  // One 8-byte columnar slot per output column (descriptor setup elided; see
  // setOutputColumnar() and addColSlotInfo()).
  const auto allocated_output_row_count = get_output_row_count(exe_unit, elem_count);
  const size_t num_out_columns = exe_unit.target_exprs.size();
  for (size_t i = 0; i < num_out_columns; i++) {
    query_mem_desc.addColSlotInfo({std::make_tuple(8, 8)});
  }

  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
      // ... execution unit, descriptor, and device arguments elided ...
      allocated_output_row_count,
      std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
      std::vector<std::vector<uint64_t>>{{0}},
      // ... remaining arguments elided ...
      );

  // The row count is passed through device memory so the kernel can both read
  // the allocated size and write back the actual size.
  int64_t output_row_count = allocated_output_row_count;
  kernel_params[OUTPUT_ROW_COUNT] =
      reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int64_t*)));
  gpu_allocator->copyToDevice(
      reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
      reinterpret_cast<int8_t*>(&output_row_count),
      sizeof(output_row_count));

  // Launch configuration: table functions currently run on a single thread of
  // a single block.
  const unsigned block_size_x = 1;
  const unsigned block_size_y = 1;
  const unsigned block_size_z = 1;
  const unsigned grid_size_x = 1;
  const unsigned grid_size_y = 1;
  const unsigned grid_size_z = 1;

  auto gpu_output_buffers = query_buffers->setupTableFunctionGpuBuffers(
      query_mem_desc, device_id, block_size_x, grid_size_x);

  // cuLaunchKernel expects an array of host pointers to the parameter values.
  std::vector<void*> param_ptrs;
  for (auto& param : kernel_params) {
    param_ptrs.push_back(&param);
  }

  // Launch the compiled kernel.
  const auto gpu_context = compilation_context->getGpuCode();
  CHECK(gpu_context);
  const auto native_code = gpu_context->getNativeCode(device_id);
  auto cu_func = static_cast<CUfunction>(native_code.first);
  checkCudaErrors(cuLaunchKernel(cu_func,
                                 grid_size_x,
                                 grid_size_y,
                                 grid_size_z,
                                 block_size_x,
                                 block_size_y,
                                 block_size_z,
                                 0,        // shared memory bytes
                                 nullptr,  // default stream
                                 &param_ptrs[0],
                                 nullptr));

  // Read back the row count the kernel actually produced.
  gpu_allocator->copyFromDevice(
      reinterpret_cast<int8_t*>(&output_row_count),
      reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
      sizeof(output_row_count));
  if (exe_unit.table_func.hasNonUserSpecifiedOutputSizeConstant()) {
    if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
      throw std::runtime_error(
          "Table function with constant sizing parameter must return " +
          std::to_string(allocated_output_row_count) + " rows");
    }
  } else if (output_row_count < 0 ||
             static_cast<size_t>(output_row_count) > allocated_output_row_count) {
    output_row_count = allocated_output_row_count;
  }

  // Update the entry count, then copy the results back from the device
  // buffers into the result set.
  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
  query_buffers->copyFromTableFunctionGpuBuffers(&data_mgr,
                                                 // ... remaining arguments elided ...
                                                 );

  return query_buffers->getResultSetOwned(0);
}
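// End-to-end usage sketch (hypothetical caller, not part of this file):
// executing a compiled table function on CPU. Names other than execute()'s
// parameters are assumptions.
//
//   auto result = table_fn_exe_context.execute(exe_unit,
//                                              table_infos,
//                                              compilation_context,
//                                              column_fetcher,
//                                              ExecutorDeviceType::CPU,
//                                              executor);
//   // result->rowCount() reflects the row count the table function produced.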