template <typename T>
const int8_t* create_literal_buffer(T literal,
                                    const ExecutorDeviceType device_type,
                                    std::vector<std::unique_ptr<char[]>>& literals_owner,
                                    CudaAllocator* gpu_allocator) {
  // Literal arguments are padded out to an 8-byte slot regardless of sizeof(T).
  switch (device_type) {
    case ExecutorDeviceType::CPU: {
      literals_owner.emplace_back(std::make_unique<char[]>(sizeof(int64_t)));
      std::memcpy(literals_owner.back().get(), &literal, sizeof(T));
      return reinterpret_cast<const int8_t*>(literals_owner.back().get());
    }
    case ExecutorDeviceType::GPU: {
      const auto gpu_literal_buf_ptr = gpu_allocator->alloc(sizeof(int64_t));
      gpu_allocator->copyToDevice(
          gpu_literal_buf_ptr, reinterpret_cast<int8_t*>(&literal), sizeof(T));
      return gpu_literal_buf_ptr;
    }
  }
  // ...
}
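A minimal sketch of how this helper is presumably used for a literal argument in the execute() path below: the value is widened into an 8-byte slot so the generated table function code can treat every literal as a 64-bit word. The variable names here are illustrative only; note that std::make_unique<char[]>(sizeof(int64_t)) value-initializes the slot, so the unused high bytes are zero on the CPU path.

// Sketch only (illustrative names): forwarding a 32-bit literal on the CPU path.
std::vector<std::unique_ptr<char[]>> literals_owner;
std::vector<const int8_t*> col_buf_ptrs;
std::vector<int64_t> col_sizes;

int32_t multiplier = 2;  // hypothetical literal argument of the table function
col_buf_ptrs.push_back(create_literal_buffer(
    multiplier, ExecutorDeviceType::CPU, literals_owner, /*gpu_allocator=*/nullptr));
col_sizes.push_back(0);  // literals contribute no input rows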
size_t get_output_row_count(const TableFunctionExecutionUnit& exe_unit,
                            size_t input_element_count) {
  size_t allocated_output_row_count = 0;
  // ...
  allocated_output_row_count = /* ... */;
  // ...
  return allocated_output_row_count;
}
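The elided assignment presumably derives the allocation size from the table function's declared sizing mode, via getOutputRowSizeType() and output_buffer_size_param. A sketch under that assumption; the enumerator names are illustrative, not taken from the listing.

// Sketch only: assumed mapping from the sizing mode to the allocation size.
switch (exe_unit.table_func.getOutputRowSizeType()) {
  case table_functions::OutputBufferSizeType::kConstant:
    // Fixed output size declared by the table function itself.
    allocated_output_row_count = exe_unit.output_buffer_size_param;
    break;
  case table_functions::OutputBufferSizeType::kUserSpecifiedRowMultiplier:
    // Output scales with the input: multiplier * number of input elements.
    allocated_output_row_count =
        exe_unit.output_buffer_size_param * input_element_count;
    break;
  default:
    break;  // other sizing modes elided
}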
ResultSetPtr TableFunctionExecutionContext::execute(
    const TableFunctionExecutionUnit& exe_unit,
    const std::vector<InputTableInfo>& table_infos,
    const TableFunctionCompilationContext* compilation_context,
    const ColumnFetcher& column_fetcher,
    const ExecutorDeviceType device_type,
    Executor* executor) {
  CHECK(compilation_context);
  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
  std::vector<std::unique_ptr<char[]>> literals_owner;

  const int device_id = 0;
  std::unique_ptr<CudaAllocator> device_allocator;
  // ...
  auto& data_mgr = executor->catalog_->getDataMgr();
  device_allocator.reset(new CudaAllocator(&data_mgr, device_id));
  // ...
  std::vector<const int8_t*> col_buf_ptrs;
  std::vector<int64_t> col_sizes;
  std::optional<size_t> output_column_size;
  // ...
  std::vector<std::vector<const int8_t*>> col_list_bufs;
  for (const auto& input_expr : exe_unit.input_exprs) {
    auto ti = input_expr->get_type_info();
    if (!ti.is_column_list()) {
      // ...
    }
    if (auto col_var = dynamic_cast<Analyzer::ColumnVar*>(input_expr)) {
      auto table_id = col_var->get_table_id();
      auto table_info_it = std::find_if(
          table_infos.begin(), table_infos.end(), [&table_id](const auto& table_info) {
            return table_info.table_id == table_id;
          });
      CHECK(table_info_it != table_infos.end());
      // Fetch the column buffer and its element count for the first fragment.
      auto [col_buf, buf_elem_count] = ColumnFetcher::getOneColumnFragment(
          /* ... */,
          table_info_it->info.fragments.front(),
          /* ... */,
          device_allocator.get(),
          /* ... */);
      // The size of the first input column determines the output column size.
      if (!output_column_size) {
        output_column_size = (buf_elem_count ? buf_elem_count : 1);
      }
      if (ti.is_column_list()) {
        if (col_index == -1) {
          // First column of a new column list.
          col_list_bufs.push_back({});
          col_list_bufs.back().reserve(ti.get_dimension());
        } else {
          // Every column in the list must have the same element count.
          CHECK_EQ(col_sizes.back(), buf_elem_count);
        }
        // ...
        col_list_bufs.back().push_back(col_buf);
        if (col_index + 1 == ti.get_dimension()) {
          // ...
        }
        // Columns in the same column list all point at the list's buffer array.
        col_buf_ptrs.push_back((const int8_t*)col_list_bufs.back().data());
      } else {
        col_buf_ptrs.push_back(col_buf);
      }
      col_sizes.push_back(buf_elem_count);
    } else if (const auto& constant_val =
                   dynamic_cast<Analyzer::Constant*>(input_expr)) {
      // Literal arguments contribute no input rows.
      col_sizes.push_back(0);
      const auto const_val_datum = constant_val->get_constval();
      const auto& ti = constant_val->get_type_info();
      if (ti.is_fp()) {  // assumed: the first elided branch handles floating-point literals
        // Elided: a switch on the literal's bit width forwards the matching
        // member of const_val_datum, e.g.
        //   col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.doubleval,
        //                                                device_type,
        //                                                literals_owner,
        //                                                device_allocator.get()));
        // ...
      } else if (ti.is_integer()) {
        // Elided: the same pattern for 8-, 16-, 32- and 64-bit integer literals,
        // each call again ending with device_allocator.get()).
        // ...
      } else {
        throw std::runtime_error("Literal value " + constant_val->toString() +
                                 " is not yet supported.");
      }
    }
    // ...
  }
  CHECK(output_column_size);
  switch (device_type) {
    // ...
  }
}
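The elided switch cases presumably hand off to the two launch helpers defined below. A sketch under that assumption; in particular, passing *output_column_size as the element count is an inference from the surrounding code, not something the listing shows.

// Sketch only: assumed dispatch from execute() to the per-device launch paths.
switch (device_type) {
  case ExecutorDeviceType::CPU:
    return launchCpuCode(exe_unit,
                         compilation_context,
                         col_buf_ptrs,
                         col_sizes,
                         *output_column_size,
                         executor);
  case ExecutorDeviceType::GPU:
    return launchGpuCode(exe_unit,
                         compilation_context,
                         col_buf_ptrs,
                         col_sizes,
                         *output_column_size,
                         device_id,
                         executor);
}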
ResultSetPtr TableFunctionExecutionContext::launchCpuCode(
    const TableFunctionExecutionUnit& exe_unit,
    const TableFunctionCompilationContext* compilation_context,
    std::vector<const int8_t*>& col_buf_ptrs,
    std::vector<int64_t>& col_sizes,
    const size_t elem_count,
    Executor* executor) {
  // The compiled code consumes the input columns through a flat array of
  // buffer pointers (the "byte stream").
  const auto byte_stream_ptr = reinterpret_cast<const int8_t**>(col_buf_ptrs.data());
  CHECK(byte_stream_ptr);
  // ...
  for (size_t i = 0; i < num_out_columns; i++) {
    // ...
  }
  // ...
  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
      /* ... */
      allocated_output_row_count,
      std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
      std::vector<std::vector<uint64_t>>{{0}},
      /* ... */);

  // Seed the output row count with the allocation size; the table function
  // reports the number of rows it actually produced through this variable.
  int64_t output_row_count = allocated_output_row_count;
  auto group_by_buffers_ptr = query_buffers->getGroupByBuffersPtr();
  CHECK(group_by_buffers_ptr);
  // Each output column occupies allocated_output_row_count 8-byte slots in the
  // columnar output buffer.
  auto output_buffers_ptr = reinterpret_cast<int64_t*>(group_by_buffers_ptr[0]);

  std::vector<int64_t*> output_col_buf_ptrs;
  for (size_t i = 0; i < num_out_columns; i++) {
    output_col_buf_ptrs.emplace_back(output_buffers_ptr + i * allocated_output_row_count);
  }
  // Invoke the compiled table function entry point.
  const auto err = compilation_context->getFuncPtr()(
      byte_stream_ptr, col_sizes.data(), output_col_buf_ptrs.data(), &output_row_count);
  if (err) {
    throw std::runtime_error("Error executing table function: " + std::to_string(err));
  }
  if (exe_unit.table_func.hasNonUserSpecifiedOutputSizeConstant()) {
    if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
      throw std::runtime_error(
          "Table function with constant sizing parameter must return " +
          /* ...remainder of the message elided... */);
    }
  } else {
    if (output_row_count < 0 ||
        static_cast<size_t>(output_row_count) > allocated_output_row_count) {
      output_row_count = allocated_output_row_count;
    }
  }
  // The entry count can be smaller than the allocated buffer size.
  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
  // Only output_row_count of the allocated_output_row_count slots per column
  // hold valid data, so each column's valid prefix is moved toward the front
  // of the buffer.
  const size_t column_size = output_row_count * sizeof(int64_t);
  const size_t allocated_column_size = allocated_output_row_count * sizeof(int64_t);

  int8_t* src = reinterpret_cast<int8_t*>(output_buffers_ptr);
  int8_t* dst = reinterpret_cast<int8_t*>(output_buffers_ptr);
  for (size_t i = 0; i < num_out_columns; i++) {
    // ...
    auto t = memmove(dst, src, column_size);
    // ...
    src += allocated_column_size;
    // ...
  }

  return query_buffers->getResultSetOwned(0);
}
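Judging from the call through compilation_context->getFuncPtr() above, the compiled entry point takes the input byte stream, the per-column input sizes, the output column pointers, and an in/out row count, and returns an error code. A sketch of what that function-pointer type plausibly looks like; the alias name and exact parameter types are assumptions, not taken from the listing.

// Sketch only: assumed shape of the pointer returned by getFuncPtr(), inferred
// from the four arguments passed at the call site above.
using TableFunctionEntryPoint =
    int32_t (*)(const int8_t** input_col_buffers,  // byte_stream_ptr
                const int64_t* input_row_counts,   // col_sizes.data()
                int64_t** output_col_buffers,      // output_col_buf_ptrs.data()
                int64_t* output_row_count);        // in: allocated, out: produced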
ResultSetPtr TableFunctionExecutionContext::launchGpuCode(
    const TableFunctionExecutionUnit& exe_unit,
    const TableFunctionCompilationContext* compilation_context,
    std::vector<const int8_t*>& col_buf_ptrs,
    std::vector<int64_t>& col_sizes,
    const size_t elem_count,
    const int device_id,
    Executor* executor) {
  // ...
  auto& data_mgr = executor->catalog_->getDataMgr();
  auto gpu_allocator = std::make_unique<CudaAllocator>(&data_mgr, device_id);
  CHECK(gpu_allocator);
  // ...
  // Copy the host-side array of column buffer pointers to the device.
  auto byte_stream_ptr = gpu_allocator->alloc(col_buf_ptrs.size() * sizeof(int64_t));
  gpu_allocator->copyToDevice(byte_stream_ptr,
                              reinterpret_cast<int8_t*>(col_buf_ptrs.data()),
                              col_buf_ptrs.size() * sizeof(int64_t));
  // ...
  // Copy the per-column element counts to the device.
  auto col_sizes_ptr = gpu_allocator->alloc(col_sizes.size() * sizeof(int64_t));
  gpu_allocator->copyToDevice(col_sizes_ptr,
                              reinterpret_cast<int8_t*>(col_sizes.data()),
                              col_sizes.size() * sizeof(int64_t));
  // ... (destination elided)
  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int32_t)));
  // ...
  for (size_t i = 0; i < num_out_columns; i++) {
    // ...
  }
  // ...
  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
      /* ... */
      allocated_output_row_count,
      std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
      std::vector<std::vector<uint64_t>>{{0}},
      /* ... */);
  // ...
  // The row count is passed to the kernel through a device buffer so the table
  // function can write back the number of rows it actually produced.
  int64_t output_row_count = allocated_output_row_count;
  kernel_params[OUTPUT_ROW_COUNT] =
      reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int64_t*)));
  gpu_allocator->copyToDevice(reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
                              reinterpret_cast<int8_t*>(&output_row_count),
                              sizeof(output_row_count));
  const unsigned block_size_x = 1;
  const unsigned block_size_y = 1;
  const unsigned block_size_z = 1;

  const unsigned grid_size_x = 1;
  const unsigned grid_size_y = 1;
  const unsigned grid_size_z = 1;

  auto gpu_output_buffers = query_buffers->setupTableFunctionGpuBuffers(
      query_mem_desc, device_id, block_size_x, grid_size_x);
  // The CUDA driver launch API expects an array of pointers to the individual
  // kernel parameters.
  std::vector<void*> param_ptrs;
  for (auto& param : kernel_params) {
    param_ptrs.push_back(&param);
  }
  // ...
  const auto gpu_context = compilation_context->getGpuCode();
  // ...
  const auto native_code = gpu_context->getNativeCode(device_id);
  auto cu_func = static_cast<CUfunction>(native_code.first);
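The launch itself falls in an elided stretch of the listing. Given the CUfunction handle, the 1x1x1 grid/block constants, and the param_ptrs array built above, it presumably goes through the CUDA driver API roughly as sketched below; the checkCudaErrors wrapper, the zero shared-memory size, and the default stream are assumptions.

// Sketch only: assumed launch of the compiled table function kernel.
checkCudaErrors(cuLaunchKernel(cu_func,
                               grid_size_x,
                               grid_size_y,
                               grid_size_z,
                               block_size_x,
                               block_size_y,
                               block_size_z,
                               /*sharedMemBytes=*/0,
                               /*hStream=*/nullptr,
                               param_ptrs.data(),
                               /*extra=*/nullptr));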
  // Read the actual output row count back from the device.
  gpu_allocator->copyFromDevice(
      reinterpret_cast<int8_t*>(&output_row_count),
      reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
      /* ... */);
  if (exe_unit.table_func.hasNonUserSpecifiedOutputSizeConstant()) {
    if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
      throw std::runtime_error(
          "Table function with constant sizing parameter must return " +
          /* ...remainder of the message elided... */);
    }
  } else {
    if (output_row_count < 0 ||
        static_cast<size_t>(output_row_count) > allocated_output_row_count) {
      output_row_count = allocated_output_row_count;
    }
  }
  // ...
  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
  // ...
  // Copy the columnar output from the GPU buffers back into host storage.
  query_buffers->copyFromTableFunctionGpuBuffers(&data_mgr,
                                                 /* ... */);
  // ...
  return query_buffers->getResultSetOwned(0);
}
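For orientation, a sketch of how a caller would presumably drive this path for one device type; everything here is assumed to come from the regular query pipeline, and exe_context stands for an already-constructed TableFunctionExecutionContext.

// Sketch only: invoking the table function execution path.
ResultSetPtr rows = exe_context.execute(exe_unit,
                                        table_infos,
                                        compilation_context,
                                        column_fetcher,
                                        ExecutorDeviceType::GPU,  // or CPU
                                        executor);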