OmniSciDB  72c90bc290
TableFunctionExecutionContext Class Reference

#include <TableFunctionExecutionContext.h>

Public Member Functions

 TableFunctionExecutionContext (std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner)
 
 TableFunctionExecutionContext (const TableFunctionExecutionContext &)=delete
 
TableFunctionExecutionContext & operator= (const TableFunctionExecutionContext &)=delete
 
ResultSetPtr execute (const TableFunctionExecutionUnit &exe_unit, const std::vector< InputTableInfo > &table_infos, const std::shared_ptr< CompilationContext > &compilation_context, const ColumnFetcher &column_fetcher, const ExecutorDeviceType device_type, Executor *executor, bool is_pre_launch_udtf)
 

Private Member Functions

void launchPreCodeOnCpu (const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, Executor *executor)
 
ResultSetPtr launchCpuCode (const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, Executor *executor)
 
ResultSetPtr launchGpuCode (const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< GpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, const int device_id, Executor *executor)
 

Private Attributes

std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 

Detailed Description

Definition at line 27 of file TableFunctionExecutionContext.h.
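
A minimal usage sketch follows, assuming the execution unit, input table infos, compilation context, column fetcher and executor have already been prepared by the caller, as Executor::executeTableFunction() (the only caller of execute()) does. The wrapper name run_table_function is purely illustrative and not part of the class.

#include "TableFunctionExecutionContext.h"  // other Executor/ColumnFetcher headers assumed

ResultSetPtr run_table_function(const TableFunctionExecutionUnit& exe_unit,
                                const std::vector<InputTableInfo>& table_infos,
                                const std::shared_ptr<CompilationContext>& compilation_context,
                                const ColumnFetcher& column_fetcher,
                                const ExecutorDeviceType device_type,
                                Executor* executor) {
  // The context only needs the executor's RowSetMemoryOwner.
  TableFunctionExecutionContext exe_context(executor->getRowSetMemoryOwner());

  // Pre-flight pass (only meaningful for UDTFs with a pre-flight function):
  // runs launchPreCodeOnCpu() internally and always returns nullptr.
  // exe_context.execute(..., /*is_pre_launch_udtf=*/true);

  // Main launch: dispatches to launchCpuCode() or launchGpuCode() and
  // returns the table function's ResultSet.
  return exe_context.execute(exe_unit,
                             table_infos,
                             compilation_context,
                             column_fetcher,
                             device_type,
                             executor,
                             /*is_pre_launch_udtf=*/false);
}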

Constructor & Destructor Documentation

TableFunctionExecutionContext::TableFunctionExecutionContext ( std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner )
inline

Definition at line 29 of file TableFunctionExecutionContext.h.

29  TableFunctionExecutionContext(std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
30  : row_set_mem_owner_(row_set_mem_owner) {}
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
TableFunctionExecutionContext::TableFunctionExecutionContext ( const TableFunctionExecutionContext &  )
delete

Member Function Documentation

ResultSetPtr TableFunctionExecutionContext::execute ( const TableFunctionExecutionUnit &  exe_unit,
const std::vector< InputTableInfo > &  table_infos,
const std::shared_ptr< CompilationContext > &  compilation_context,
const ColumnFetcher &  column_fetcher,
const ExecutorDeviceType  device_type,
Executor *  executor,
bool  is_pre_launch_udtf 
)

Definition at line 104 of file TableFunctionExecutionContext.cpp.

References anonymous_namespace{TableFunctionExecutionContext.cpp}::append_literal_buffer(), CHECK, CHECK_EQ, ColumnFetcher::columnarized_table_cache_, table_functions::TableFunction::containsPreFlightFn(), CPU, Data_Namespace::CPU_LEVEL, DEBUG_TIMER, get_bit_width(), ColumnFetcher::getOneColumnFragment(), getQueryEngineCudaStreamForDevice(), GPU, Data_Namespace::GPU_LEVEL, table_functions::TableFunction::hasOutputSizeIndependentOfInputSize(), TableFunctionExecutionUnit::input_exprs, is_null(), launchCpuCode(), launchGpuCode(), launchPreCodeOnCpu(), TableFunctionExecutionUnit::table_func, TableFunctionExecutionUnit::target_exprs, and UNREACHABLE.

Referenced by Executor::executeTableFunction().

111  {
112  auto timer = DEBUG_TIMER(__func__);
113  CHECK(compilation_context);
114  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
115  std::vector<std::unique_ptr<char[]>> literals_owner;
116 
117  const int device_id = 0; // TODO(adb): support multi-gpu table functions
118  std::unique_ptr<CudaAllocator> device_allocator;
119  if (device_type == ExecutorDeviceType::GPU) {
120  auto data_mgr = executor->getDataMgr();
121  device_allocator.reset(new CudaAllocator(
122  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id)));
123  }
124  std::vector<const int8_t*> col_buf_ptrs;
125  std::vector<int64_t> col_sizes;
126  std::vector<const int8_t*> input_str_dict_proxy_ptrs;
127  std::optional<size_t> input_num_rows;
128 
129  int col_index = -1;
130  // TODO: col_list_bufs are allocated on CPU memory, so UDTFs with column_list
131  // arguments are not supported on GPU atm.
132  std::vector<std::vector<const int8_t*>> col_list_bufs;
133  std::vector<std::vector<const int8_t*>> input_col_list_str_dict_proxy_ptrs;
134  for (const auto& input_expr : exe_unit.input_exprs) {
135  auto ti = input_expr->get_type_info();
136  if (!ti.is_column_list()) {
137  CHECK_EQ(col_index, -1);
138  }
139  if (auto col_var = dynamic_cast<Analyzer::ColumnVar*>(input_expr)) {
140  CHECK(ti.is_column_list() || ti.is_column()) << "ti=" << ti;
141  const auto& table_key = col_var->getTableKey();
142  auto table_info_it = std::find_if(
143  table_infos.begin(), table_infos.end(), [&table_key](const auto& table_info) {
144  return table_info.table_key == table_key;
145  });
146  CHECK(table_info_it != table_infos.end());
147  auto [col_buf, buf_elem_count] = ColumnFetcher::getOneColumnFragment(
148  executor,
149  *col_var,
150  table_info_it->info.fragments.front(),
151  device_type == ExecutorDeviceType::CPU ? Data_Namespace::CPU_LEVEL
152  : Data_Namespace::GPU_LEVEL,
153  device_id,
154  device_allocator.get(),
155  /*thread_idx=*/0,
156  chunks_owner,
157  column_fetcher.columnarized_table_cache_);
158  // We use the number of entries in the first column to be the number of rows to base
159  // the output off of (optionally depending on the sizing parameter)
160  if (!input_num_rows) {
161  input_num_rows = (buf_elem_count > 0 ? buf_elem_count : 1);
162  }
163 
164  int8_t* input_str_dict_proxy_ptr = nullptr;
165  if (ti.is_subtype_dict_encoded_string()) {
166  const auto input_string_dictionary_proxy = executor->getStringDictionaryProxy(
167  ti.getStringDictKey(), executor->getRowSetMemoryOwner(), true);
168  input_str_dict_proxy_ptr =
169  reinterpret_cast<int8_t*>(input_string_dictionary_proxy);
170  }
171  if (ti.is_column_list()) {
172  if (col_index == -1) {
173  col_list_bufs.emplace_back();
174  input_col_list_str_dict_proxy_ptrs.emplace_back();
175  col_list_bufs.back().reserve(ti.get_dimension());
176  input_col_list_str_dict_proxy_ptrs.back().reserve(ti.get_dimension());
177  } else {
178  CHECK_EQ(col_sizes.back(), buf_elem_count);
179  }
180  col_index++;
181  col_list_bufs.back().push_back(col_buf);
182  input_col_list_str_dict_proxy_ptrs.back().push_back(input_str_dict_proxy_ptr);
183  // append col_buf to column_list col_buf
184  if (col_index + 1 == ti.get_dimension()) {
185  col_index = -1;
186  }
187  // columns in the same column_list point to column_list data
188  col_buf_ptrs.push_back((const int8_t*)col_list_bufs.back().data());
189  input_str_dict_proxy_ptrs.push_back(
190  (const int8_t*)input_col_list_str_dict_proxy_ptrs.back().data());
191  } else {
192  col_buf_ptrs.push_back(col_buf);
193  input_str_dict_proxy_ptrs.push_back(input_str_dict_proxy_ptr);
194  }
195  col_sizes.push_back(buf_elem_count);
196  } else {
197  // literals
198  col_sizes.push_back(0);
199  input_str_dict_proxy_ptrs.push_back(nullptr);
200  size_t literal_buffer_size = 0;
201  int8_t* cpu_literal_buf_ptr = nullptr;
202 
203  if (const auto& constant_val = dynamic_cast<Analyzer::Constant*>(input_expr)) {
204  // TODO(adb): Unify literal handling with rest of system, either in Codegen or as
205  // a separate serialization component
206  const auto const_val_datum = constant_val->get_constval();
207  const auto& ti = constant_val->get_type_info();
208  if (ti.is_text_encoding_none()) {
209  // clang-format off
210  /*
211  Literal string is encoded in a contiguous buffer with the
212  following memory layout:
213 
214  | <string size> | <string data> |
215  |<-- 8 bytes -->|<-- <string size> -->|
216  */
217  // clang-format on
218  literal_buffer_size =
219  sizeof(int64_t) + ((const_val_datum.stringval->size() + 7) / 8) * 8;
220  } else {
221  literal_buffer_size = ((get_bit_width(ti) / 8 + 7) / 8) * 8;
222  }
223  // literal_buffer_size is rounded up to the next multiple of 8
224  literals_owner.emplace_back(std::make_unique<char[]>(literal_buffer_size));
225  cpu_literal_buf_ptr = reinterpret_cast<int8_t*>(literals_owner.back().get());
226  append_literal_buffer(const_val_datum, ti, cpu_literal_buf_ptr, 0);
227  } else if (const auto& array_expr =
228  dynamic_cast<Analyzer::ArrayExpr*>(input_expr)) {
229  const auto& ti = input_expr->get_type_info().get_elem_type();
230  // clang-format off
231  /*
232  Literal array expression is encoded in a contiguous buffer
233  with the following memory layout:
234 
235  | <array size> | <array is_null> | <array data> |
236  |<-- 8 bytes ->|<-- 8 bytes ---->|<-- <array size> * <array element size> -->|
237  */
238  // clang-format on
239  int64_t size = array_expr->getElementCount();
240  int64_t is_null = (array_expr->isNull() ? 0xffffffffffffffff : 0);
241  const auto elem_size = get_bit_width(ti) / 8;
242  // literal_buffer_size is rounded up to the next multiple of 8
243  literal_buffer_size = 2 * sizeof(int64_t) + (((size + 7) / 8) * 8) * elem_size;
244  literals_owner.emplace_back(std::make_unique<char[]>(literal_buffer_size));
245  cpu_literal_buf_ptr = reinterpret_cast<int8_t*>(literals_owner.back().get());
246  std::memcpy(cpu_literal_buf_ptr, &size, sizeof(int64_t));
247  std::memcpy(cpu_literal_buf_ptr + sizeof(int64_t), &is_null, sizeof(int64_t));
248  for (int64_t i = 0; i < size; i++) {
249  if (const auto& constant_val =
250  dynamic_cast<const Analyzer::Constant*>(array_expr->getElement(i))) {
251  const auto const_val_datum = constant_val->get_constval();
252  append_literal_buffer(const_val_datum,
253  ti,
254  cpu_literal_buf_ptr,
255  sizeof(int64_t) * 2 + i * elem_size);
256  } else {
257  UNREACHABLE();
258  }
259  }
260  } else {
261  throw TableFunctionError("Unsupported expression as input to table function: " +
262  input_expr->toString() +
263  "\n Only literal constants and columns are supported!");
264  }
265  if (device_type == ExecutorDeviceType::GPU) {
266  auto* gpu_allocator = device_allocator.get();
267  const auto gpu_literal_buf_ptr = gpu_allocator->alloc(literal_buffer_size);
268  gpu_allocator->copyToDevice(
269  gpu_literal_buf_ptr, cpu_literal_buf_ptr, literal_buffer_size);
270  col_buf_ptrs.push_back(gpu_literal_buf_ptr);
271  } else {
272  CHECK_EQ(device_type, ExecutorDeviceType::CPU);
273  col_buf_ptrs.push_back(cpu_literal_buf_ptr);
274  }
275  }
276  }
277  CHECK_EQ(col_buf_ptrs.size(), exe_unit.input_exprs.size());
278  CHECK_EQ(col_sizes.size(), exe_unit.input_exprs.size());
279  if (!exe_unit.table_func
280  .hasOutputSizeIndependentOfInputSize()) { // includes compile-time constants,
281  // user-specified constants,
282  // and runtime table function
283  // specified sizing, only
284  // user-specified row-multipliers
285  // currently take into account input
286  // row size
287  CHECK(input_num_rows);
288  }
289  std::vector<int8_t*> output_str_dict_proxy_ptrs;
290  for (const auto& output_expr : exe_unit.target_exprs) {
291  int8_t* output_str_dict_proxy_ptr = nullptr;
292  auto ti = output_expr->get_type_info();
293  if (ti.is_dict_encoded_string()) {
294  const auto output_string_dictionary_proxy = executor->getStringDictionaryProxy(
295  ti.getStringDictKey(), executor->getRowSetMemoryOwner(), true);
296  output_str_dict_proxy_ptr =
297  reinterpret_cast<int8_t*>(output_string_dictionary_proxy);
298  }
299  output_str_dict_proxy_ptrs.emplace_back(output_str_dict_proxy_ptr);
300  }
301 
302  if (is_pre_launch_udtf) {
303  CHECK(exe_unit.table_func.containsPreFlightFn());
304  launchPreCodeOnCpu(
305  exe_unit,
306  std::dynamic_pointer_cast<CpuCompilationContext>(compilation_context),
307  col_buf_ptrs,
308  col_sizes,
309  input_str_dict_proxy_ptrs,
310  *input_num_rows,
311  executor);
312  return nullptr;
313  } else {
314  switch (device_type) {
315  case ExecutorDeviceType::CPU:
316  return launchCpuCode(
317  exe_unit,
318  std::dynamic_pointer_cast<CpuCompilationContext>(compilation_context),
319  col_buf_ptrs,
320  col_sizes,
321  input_str_dict_proxy_ptrs,
322  *input_num_rows,
323  output_str_dict_proxy_ptrs,
324  executor);
325  case ExecutorDeviceType::GPU:
326  return launchGpuCode(
327  exe_unit,
328  std::dynamic_pointer_cast<GpuCompilationContext>(compilation_context),
329  col_buf_ptrs,
330  col_sizes,
331  input_str_dict_proxy_ptrs,
332  *input_num_rows,
333  output_str_dict_proxy_ptrs,
334  /*device_id=*/0,
335  executor);
336  }
337  }
338  UNREACHABLE();
339  return nullptr;
340 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< Analyzer::Expr * > input_exprs
const table_functions::TableFunction table_func
#define UNREACHABLE()
Definition: Logger.h:338
ColumnCacheMap columnarized_table_cache_
size_t get_bit_width(const SQLTypeInfo &ti)
CONSTEXPR DEVICE bool is_null(const T &value)
ResultSetPtr launchCpuCode(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, Executor *executor)
static std::pair< const int8_t *, size_t > getOneColumnFragment(Executor *executor, const Analyzer::ColumnVar &hash_col, const Fragmenter_Namespace::FragmentInfo &fragment, const Data_Namespace::MemoryLevel effective_mem_lvl, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, ColumnCacheMap &column_cache)
Gets one chunk's pointer and element count on either CPU or GPU.
void launchPreCodeOnCpu(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, Executor *executor)
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
std::vector< Analyzer::Expr * > target_exprs
void append_literal_buffer(const Datum &d, const SQLTypeInfo &ti, int8_t *literal_buffer, int64_t offset)
ResultSetPtr launchGpuCode(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< GpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, const int device_id, Executor *executor)

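The listing above packs literal arguments into small, 8-byte-aligned CPU buffers before optionally copying them to the device. The standalone sketch below reproduces the TEXT ENCODING NONE layout it documents (an 8-byte length word followed by the string bytes, padded to a multiple of 8); pack_string_literal is a hypothetical helper written only to illustrate that layout, not a function of this class. The array-literal layout differs only in carrying an additional 8-byte is_null word between the size and the element data.

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

// Illustrative only: reproduces the literal-string buffer layout used in
// execute(): | <string size> | <string data> |, with the data region padded
// up to the next multiple of 8 bytes.
std::unique_ptr<char[]> pack_string_literal(const std::string& s,
                                            size_t& literal_buffer_size) {
  literal_buffer_size = sizeof(int64_t) + ((s.size() + 7) / 8) * 8;
  auto buf = std::make_unique<char[]>(literal_buffer_size);  // zero-filled padding
  const int64_t size = static_cast<int64_t>(s.size());
  std::memcpy(buf.get(), &size, sizeof(int64_t));
  std::memcpy(buf.get() + sizeof(int64_t), s.data(), s.size());
  return buf;
}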

ResultSetPtr TableFunctionExecutionContext::launchCpuCode ( const TableFunctionExecutionUnit &  exe_unit,
const std::shared_ptr< CpuCompilationContext > &  compilation_context,
std::vector< const int8_t * > &  col_buf_ptrs,
std::vector< int64_t > &  col_sizes,
std::vector< const int8_t * > &  input_str_dict_proxy_ptrs,
const size_t  elem_count,
std::vector< int8_t * > &  output_str_dict_proxy_ptrs,
Executor *  executor 
)
private

Definition at line 494 of file TableFunctionExecutionContext.cpp.

References align_to_int64(), CHECK, CHECK_EQ, DEBUG_TIMER, GenericError, anonymous_namespace{TableFunctionExecutionContext.cpp}::get_output_row_count(), FlatBufferManager::getBufferSize(), NotAnError, row_set_mem_owner_, and to_string().

Referenced by execute().

502  {
503  auto timer = DEBUG_TIMER(__func__);
504  int64_t output_row_count = 0;
505 
506  // If TableFunctionManager must be a singleton but it has been
507  // initialized from another thread, TableFunctionManager constructor
508  // blocks via TableFunctionManager_singleton_mutex until the
509  // existing singleton is deconstructed.
510  auto mgr = std::make_unique<TableFunctionManager>(
511  exe_unit,
512  executor,
513  col_buf_ptrs,
514  row_set_mem_owner_,
515  /*is_singleton=*/!exe_unit.table_func.usesManager());
516 
517  if (exe_unit.table_func.hasOutputSizeKnownPreLaunch()) {
518  // allocate output buffers because the size is known up front, from
519  // user specified parameters (and table size in the case of a user
520  // specified row multiplier)
521  output_row_count = get_output_row_count(exe_unit, elem_count);
522  } else if (exe_unit.table_func.hasPreFlightOutputSizer()) {
523  output_row_count = exe_unit.output_buffer_size_param;
524  }
525 
526  // setup the inputs
527  // We can have an empty col_buf_ptrs vector if there are no arguments to the function
528  const auto byte_stream_ptr = !col_buf_ptrs.empty()
529  ? reinterpret_cast<const int8_t**>(col_buf_ptrs.data())
530  : nullptr;
531  if (!col_buf_ptrs.empty()) {
532  CHECK(byte_stream_ptr);
533  }
534  const auto col_sizes_ptr = !col_sizes.empty() ? col_sizes.data() : nullptr;
535  if (!col_sizes.empty()) {
536  CHECK(col_sizes_ptr);
537  }
538  const auto input_str_dict_proxy_byte_stream_ptr =
539  !input_str_dict_proxy_ptrs.empty()
540  ? reinterpret_cast<const int8_t**>(input_str_dict_proxy_ptrs.data())
541  : nullptr;
542 
543  const auto output_str_dict_proxy_byte_stream_ptr =
544  !output_str_dict_proxy_ptrs.empty()
545  ? reinterpret_cast<int8_t**>(output_str_dict_proxy_ptrs.data())
546  : nullptr;
547 
548  // execute
549  int32_t err;
550  try {
551  err = compilation_context->table_function_entry_point()(
552  reinterpret_cast<const int8_t*>(mgr.get()),
553  byte_stream_ptr, // input columns buffer
554  col_sizes_ptr, // input column sizes
555  input_str_dict_proxy_byte_stream_ptr, // input str dictionary proxies
556  nullptr,
557  output_str_dict_proxy_byte_stream_ptr,
558  &output_row_count);
559  } catch (std::exception const& e) {
560  throw UserTableFunctionError("Error executing table function: " +
561  std::string(e.what()));
562  }
563 
564  if (err == TableFunctionErrorCode::NotAnError) {
565  // table_function_entry_point does not initialize output_row_count
566  // when a UDTF returns NotAnError, so we'll set it here.
567  output_row_count = mgr->get_nrows();
568  } else if (err == TableFunctionErrorCode::GenericError) {
569  throw UserTableFunctionError("Error executing table function: " +
570  std::string(mgr->get_error_message()));
571  }
572 
573  else if (err) {
574  throw UserTableFunctionError("Error executing table function: " +
575  std::to_string(err));
576  }
577 
578  if (exe_unit.table_func.hasCompileTimeOutputSizeConstant()) {
579  if (static_cast<size_t>(output_row_count) != mgr->get_nrows()) {
580  throw TableFunctionError(
581  "Table function with constant sizing parameter must return " +
582  std::to_string(mgr->get_nrows()) + " (got " + std::to_string(output_row_count) +
583  ")");
584  }
585  } else {
586  if (output_row_count < 0 || (size_t)output_row_count > mgr->get_nrows()) {
587  output_row_count = mgr->get_nrows();
588  }
589  }
590  // Update entry count, it may differ from allocated mem size
591  if (exe_unit.table_func.hasTableFunctionSpecifiedParameter() && !mgr->query_buffers) {
592  // set_output_row_size has not been called
593  if (output_row_count == 0) {
594  // allocate for empty output columns
595  mgr->allocate_output_buffers(0);
596  } else {
597  throw TableFunctionError("Table function must call set_output_row_size");
598  }
599  }
600 
601  mgr->query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
602 
603  auto group_by_buffers_ptr = mgr->query_buffers->getGroupByBuffersPtr();
604  CHECK(group_by_buffers_ptr);
605  auto output_buffers_ptr = reinterpret_cast<int64_t*>(group_by_buffers_ptr[0]);
606 
607  auto num_out_columns = exe_unit.target_exprs.size();
608  int8_t* src = reinterpret_cast<int8_t*>(output_buffers_ptr);
609  int8_t* dst = reinterpret_cast<int8_t*>(output_buffers_ptr);
610  // Todo (todd): Consolidate this column byte offset logic that occurs in at least 4
611  // places
612 
613  for (size_t col_idx = 0; col_idx < num_out_columns; col_idx++) {
614  auto ti = exe_unit.target_exprs[col_idx]->get_type_info();
615  if (ti.usesFlatBuffer()) {
616  // TODO: implement FlatBuffer normalization when the
617  // max_nof_values is larger than the nof specified values.
618  //
619  // TODO: implement flatbuffer resize when output_row_count < mgr->get_nrows()
620  CHECK_EQ(mgr->get_nrows(), output_row_count);
621  FlatBufferManager m{src};
622  const size_t allocated_column_size = m.getBufferSize();
623  const size_t actual_column_size = allocated_column_size;
624  src = align_to_int64(src + allocated_column_size);
625  dst = align_to_int64(dst + actual_column_size);
626  if (ti.is_text_encoding_dict_array()) {
627  const auto* ti_lite =
628  reinterpret_cast<const SQLTypeInfoLite*>(m.get_user_data_buffer());
629  CHECK(ti_lite);
630  CHECK_EQ(*ti_lite, ti.toLite()); // ensure dict/db_id are preserved
631  }
632  } else {
633  const size_t target_width = ti.get_size();
634  const size_t allocated_column_size = target_width * mgr->get_nrows();
635  const size_t actual_column_size = target_width * output_row_count;
636  if (src != dst) {
637  auto t = memmove(dst, src, actual_column_size);
638  CHECK_EQ(dst, t);
639  }
640  src = align_to_int64(src + allocated_column_size);
641  dst = align_to_int64(dst + actual_column_size);
642  }
643  }
644  return mgr->query_buffers->getResultSetOwned(0);
645 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t get_output_row_count(const TableFunctionExecutionUnit &exe_unit, size_t input_element_count)
std::string to_string(char const *&&v)
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
static int64_t getBufferSize(const void *buffer)
Definition: FlatBuffer.h:553

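The final loop in launchCpuCode() compacts each fixed-width output column in place: buffers were allocated for mgr->get_nrows() rows but only output_row_count rows are kept, so each column is moved up against the previous one and both cursors are re-aligned to an int64_t boundary. A simplified sketch of that compaction (ignoring the FlatBuffer branch; compact_columns and align8 are illustrative names, not part of this class) could look like this:

#include <cstdint>
#include <cstring>
#include <vector>

// align8 mirrors align_to_int64(): rounds a pointer up to an 8-byte boundary.
inline int8_t* align8(int8_t* p) {
  return reinterpret_cast<int8_t*>(
      (reinterpret_cast<uintptr_t>(p) + 7) & ~uintptr_t{7});
}

void compact_columns(int8_t* buffer,
                     const std::vector<size_t>& column_widths,  // per-column byte widths
                     size_t allocated_rows,                     // mgr->get_nrows()
                     size_t actual_rows) {                      // output_row_count
  int8_t* src = buffer;
  int8_t* dst = buffer;
  for (size_t width : column_widths) {
    const size_t allocated_column_size = width * allocated_rows;
    const size_t actual_column_size = width * actual_rows;
    if (src != dst) {
      std::memmove(dst, src, actual_column_size);  // regions may overlap
    }
    src = align8(src + allocated_column_size);
    dst = align8(dst + actual_column_size);
  }
}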

ResultSetPtr TableFunctionExecutionContext::launchGpuCode ( const TableFunctionExecutionUnit &  exe_unit,
const std::shared_ptr< GpuCompilationContext > &  compilation_context,
std::vector< const int8_t * > &  col_buf_ptrs,
std::vector< int64_t > &  col_sizes,
std::vector< const int8_t * > &  input_str_dict_proxy_ptrs,
const size_t  elem_count,
std::vector< int8_t * > &  output_str_dict_proxy_ptrs,
const int  device_id,
Executor *  executor 
)
private

Definition at line 661 of file TableFunctionExecutionContext.cpp.

References QueryMemoryDescriptor::addColSlotInfo(), CHECK, CHECK_EQ, checkCudaErrors(), anonymous_namespace{TableFunctionExecutionContext.cpp}::COL_BUFFERS, anonymous_namespace{TableFunctionExecutionContext.cpp}::COL_SIZES, DEBUG_TIMER, anonymous_namespace{TableFunctionExecutionContext.cpp}::ERROR_BUFFER, anonymous_namespace{TableFunctionExecutionContext.cpp}::get_output_row_count(), getQueryEngineCudaStreamForDevice(), GPU, table_functions::TableFunction::hasTableFunctionSpecifiedParameter(), anonymous_namespace{TableFunctionExecutionContext.cpp}::KERNEL_PARAM_COUNT, anonymous_namespace{TableFunctionExecutionContext.cpp}::MANAGER, anonymous_namespace{TableFunctionExecutionContext.cpp}::OUTPUT_BUFFERS, anonymous_namespace{TableFunctionExecutionContext.cpp}::OUTPUT_ROW_COUNT, query_mem_desc, row_set_mem_owner_, TableFunctionExecutionUnit::table_func, TableFunction, TableFunctionExecutionUnit::target_exprs, to_string(), UNREACHABLE, and VLOG.

Referenced by execute().

670  {
671 #ifdef HAVE_CUDA
672  auto timer = DEBUG_TIMER(__func__);
673  if (exe_unit.table_func.hasTableFunctionSpecifiedParameter()) {
674  throw QueryMustRunOnCpu();
675  }
676 
677  auto num_out_columns = exe_unit.target_exprs.size();
678  auto data_mgr = executor->getDataMgr();
679  auto gpu_allocator = std::make_unique<CudaAllocator>(
680  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
681  CHECK(gpu_allocator);
682  std::vector<CUdeviceptr> kernel_params(KERNEL_PARAM_COUNT, 0);
683 
684  // TODO: implement table function manager for CUDA
685  // kernels. kernel_params[MANAGER] ought to contain a device pointer
686  // to a struct that a table function kernel with a
687  // TableFunctionManager argument can access from the device.
688  kernel_params[MANAGER] =
689  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int8_t*)));
690 
691  // setup the inputs
692  auto byte_stream_ptr = !(col_buf_ptrs.empty())
693  ? gpu_allocator->alloc(col_buf_ptrs.size() * sizeof(int64_t))
694  : nullptr;
695  if (byte_stream_ptr) {
696  gpu_allocator->copyToDevice(byte_stream_ptr,
697  reinterpret_cast<int8_t*>(col_buf_ptrs.data()),
698  col_buf_ptrs.size() * sizeof(int64_t));
699  }
700  kernel_params[COL_BUFFERS] = reinterpret_cast<CUdeviceptr>(byte_stream_ptr);
701 
702  auto col_sizes_ptr = !(col_sizes.empty())
703  ? gpu_allocator->alloc(col_sizes.size() * sizeof(int64_t))
704  : nullptr;
705  if (col_sizes_ptr) {
706  gpu_allocator->copyToDevice(col_sizes_ptr,
707  reinterpret_cast<int8_t*>(col_sizes.data()),
708  col_sizes.size() * sizeof(int64_t));
709  }
710  kernel_params[COL_SIZES] = reinterpret_cast<CUdeviceptr>(col_sizes_ptr);
711 
712  kernel_params[ERROR_BUFFER] =
713  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int32_t)));
714  // initialize output memory
715  QueryMemoryDescriptor query_mem_desc(
716  executor, elem_count, QueryDescriptionType::TableFunction);
717 
718  for (size_t i = 0; i < num_out_columns; i++) {
719  const size_t col_width = exe_unit.target_exprs[i]->get_type_info().get_size();
720  query_mem_desc.addColSlotInfo({std::make_tuple(col_width, col_width)});
721  }
722  const auto allocated_output_row_count = get_output_row_count(exe_unit, elem_count);
723  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
724  exe_unit,
725  query_mem_desc,
726  device_id,
727  ExecutorDeviceType::GPU,
728  (allocated_output_row_count == 0 ? 1 : allocated_output_row_count),
729  std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
730  std::vector<std::vector<uint64_t>>{{0}}, // frag offsets
731  row_set_mem_owner_,
732  gpu_allocator.get(),
733  executor);
734 
735  // setup the output
736  int64_t output_row_count = allocated_output_row_count;
737 
738  kernel_params[OUTPUT_ROW_COUNT] =
739  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int64_t*)));
740  gpu_allocator->copyToDevice(reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
741  reinterpret_cast<int8_t*>(&output_row_count),
742  sizeof(output_row_count));
743  /*
744   TODO: RBC generated runtime table functions do not support
745  concurrent execution on a CUDA device. Hence, we'll force 1 as
746  block/grid size in the case of runtime table functions. To support
747  this, in RBC, we'll need to expose threadIdx/blockIdx/blockDim to
748  runtime table functions and these must do something sensible with
749  this information.
750  */
751  const unsigned block_size_x =
752  (exe_unit.table_func.isRuntime() ? 1 : executor->blockSize());
753  const unsigned block_size_y = 1;
754  const unsigned block_size_z = 1;
755  const unsigned grid_size_x =
756  (exe_unit.table_func.isRuntime() ? 1 : executor->gridSize());
757  const unsigned grid_size_y = 1;
758  const unsigned grid_size_z = 1;
759 
760  auto gpu_output_buffers =
761  query_buffers->setupTableFunctionGpuBuffers(query_mem_desc,
762  device_id,
763  block_size_x,
764  grid_size_x,
765  true /* zero_initialize_buffers */);
766 
767  kernel_params[OUTPUT_BUFFERS] = reinterpret_cast<CUdeviceptr>(gpu_output_buffers.ptrs);
768 
769  // execute
770  CHECK_EQ(static_cast<size_t>(KERNEL_PARAM_COUNT), kernel_params.size());
771 
772  std::vector<void*> param_ptrs;
773  for (auto& param : kernel_params) {
774  param_ptrs.push_back(&param);
775  }
776 
777  // Get cu func
778 
779  CHECK(compilation_context);
780  const auto native_code = compilation_context->getNativeCode(device_id);
781  auto cu_func = static_cast<CUfunction>(native_code.first);
782  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
783  VLOG(1) << "Launch GPU table function kernel compiled with the following block and "
784  "grid sizes: "
785  << block_size_x << " and " << grid_size_x;
786  checkCudaErrors(cuLaunchKernel(cu_func,
787  grid_size_x,
788  grid_size_y,
789  grid_size_z,
790  block_size_x,
791  block_size_y,
792  block_size_z,
793  0, // shared mem bytes
794  qe_cuda_stream,
795  &param_ptrs[0],
796  nullptr));
797  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
798 
799  // read output row count from GPU
800  gpu_allocator->copyFromDevice(
801  reinterpret_cast<int8_t*>(&output_row_count),
802  reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
803  sizeof(int64_t));
804  if (exe_unit.table_func.hasNonUserSpecifiedOutputSize()) {
805  if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
806  throw TableFunctionError(
807  "Table function with constant sizing parameter must return " +
808  std::to_string(allocated_output_row_count) + " (got " +
809  std::to_string(output_row_count) + ")");
810  }
811  } else {
812  if (output_row_count < 0 || (size_t)output_row_count > allocated_output_row_count) {
813  output_row_count = allocated_output_row_count;
814  }
815  }
816 
817  // Update entry count, it may differ from allocated mem size
818  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
819 
820  // Copy back to CPU storage
821  query_buffers->copyFromTableFunctionGpuBuffers(data_mgr,
822  query_mem_desc,
823  output_row_count,
824  gpu_output_buffers,
825  device_id,
826  block_size_x,
827  grid_size_x);
828 
829  return query_buffers->getResultSetOwned(0);
830 #else
831  UNREACHABLE();
832  return nullptr;
833 #endif
834 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t get_output_row_count(const TableFunctionExecutionUnit &exe_unit, size_t input_element_count)
const table_functions::TableFunction table_func
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
unsigned long long CUdeviceptr
Definition: nocuda.h:28
#define UNREACHABLE()
Definition: Logger.h:338
std::string to_string(char const *&&v)
void * CUfunction
Definition: nocuda.h:25
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
std::vector< Analyzer::Expr * > target_exprs
#define VLOG(n)
Definition: Logger.h:388

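launchGpuCode() uses the CUDA driver API: every kernel argument is a CUdeviceptr, and cuLaunchKernel() receives an array holding the address of each of those arguments. The sketch below isolates that parameter-passing pattern; launch_example and my_check are hypothetical stand-ins for the surrounding code and for checkCudaErrors(), respectively.

#include <cuda.h>
#include <vector>

inline void my_check(CUresult err) { /* handle/log err != CUDA_SUCCESS */ }

void launch_example(CUfunction cu_func,
                    CUstream stream,
                    std::vector<CUdeviceptr>& kernel_params,
                    unsigned grid_size_x,
                    unsigned block_size_x) {
  // cuLaunchKernel takes a void** where each entry points at one kernel argument.
  std::vector<void*> param_ptrs;
  param_ptrs.reserve(kernel_params.size());
  for (auto& param : kernel_params) {
    param_ptrs.push_back(&param);  // address of each CUdeviceptr argument
  }
  my_check(cuLaunchKernel(cu_func,
                          grid_size_x, /*gridDimY=*/1, /*gridDimZ=*/1,
                          block_size_x, /*blockDimY=*/1, /*blockDimZ=*/1,
                          /*sharedMemBytes=*/0,
                          stream,
                          param_ptrs.data(),
                          /*extra=*/nullptr));
  my_check(cuStreamSynchronize(stream));
}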

void TableFunctionExecutionContext::launchPreCodeOnCpu ( const TableFunctionExecutionUnit &  exe_unit,
const std::shared_ptr< CpuCompilationContext > &  compilation_context,
std::vector< const int8_t * > &  col_buf_ptrs,
std::vector< int64_t > &  col_sizes,
std::vector< const int8_t * > &  input_str_dict_proxy_ptrs,
const size_t  elem_count,
Executor *  executor 
)
private

Definition at line 344 of file TableFunctionExecutionContext.cpp.

References CHECK, DEBUG_TIMER, GenericError, NotAnError, row_set_mem_owner_, and to_string().

Referenced by execute().

351  {
352  auto timer = DEBUG_TIMER(__func__);
353  int64_t output_row_count = 0;
354 
355  // If TableFunctionManager must be a singleton but it has been
356  // initialized from another thread, TableFunctionManager constructor
357  // blocks via TableFunctionManager_singleton_mutex until the
358  // existing singleton is deconstructed.
359  auto mgr = std::make_unique<TableFunctionManager>(
360  exe_unit,
361  executor,
362  col_buf_ptrs,
363  row_set_mem_owner_,
364  /*is_singleton=*/!exe_unit.table_func.usesManager());
365 
366  // setup the inputs
367  // We can have an empty col_buf_ptrs vector if there are no arguments to the function
368  const auto byte_stream_ptr = !col_buf_ptrs.empty()
369  ? reinterpret_cast<const int8_t**>(col_buf_ptrs.data())
370  : nullptr;
371  if (!col_buf_ptrs.empty()) {
372  CHECK(byte_stream_ptr);
373  }
374  const auto col_sizes_ptr = !col_sizes.empty() ? col_sizes.data() : nullptr;
375  if (!col_sizes.empty()) {
376  CHECK(col_sizes_ptr);
377  }
378  const auto input_str_dict_proxy_byte_stream_ptr =
379  !input_str_dict_proxy_ptrs.empty()
380  ? reinterpret_cast<const int8_t**>(input_str_dict_proxy_ptrs.data())
381  : nullptr;
382 
383  // execute
384  const auto err = compilation_context->table_function_entry_point()(
385  reinterpret_cast<const int8_t*>(mgr.get()),
386  byte_stream_ptr, // input columns buffer
387  col_sizes_ptr, // input column sizes
388  input_str_dict_proxy_byte_stream_ptr, // input string dictionary proxy ptrs
389  nullptr,
390  nullptr, // output string dictionary proxy ptrs - not supported for pre-flights yet
391  &output_row_count);
392  if (err == TableFunctionErrorCode::NotAnError) {
393  // table_function_entry_point does not initialize output_row_count
394  // when a UDTF returns NotAnError, so we'll set it here.
395  output_row_count = mgr->get_nrows();
396  }
397  if (exe_unit.table_func.hasPreFlightOutputSizer()) {
398  exe_unit.output_buffer_size_param = output_row_count;
399  }
400 
401  if (err == TableFunctionErrorCode::NotAnError) {
402  } else if (err == TableFunctionErrorCode::GenericError) {
403  throw UserTableFunctionError("Error executing table function pre flight check: " +
404  std::string(mgr->get_error_message()));
405  } else if (err) {
406  throw UserTableFunctionError("Error executing table function pre flight check: " +
407  std::to_string(err));
408  }
409 }
std::string to_string(char const *&&v)
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412

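Both CPU launch paths invoke compilation_context->table_function_entry_point() with the same argument list. The typedef below is a hedged reconstruction of that calling convention, inferred only from the two listings; the typedef name, parameter names, and the type of the argument that is passed as nullptr are illustrative, not the actual declarations used by CpuCompilationContext.

#include <cstdint>

// Inferred from launchCpuCode() / launchPreCodeOnCpu(); names are illustrative.
using TableFunctionEntryPoint = int32_t (*)(
    const int8_t* mgr,                       // TableFunctionManager*, passed as int8_t*
    const int8_t** input_col_buffers,        // one buffer pointer per input column
    const int64_t* input_col_sizes,          // element counts (0 for literals)
    const int8_t** input_str_dict_proxies,   // input StringDictionaryProxy pointers
    const void* unused,                      // passed as nullptr in both listings
    int8_t** output_str_dict_proxies,        // output StringDictionaryProxy pointers
    int64_t* output_row_count);              // in/out row count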

TableFunctionExecutionContext& TableFunctionExecutionContext::operator= ( const TableFunctionExecutionContext &  )
delete

Member Data Documentation

std::shared_ptr<RowSetMemoryOwner> TableFunctionExecutionContext::row_set_mem_owner_
private

The documentation for this class was generated from the following files:

TableFunctionExecutionContext.h
TableFunctionExecutionContext.cpp