OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableFunctionExecutionContext.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #include "Analyzer/Analyzer.h"
20 #include "Logger/Logger.h"
27 #include "Shared/funcannotations.h"
28 
29 namespace {
30 
31 template <typename T>
32 const int8_t* create_literal_buffer(const T literal,
33  const ExecutorDeviceType device_type,
34  std::vector<std::unique_ptr<char[]>>& literals_owner,
35  CudaAllocator* gpu_allocator) {
36  CHECK_LE(sizeof(T), sizeof(int64_t)); // pad to 8 bytes
37  switch (device_type) {
39  literals_owner.emplace_back(std::make_unique<char[]>(sizeof(int64_t)));
40  std::memcpy(literals_owner.back().get(), &literal, sizeof(T));
41  return reinterpret_cast<const int8_t*>(literals_owner.back().get());
42  }
44  CHECK(gpu_allocator);
45  const auto gpu_literal_buf_ptr = gpu_allocator->alloc(sizeof(int64_t));
46  gpu_allocator->copyToDevice(
47  gpu_literal_buf_ptr, reinterpret_cast<const int8_t*>(&literal), sizeof(T));
48  return gpu_literal_buf_ptr;
49  }
50  }
51  UNREACHABLE();
52  return nullptr;
53 }
54 
55 // Specialization for std::string. Currently we simply hand the UDTF a char* to the
56 // first char of a c-style null-terminated string we copy out of the std::string.
57 // May want to evaluate moving to sending in the ptr and size
58 template <>
59 const int8_t* create_literal_buffer(std::string* const literal,
60  const ExecutorDeviceType device_type,
61  std::vector<std::unique_ptr<char[]>>& literals_owner,
62  CudaAllocator* gpu_allocator) {
63  const int64_t string_size = literal->size();
64  const int64_t padded_string_size =
65  (string_size + 7) / 8 * 8; // round up to the next multiple of 8
66  switch (device_type) {
68  literals_owner.emplace_back(
69  std::make_unique<char[]>(sizeof(int64_t) + padded_string_size));
70  std::memcpy(literals_owner.back().get(), &string_size, sizeof(int64_t));
71  std::memcpy(
72  literals_owner.back().get() + sizeof(int64_t), literal->data(), string_size);
73  return reinterpret_cast<const int8_t*>(literals_owner.back().get());
74  }
76  CHECK(gpu_allocator);
77  const auto gpu_literal_buf_ptr =
78  gpu_allocator->alloc(sizeof(int64_t) + padded_string_size);
79  gpu_allocator->copyToDevice(gpu_literal_buf_ptr,
80  reinterpret_cast<const int8_t*>(&string_size),
81  sizeof(int64_t));
82  gpu_allocator->copyToDevice(gpu_literal_buf_ptr + sizeof(int64_t),
83  reinterpret_cast<const int8_t*>(literal->data()),
84  string_size);
85  return gpu_literal_buf_ptr;
86  }
87  }
88  UNREACHABLE();
89  return nullptr;
90 }
91 
93  size_t input_element_count) {
94  size_t allocated_output_row_count = 0;
95  switch (exe_unit.table_func.getOutputRowSizeType()) {
99  allocated_output_row_count = exe_unit.output_buffer_size_param;
100  break;
101  }
103  allocated_output_row_count =
104  exe_unit.output_buffer_size_param * input_element_count;
105  break;
106  }
108  allocated_output_row_count = input_element_count;
109  break;
110  }
111  default: {
112  UNREACHABLE();
113  }
114  }
115  return allocated_output_row_count;
116 }
117 
118 } // namespace
119 
121  const TableFunctionExecutionUnit& exe_unit,
122  const std::vector<InputTableInfo>& table_infos,
123  const std::shared_ptr<CompilationContext>& compilation_context,
124  const ColumnFetcher& column_fetcher,
125  const ExecutorDeviceType device_type,
126  Executor* executor,
127  bool is_pre_launch_udtf) {
128  auto timer = DEBUG_TIMER(__func__);
129  CHECK(compilation_context);
130  std::vector<std::shared_ptr<Chunk_NS::Chunk>> chunks_owner;
131  std::vector<std::unique_ptr<char[]>> literals_owner;
132 
133  const int device_id = 0; // TODO(adb): support multi-gpu table functions
134  std::unique_ptr<CudaAllocator> device_allocator;
135  if (device_type == ExecutorDeviceType::GPU) {
136  auto data_mgr = executor->getDataMgr();
137  device_allocator.reset(new CudaAllocator(
138  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id)));
139  }
140  std::vector<const int8_t*> col_buf_ptrs;
141  std::vector<int64_t> col_sizes;
142  std::vector<const int8_t*> input_str_dict_proxy_ptrs;
143  std::optional<size_t> input_num_rows;
144 
145  int col_index = -1;
146  // TODO: col_list_bufs are allocated on CPU memory, so UDTFs with column_list
147  // arguments are not supported on GPU atm.
148  std::vector<std::vector<const int8_t*>> col_list_bufs;
149  std::vector<std::vector<const int8_t*>> input_col_list_str_dict_proxy_ptrs;
150  for (const auto& input_expr : exe_unit.input_exprs) {
151  auto ti = input_expr->get_type_info();
152  if (!ti.is_column_list()) {
153  CHECK_EQ(col_index, -1);
154  }
155  if (auto col_var = dynamic_cast<Analyzer::ColumnVar*>(input_expr)) {
156  const auto& table_key = col_var->getTableKey();
157  auto table_info_it = std::find_if(
158  table_infos.begin(), table_infos.end(), [&table_key](const auto& table_info) {
159  return table_info.table_key == table_key;
160  });
161  CHECK(table_info_it != table_infos.end());
162  auto [col_buf, buf_elem_count] = ColumnFetcher::getOneColumnFragment(
163  executor,
164  *col_var,
165  table_info_it->info.fragments.front(),
168  device_id,
169  device_allocator.get(),
170  /*thread_idx=*/0,
171  chunks_owner,
172  column_fetcher.columnarized_table_cache_);
173  // We use the number of entries in the first column to be the number of rows to base
174  // the output off of (optionally depending on the sizing parameter)
175  if (!input_num_rows) {
176  input_num_rows = (buf_elem_count > 0 ? buf_elem_count : 1);
177  }
178 
179  int8_t* input_str_dict_proxy_ptr = nullptr;
180  if (ti.is_subtype_dict_encoded_string()) {
181  const auto input_string_dictionary_proxy = executor->getStringDictionaryProxy(
182  ti.getStringDictKey(), executor->getRowSetMemoryOwner(), true);
183  input_str_dict_proxy_ptr =
184  reinterpret_cast<int8_t*>(input_string_dictionary_proxy);
185  }
186  if (ti.is_column_list()) {
187  if (col_index == -1) {
188  col_list_bufs.push_back({});
189  input_col_list_str_dict_proxy_ptrs.push_back({});
190  col_list_bufs.back().reserve(ti.get_dimension());
191  input_col_list_str_dict_proxy_ptrs.back().reserve(ti.get_dimension());
192  } else {
193  CHECK_EQ(col_sizes.back(), buf_elem_count);
194  }
195  col_index++;
196  col_list_bufs.back().push_back(col_buf);
197  input_col_list_str_dict_proxy_ptrs.back().push_back(input_str_dict_proxy_ptr);
198  // append col_buf to column_list col_buf
199  if (col_index + 1 == ti.get_dimension()) {
200  col_index = -1;
201  }
202  // columns in the same column_list point to column_list data
203  col_buf_ptrs.push_back((const int8_t*)col_list_bufs.back().data());
204  input_str_dict_proxy_ptrs.push_back(
205  (const int8_t*)input_col_list_str_dict_proxy_ptrs.back().data());
206  } else {
207  col_buf_ptrs.push_back(col_buf);
208  input_str_dict_proxy_ptrs.push_back(input_str_dict_proxy_ptr);
209  }
210  col_sizes.push_back(buf_elem_count);
211  } else if (const auto& constant_val = dynamic_cast<Analyzer::Constant*>(input_expr)) {
212  // TODO(adb): Unify literal handling with rest of system, either in Codegen or as a
213  // separate serialization component
214  col_sizes.push_back(0);
215  input_str_dict_proxy_ptrs.push_back(nullptr);
216  const auto const_val_datum = constant_val->get_constval();
217  const auto& ti = constant_val->get_type_info();
218  if (ti.is_fp()) {
219  switch (get_bit_width(ti)) {
220  case 32:
221  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.floatval,
222  device_type,
223  literals_owner,
224  device_allocator.get()));
225  break;
226  case 64:
227  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.doubleval,
228  device_type,
229  literals_owner,
230  device_allocator.get()));
231  break;
232  default:
233  UNREACHABLE();
234  }
235  } else if (ti.is_integer() || ti.is_timestamp() || ti.is_timeinterval()) {
236  switch (get_bit_width(ti)) {
237  case 8:
238  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.tinyintval,
239  device_type,
240  literals_owner,
241  device_allocator.get()));
242  break;
243  case 16:
244  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.smallintval,
245  device_type,
246  literals_owner,
247  device_allocator.get()));
248  break;
249  case 32:
250  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.intval,
251  device_type,
252  literals_owner,
253  device_allocator.get()));
254  break;
255  case 64:
256  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.bigintval,
257  device_type,
258  literals_owner,
259  device_allocator.get()));
260  break;
261  default:
262  UNREACHABLE();
263  }
264  } else if (ti.is_boolean()) {
265  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.boolval,
266  device_type,
267  literals_owner,
268  device_allocator.get()));
269  } else if (ti.is_bytes()) { // text encoding none string
270  col_buf_ptrs.push_back(create_literal_buffer(const_val_datum.stringval,
271  device_type,
272  literals_owner,
273  device_allocator.get()));
274  } else {
275  throw TableFunctionError("Literal value " + constant_val->toString() +
276  " is not yet supported.");
277  }
278  } else {
279  throw TableFunctionError(
280  "Unsupported expression as input to table function: " + input_expr->toString() +
281  "\n Only literal constants and columns are supported!");
282  }
283  }
284  CHECK_EQ(col_buf_ptrs.size(), exe_unit.input_exprs.size());
285  CHECK_EQ(col_sizes.size(), exe_unit.input_exprs.size());
286  if (!exe_unit.table_func
287  .hasOutputSizeIndependentOfInputSize()) { // includes compile-time constants,
288  // user-specified constants,
289  // and runtime table funtion
290  // specified sizing, only
291  // user-specified row-multipliers
292  // currently take into account input
293  // row size
294  CHECK(input_num_rows);
295  }
296  std::vector<int8_t*> output_str_dict_proxy_ptrs;
297  for (const auto& output_expr : exe_unit.target_exprs) {
298  int8_t* output_str_dict_proxy_ptr = nullptr;
299  auto ti = output_expr->get_type_info();
300  if (ti.is_dict_encoded_string()) {
301  const auto output_string_dictionary_proxy = executor->getStringDictionaryProxy(
302  ti.getStringDictKey(), executor->getRowSetMemoryOwner(), true);
303  output_str_dict_proxy_ptr =
304  reinterpret_cast<int8_t*>(output_string_dictionary_proxy);
305  }
306  output_str_dict_proxy_ptrs.emplace_back(output_str_dict_proxy_ptr);
307  }
308 
309  if (is_pre_launch_udtf) {
312  exe_unit,
313  std::dynamic_pointer_cast<CpuCompilationContext>(compilation_context),
314  col_buf_ptrs,
315  col_sizes,
316  *input_num_rows,
317  executor);
318  return nullptr;
319  } else {
320  switch (device_type) {
322  return launchCpuCode(
323  exe_unit,
324  std::dynamic_pointer_cast<CpuCompilationContext>(compilation_context),
325  col_buf_ptrs,
326  col_sizes,
327  input_str_dict_proxy_ptrs,
328  *input_num_rows,
329  output_str_dict_proxy_ptrs,
330  executor);
332  return launchGpuCode(
333  exe_unit,
334  std::dynamic_pointer_cast<GpuCompilationContext>(compilation_context),
335  col_buf_ptrs,
336  col_sizes,
337  input_str_dict_proxy_ptrs,
338  *input_num_rows,
339  output_str_dict_proxy_ptrs,
340  /*device_id=*/0,
341  executor);
342  }
343  }
344  UNREACHABLE();
345  return nullptr;
346 }
347 
349 
351  const TableFunctionExecutionUnit& exe_unit,
352  const std::shared_ptr<CpuCompilationContext>& compilation_context,
353  std::vector<const int8_t*>& col_buf_ptrs,
354  std::vector<int64_t>& col_sizes,
355  const size_t elem_count, // taken from first source only currently
356  Executor* executor) {
357  auto timer = DEBUG_TIMER(__func__);
358  int64_t output_row_count = 0;
359 
360  // If TableFunctionManager must be a singleton but it has been
361  // initialized from another thread, TableFunctionManager constructor
362  // blocks via TableFunctionManager_singleton_mutex until the
363  // existing singleton is deconstructed.
364  auto mgr = std::make_unique<TableFunctionManager>(
365  exe_unit,
366  executor,
367  col_buf_ptrs,
369  /*is_singleton=*/!exe_unit.table_func.usesManager());
370 
371  // setup the inputs
372  // We can have an empty col_buf_ptrs vector if there are no arguments to the function
373  const auto byte_stream_ptr = !col_buf_ptrs.empty()
374  ? reinterpret_cast<const int8_t**>(col_buf_ptrs.data())
375  : nullptr;
376  if (!col_buf_ptrs.empty()) {
377  CHECK(byte_stream_ptr);
378  }
379  const auto col_sizes_ptr = !col_sizes.empty() ? col_sizes.data() : nullptr;
380  if (!col_sizes.empty()) {
381  CHECK(col_sizes_ptr);
382  }
383 
384  // execute
385  const auto err = compilation_context->table_function_entry_point()(
386  reinterpret_cast<const int8_t*>(mgr.get()),
387  byte_stream_ptr, // input columns buffer
388  col_sizes_ptr, // input column sizes
389  nullptr, // input string dictionary proxy ptrs - not supported for pre-flights yet
390  nullptr,
391  nullptr, // output string dictionary proxy ptrs - not supported for pre-flights yet
392  &output_row_count);
393 
394  if (exe_unit.table_func.hasPreFlightOutputSizer()) {
395  exe_unit.output_buffer_size_param = output_row_count;
396  }
397 
399  throw UserTableFunctionError("Error executing table function pre flight check: " +
400  std::string(mgr->get_error_message()));
401  } else if (err) {
402  throw UserTableFunctionError("Error executing table function pre flight check: " +
403  std::to_string(err));
404  }
405 }
406 
407 // clang-format off
408 /*
409  Managing the output buffers from table functions
410  ------------------------------------------------
411 
412  In general, the results of a query (a list of columns) are held by a
413  ResultSet instance. While ResultSet is a rather complicated
414  structure, its most important members are
415 
416  std::vector<TargetInfo> targets_ that holds the type of output
417  columns (recall: `struct TargetInfo {..., SQLTypeInfo sql_type,
418  ...};`)
419 
420  std::unique_ptr<ResultSetStorage> storage_ that stores the
421  underlying buffer for a result set (recall: `struct
422  ResultSetStorage {..., int8_t* buff_, ...};`)
423 
424  QueryMemoryDescriptor query_mem_desc_ that describes the format of
425  the storage for a result set.
426 
427  QueryMemoryDescriptor structure contains the following relevant
428  members:
429 
430  QueryDescriptionType query_desc_type_ is equal to one of
431  GroupByPerfectHash, GroupByBaselineHash, Projection,
432  TableFunction, NonGroupedAggregate, Estimator. In the following,
433  we assume query_desc_type_ == TableFunction.
434 
435  bool output_columnar_ is always true for table function result
436  sets.
437 
438  size_t entry_count_ is the number of entries in the storage
439  buffer. This typically corresponds to the number of output rows.
440 
441  ColSlotContext col_slot_context_ describes the internal structure
442  of the storage buffer using the following members:
443 
444  std::vector<SlotSize> slot_sizes_ where we have `struct SlotSize
445  { int8_t padded_size; int8_t logical_size; };`
446 
447  std::vector<std::vector<size_t>> col_to_slot_map_ describes the
448  mapping of a column to possibly multiple slots.
449 
450  std::unordered_map<SlotIndex, ArraySize> varlen_output_slot_map_
451 
452  In the case of table function result sets, the QueryMemoryDescriptor
453  instance is created in TableFunctionManager::allocate_output_buffers
454  method and we have query_desc_type_ == TableFunction.
455 
456  Depending on the target info of an output column, the internal
457  structure of the storage buffer has two variants:
458 
459  - traditional where the buffer size of a particular column is
460  described by entry_count_ and
461  col_slot_context_.slot_sizes_. This variant is used for output
462  columns of fixed-width scalar types such as integers, floats,
463  boolean, text encoded dicts, etc. For the corresponding column
464  with col_idx, we have
465 
466  col_to_slot_map_[col_idx] == {slot_idx}
467  slot_sizes_[slot_idx] == {column_width, column_width}
468 
469  where column_width is targets_[col_idx].sql_type.get_size().
470 
471  - flatbuffer where the buffer size of a particular column is
472  described by varlen_output_slot_map_. This variant is used for
473  output columns of variable length composite types such as arrays
474  of ints, floats, etc. For the corresponding column with col_idx,
475  we have
476 
477  col_to_slot_map_[col_idx] == {slot_idx}
478  slot_sizes_[slot_idx] == {0, 0}
479  varlen_output_slot_map_ contains an item col_idx:flatbuffer_size
480 
481  Only table functions produce result sets that may contain both
482  variants. The variants can be distinguished via
483  `getPaddedSlotWidthBytes(slot_idx) == 0` test.
484 
485  In the case of table function result sets, col_idx == slot_idx holds.
486 
487 */
488 // clang-format on
489 
491  const TableFunctionExecutionUnit& exe_unit,
492  const std::shared_ptr<CpuCompilationContext>& compilation_context,
493  std::vector<const int8_t*>& col_buf_ptrs,
494  std::vector<int64_t>& col_sizes,
495  std::vector<const int8_t*>& input_str_dict_proxy_ptrs,
496  const size_t elem_count, // taken from first source only currently
497  std::vector<int8_t*>& output_str_dict_proxy_ptrs,
498  Executor* executor) {
499  auto timer = DEBUG_TIMER(__func__);
500  int64_t output_row_count = 0;
501 
502  // If TableFunctionManager must be a singleton but it has been
503  // initialized from another thread, TableFunctionManager constructor
504  // blocks via TableFunctionManager_singleton_mutex until the
505  // existing singleton is deconstructed.
506  auto mgr = std::make_unique<TableFunctionManager>(
507  exe_unit,
508  executor,
509  col_buf_ptrs,
511  /*is_singleton=*/!exe_unit.table_func.usesManager());
512 
513  if (exe_unit.table_func.hasOutputSizeKnownPreLaunch()) {
514  // allocate output buffers because the size is known up front, from
515  // user specified parameters (and table size in the case of a user
516  // specified row multiplier)
517  output_row_count = get_output_row_count(exe_unit, elem_count);
518  } else if (exe_unit.table_func.hasPreFlightOutputSizer()) {
519  output_row_count = exe_unit.output_buffer_size_param;
520  }
521 
522  // setup the inputs
523  // We can have an empty col_buf_ptrs vector if there are no arguments to the function
524  const auto byte_stream_ptr = !col_buf_ptrs.empty()
525  ? reinterpret_cast<const int8_t**>(col_buf_ptrs.data())
526  : nullptr;
527  if (!col_buf_ptrs.empty()) {
528  CHECK(byte_stream_ptr);
529  }
530  const auto col_sizes_ptr = !col_sizes.empty() ? col_sizes.data() : nullptr;
531  if (!col_sizes.empty()) {
532  CHECK(col_sizes_ptr);
533  }
534  const auto input_str_dict_proxy_byte_stream_ptr =
535  !input_str_dict_proxy_ptrs.empty()
536  ? reinterpret_cast<const int8_t**>(input_str_dict_proxy_ptrs.data())
537  : nullptr;
538 
539  const auto output_str_dict_proxy_byte_stream_ptr =
540  !output_str_dict_proxy_ptrs.empty()
541  ? reinterpret_cast<int8_t**>(output_str_dict_proxy_ptrs.data())
542  : nullptr;
543 
544  // execute
545  const auto err = compilation_context->table_function_entry_point()(
546  reinterpret_cast<const int8_t*>(mgr.get()),
547  byte_stream_ptr, // input columns buffer
548  col_sizes_ptr, // input column sizes
549  input_str_dict_proxy_byte_stream_ptr, // input str dictionary proxies
550  nullptr,
551  output_str_dict_proxy_byte_stream_ptr,
552  &output_row_count);
553 
555  throw UserTableFunctionError("Error executing table function: " +
556  std::string(mgr->get_error_message()));
557  }
558 
559  else if (err) {
560  throw UserTableFunctionError("Error executing table function: " +
561  std::to_string(err));
562  }
563 
564  if (exe_unit.table_func.hasCompileTimeOutputSizeConstant()) {
565  if (static_cast<size_t>(output_row_count) != mgr->get_nrows()) {
566  throw TableFunctionError(
567  "Table function with constant sizing parameter must return " +
568  std::to_string(mgr->get_nrows()) + " (got " + std::to_string(output_row_count) +
569  ")");
570  }
571  } else {
572  if (output_row_count < 0 || (size_t)output_row_count > mgr->get_nrows()) {
573  output_row_count = mgr->get_nrows();
574  }
575  }
576  // Update entry count, it may differ from allocated mem size
577  if (exe_unit.table_func.hasTableFunctionSpecifiedParameter() && !mgr->query_buffers) {
578  // set_output_row_size has not been called
579  if (output_row_count == 0) {
580  // allocate for empty output columns
581  mgr->allocate_output_buffers(0);
582  } else {
583  throw TableFunctionError("Table function must call set_output_row_size");
584  }
585  }
586 
587  mgr->query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
588 
589  auto group_by_buffers_ptr = mgr->query_buffers->getGroupByBuffersPtr();
590  CHECK(group_by_buffers_ptr);
591  auto output_buffers_ptr = reinterpret_cast<int64_t*>(group_by_buffers_ptr[0]);
592 
593  auto num_out_columns = exe_unit.target_exprs.size();
594  int8_t* src = reinterpret_cast<int8_t*>(output_buffers_ptr);
595  int8_t* dst = reinterpret_cast<int8_t*>(output_buffers_ptr);
596  // Todo (todd): Consolidate this column byte offset logic that occurs in at least 4
597  // places
598  for (size_t col_idx = 0; col_idx < num_out_columns; col_idx++) {
599  auto ti = exe_unit.target_exprs[col_idx]->get_type_info();
600  if (ti.is_array()) {
601  // TODO: implement FlatBuffer normalization when the
602  // max_nof_values is larger than the nof specified values.
603  //
604  // TODO: implement flatbuffer resize when output_row_count < mgr->get_nrows()
605  CHECK_EQ(mgr->get_nrows(), output_row_count);
606  FlatBufferManager m{src};
607  const size_t allocated_column_size = m.flatbufferSize();
608  const size_t actual_column_size = allocated_column_size;
609  src = align_to_int64(src + allocated_column_size);
610  dst = align_to_int64(dst + actual_column_size);
611  if (ti.is_text_encoding_dict_array()) {
612  CHECK_EQ(m.getDTypeMetadataDictDbId(),
613  ti.getStringDictKey().db_id); // ensure that db_id is preserved
614  CHECK_EQ(m.getDTypeMetadataDictId(),
615  ti.getStringDictKey().dict_id); // ensure that dict_id is preserved
616  }
617  } else {
618  const size_t target_width = ti.get_size();
619  const size_t allocated_column_size = target_width * mgr->get_nrows();
620  const size_t actual_column_size = target_width * output_row_count;
621  if (src != dst) {
622  auto t = memmove(dst, src, actual_column_size);
623  CHECK_EQ(dst, t);
624  }
625  src = align_to_int64(src + allocated_column_size);
626  dst = align_to_int64(dst + actual_column_size);
627  }
628  }
629  return mgr->query_buffers->getResultSetOwned(0);
630 }
631 
namespace {
// Positional indices of the parameters passed to the generated GPU table
// function kernel; KERNEL_PARAM_COUNT is the total parameter count.
// NOTE(review): the enumerators were dropped by the documentation extraction;
// the names below are grounded in their uses in launchGpuCode (MANAGER,
// COL_BUFFERS, COL_SIZES, ERROR_BUFFER, OUTPUT_ROW_COUNT, OUTPUT_BUFFERS,
// KERNEL_PARAM_COUNT) — confirm the exact order against the kernel ABI.
enum {
  ERROR_BUFFER,
  COL_BUFFERS,
  COL_SIZES,
  INPUT_STR_DICT_PROXIES,
  OUTPUT_BUFFERS,
  OUTPUT_ROW_COUNT,
  OUTPUT_STR_DICT_PROXIES,
  MANAGER,
  KERNEL_PARAM_COUNT,
};
}  // namespace
645 
647  const TableFunctionExecutionUnit& exe_unit,
648  const std::shared_ptr<GpuCompilationContext>& compilation_context,
649  std::vector<const int8_t*>& col_buf_ptrs,
650  std::vector<int64_t>& col_sizes,
651  std::vector<const int8_t*>& input_str_dict_proxy_ptrs,
652  const size_t elem_count,
653  std::vector<int8_t*>& output_str_dict_proxy_ptrs,
654  const int device_id,
655  Executor* executor) {
656 #ifdef HAVE_CUDA
657  auto timer = DEBUG_TIMER(__func__);
659  throw QueryMustRunOnCpu();
660  }
661 
662  auto num_out_columns = exe_unit.target_exprs.size();
663  auto data_mgr = executor->getDataMgr();
664  auto gpu_allocator = std::make_unique<CudaAllocator>(
665  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
666  CHECK(gpu_allocator);
667  std::vector<CUdeviceptr> kernel_params(KERNEL_PARAM_COUNT, 0);
668 
669  // TODO: implement table function manager for CUDA
670  // kernels. kernel_params[MANAGER] ought to contain a device pointer
671  // to a struct that a table function kernel with a
672  // TableFunctionManager argument can access from the device.
673  kernel_params[MANAGER] =
674  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int8_t*)));
675 
676  // setup the inputs
677  auto byte_stream_ptr = !(col_buf_ptrs.empty())
678  ? gpu_allocator->alloc(col_buf_ptrs.size() * sizeof(int64_t))
679  : nullptr;
680  if (byte_stream_ptr) {
681  gpu_allocator->copyToDevice(byte_stream_ptr,
682  reinterpret_cast<int8_t*>(col_buf_ptrs.data()),
683  col_buf_ptrs.size() * sizeof(int64_t));
684  }
685  kernel_params[COL_BUFFERS] = reinterpret_cast<CUdeviceptr>(byte_stream_ptr);
686 
687  auto col_sizes_ptr = !(col_sizes.empty())
688  ? gpu_allocator->alloc(col_sizes.size() * sizeof(int64_t))
689  : nullptr;
690  if (col_sizes_ptr) {
691  gpu_allocator->copyToDevice(col_sizes_ptr,
692  reinterpret_cast<int8_t*>(col_sizes.data()),
693  col_sizes.size() * sizeof(int64_t));
694  }
695  kernel_params[COL_SIZES] = reinterpret_cast<CUdeviceptr>(col_sizes_ptr);
696 
697  kernel_params[ERROR_BUFFER] =
698  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int32_t)));
699  // initialize output memory
701  elem_count,
703  /*is_table_function=*/true);
704  query_mem_desc.setOutputColumnar(true);
705 
706  for (size_t i = 0; i < num_out_columns; i++) {
707  const size_t col_width = exe_unit.target_exprs[i]->get_type_info().get_size();
708  query_mem_desc.addColSlotInfo({std::make_tuple(col_width, col_width)});
709  }
710  const auto allocated_output_row_count = get_output_row_count(exe_unit, elem_count);
711  auto query_buffers = std::make_unique<QueryMemoryInitializer>(
712  exe_unit,
714  device_id,
716  (allocated_output_row_count == 0 ? 1 : allocated_output_row_count),
717  std::vector<std::vector<const int8_t*>>{col_buf_ptrs},
718  std::vector<std::vector<uint64_t>>{{0}}, // frag offsets
720  gpu_allocator.get(),
721  executor);
722 
723  // setup the output
724  int64_t output_row_count = allocated_output_row_count;
725 
726  kernel_params[OUTPUT_ROW_COUNT] =
727  reinterpret_cast<CUdeviceptr>(gpu_allocator->alloc(sizeof(int64_t*)));
728  gpu_allocator->copyToDevice(reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
729  reinterpret_cast<int8_t*>(&output_row_count),
730  sizeof(output_row_count));
731  /*
732   TODO: RBC generated runtime table functions do not support
733  concurrent execution on a CUDA device. Hence, we'll force 1 as
734  block/grid size in the case of runtime table functions. To support
735  this, in RBC, we'll need to expose threadIdx/blockIdx/blockDim to
736  runtime table functions and these must do something sensible with
737  this information..
738  */
739  const unsigned block_size_x =
740  (exe_unit.table_func.isRuntime() ? 1 : executor->blockSize());
741  const unsigned block_size_y = 1;
742  const unsigned block_size_z = 1;
743  const unsigned grid_size_x =
744  (exe_unit.table_func.isRuntime() ? 1 : executor->gridSize());
745  const unsigned grid_size_y = 1;
746  const unsigned grid_size_z = 1;
747 
748  auto gpu_output_buffers =
749  query_buffers->setupTableFunctionGpuBuffers(query_mem_desc,
750  device_id,
751  block_size_x,
752  grid_size_x,
753  true /* zero_initialize_buffers */);
754 
755  kernel_params[OUTPUT_BUFFERS] = reinterpret_cast<CUdeviceptr>(gpu_output_buffers.ptrs);
756 
757  // execute
758  CHECK_EQ(static_cast<size_t>(KERNEL_PARAM_COUNT), kernel_params.size());
759 
760  std::vector<void*> param_ptrs;
761  for (auto& param : kernel_params) {
762  param_ptrs.push_back(&param);
763  }
764 
765  // Get cu func
766 
767  CHECK(compilation_context);
768  const auto native_code = compilation_context->getNativeCode(device_id);
769  auto cu_func = static_cast<CUfunction>(native_code.first);
770  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
771  VLOG(1) << "Launch GPU table function kernel compiled with the following block and "
772  "grid sizes: "
773  << block_size_x << " and " << grid_size_x;
774  checkCudaErrors(cuLaunchKernel(cu_func,
775  grid_size_x,
776  grid_size_y,
777  grid_size_z,
778  block_size_x,
779  block_size_y,
780  block_size_z,
781  0, // shared mem bytes
782  qe_cuda_stream,
783  &param_ptrs[0],
784  nullptr));
785  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
786 
787  // read output row count from GPU
788  gpu_allocator->copyFromDevice(
789  reinterpret_cast<int8_t*>(&output_row_count),
790  reinterpret_cast<int8_t*>(kernel_params[OUTPUT_ROW_COUNT]),
791  sizeof(int64_t));
792  if (exe_unit.table_func.hasNonUserSpecifiedOutputSize()) {
793  if (static_cast<size_t>(output_row_count) != allocated_output_row_count) {
794  throw TableFunctionError(
795  "Table function with constant sizing parameter must return " +
796  std::to_string(allocated_output_row_count) + " (got " +
797  std::to_string(output_row_count) + ")");
798  }
799  } else {
800  if (output_row_count < 0 || (size_t)output_row_count > allocated_output_row_count) {
801  output_row_count = allocated_output_row_count;
802  }
803  }
804 
805  // Update entry count, it may differ from allocated mem size
806  query_buffers->getResultSet(0)->updateStorageEntryCount(output_row_count);
807 
808  // Copy back to CPU storage
809  query_buffers->copyFromTableFunctionGpuBuffers(data_mgr,
810  query_mem_desc,
811  output_row_count,
812  gpu_output_buffers,
813  device_id,
814  block_size_x,
815  grid_size_x);
816 
817  return query_buffers->getResultSetOwned(0);
818 #else
819  UNREACHABLE();
820  return nullptr;
821 #endif
822 }
Defines data structures for the semantic analysis phase of query processing.
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t get_output_row_count(const TableFunctionExecutionUnit &exe_unit, size_t input_element_count)
void launchPreCodeOnCpu(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, const size_t elem_count, Executor *executor)
std::vector< Analyzer::Expr * > input_exprs
ExecutorDeviceType
const table_functions::TableFunction table_func
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
unsigned long long CUdeviceptr
Definition: nocuda.h:28
#define UNREACHABLE()
Definition: Logger.h:337
void setOutputColumnar(const bool val)
ColumnCacheMap columnarized_table_cache_
std::shared_ptr< ResultSet > ResultSetPtr
std::string to_string(char const *&&v)
size_t get_bit_width(const SQLTypeInfo &ti)
void * CUfunction
Definition: nocuda.h:25
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
std::mutex TableFunctionManager_singleton_mutex
void copyToDevice(void *device_dst, const void *host_src, const size_t num_bytes) const override
ResultSetPtr launchCpuCode(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< CpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, Executor *executor)
int8_t * alloc(const size_t num_bytes) override
ResultSetPtr execute(const TableFunctionExecutionUnit &exe_unit, const std::vector< InputTableInfo > &table_infos, const std::shared_ptr< CompilationContext > &compilation_context, const ColumnFetcher &column_fetcher, const ExecutorDeviceType device_type, Executor *executor, bool is_pre_launch_udtf)
#define CHECK_LE(x, y)
Definition: Logger.h:304
static std::pair< const int8_t *, size_t > getOneColumnFragment(Executor *executor, const Analyzer::ColumnVar &hash_col, const Fragmenter_Namespace::FragmentInfo &fragment, const Data_Namespace::MemoryLevel effective_mem_lvl, const int device_id, DeviceAllocator *device_allocator, const size_t thread_idx, std::vector< std::shared_ptr< Chunk_NS::Chunk >> &chunks_owner, ColumnCacheMap &column_cache)
Gets one chunk&#39;s pointer and element count on either CPU or GPU.
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
int64_t flatbufferSize() const
Definition: FlatBuffer.h:219
const int8_t * create_literal_buffer(const T literal, const ExecutorDeviceType device_type, std::vector< std::unique_ptr< char[]>> &literals_owner, CudaAllocator *gpu_allocator)
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:411
std::vector< Analyzer::Expr * > target_exprs
void addColSlotInfo(const std::vector< std::tuple< int8_t, int8_t >> &slots_for_col)
OutputBufferSizeType getOutputRowSizeType() const
ResultSetPtr launchGpuCode(const TableFunctionExecutionUnit &exe_unit, const std::shared_ptr< GpuCompilationContext > &compilation_context, std::vector< const int8_t * > &col_buf_ptrs, std::vector< int64_t > &col_sizes, std::vector< const int8_t * > &input_str_dict_proxy_ptrs, const size_t elem_count, std::vector< int8_t * > &output_str_dict_proxy_ptrs, const int device_id, Executor *executor)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
#define VLOG(n)
Definition: Logger.h:387