OmniSciDB  8a228a1076
ArrowResultSet.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "CompilationOptions.h"
20 #include "DataMgr/DataMgr.h"
22 #include "ResultSet.h"
23 #include "TargetMetaInfo.h"
24 #include "TargetValue.h"
25 
26 #include <type_traits>
27 
28 #include "arrow/api.h"
29 #include "arrow/ipc/api.h"
30 #ifdef HAVE_CUDA
31 #include <arrow/gpu/cuda_api.h>
32 #endif // HAVE_CUDA
33 
34 static_assert(ARROW_VERSION >= 16000, "Apache Arrow v0.16.0 or above is required.");
35 
36 // TODO(wamsi): ValueArray is not optimal. Remove it and inherit from base vector class.
    // A boost::variant that holds exactly one std::vector whose element type is one of:
    // bool, int8_t, int16_t, int32_t, int64_t, float, double or std::string.
    // It is the type of the `values` parameter of ArrowResultSetConverter::append().
37 using ValueArray = boost::variant<std::vector<bool>,
38  std::vector<int8_t>,
39  std::vector<int16_t>,
40  std::vector<int32_t>,
41  std::vector<int64_t>,
42  std::vector<float>,
43  std::vector<double>,
44  std::vector<std::string>>;
45 
46 class ArrowResultSet;
47 
49  public:
    // Iterator over an ArrowResultSet. It holds a pointer to the result set
    // (result_set_) and the index of the current row (crt_row_idx_).
    // NOTE(review): the line that opens this class (listing line 48) is missing from
    // this listing.
    //
    // Standard iterator member types; the declared category is input iterator.
50  using value_type = std::vector<TargetValue>;
51  using difference_type = std::ptrdiff_t;
52  using pointer = std::vector<TargetValue>*;
53  using reference = std::vector<TargetValue>&;
54  using iterator_category = std::input_iterator_tag;
55 
    // Two iterators are equal when they point at the same result set object and sit
    // on the same row index.
56  bool operator==(const ArrowResultSetRowIterator& other) const {
57  return result_set_ == other.result_set_ && crt_row_idx_ == other.crt_row_idx_;
58  }
    // Inequality is defined as the negation of operator==.
59  bool operator!=(const ArrowResultSetRowIterator& other) const {
60  return !(*this == other);
61  }
62 
    // Returns a std::vector<TargetValue> by value. Only declared here; the definition
    // is not part of this class body.
63  inline value_type operator*() const;
    // Increments the row index and returns *this; no bounds check is done here.
    // NOTE(review): the signature line (listing line 64) is missing from this listing;
    // presumably this is the body of the pre-increment operator — confirm.
65  crt_row_idx_++;
66  return *this;
67  }
    // Copies the iterator, advances *this with pre-increment, and returns the copy,
    // i.e. the state before advancing.
    // NOTE(review): the signature line (listing line 68) is missing from this listing;
    // presumably this is the body of the post-increment operator — confirm.
69  ArrowResultSetRowIterator iter(*this);
70  ++(*this);
71  return iter;
72  }
73 
74  private:
    // NOTE(review): the declaration of result_set_ (listing line 75) is missing from
    // this listing.
    // Index of the row the iterator currently refers to.
76  size_t crt_row_idx_;
77 
    // Member-initializer list of a private constructor: stores `rs` in result_set_ and
    // starts at row index 0.
    // NOTE(review): the constructor's signature line (listing line 78) is missing from
    // this listing.
79  : result_set_(rs), crt_row_idx_(0){};
80 
    // ArrowResultSet needs access to the private constructor; it builds iterators in
    // its rowIterator() methods.
81  friend class ArrowResultSet;
82 };
83 
// Plain aggregate carrying the output of an Arrow conversion: two byte-vector handles
// with their associated sizes, plus a serialized CUDA handle. It is the return type of
// ArrowResultSetConverter::getArrowResult() and the first parameter of
// ArrowResultSet::deallocateArrowResultBuffer().
// NOTE(review): "sm" and "df" presumably stand for shared memory and data frame —
// confirm against the code that fills this struct.
//
// The size fields are zero-initialized so that a default-constructed ArrowResult never
// exposes indeterminate values. Default member initializers keep the type an aggregate
// from C++14 onwards, so existing brace-initialization keeps working.
struct ArrowResult {
  std::vector<char> sm_handle;
  int64_t sm_size{0};
  std::vector<char> df_handle;
  int64_t df_size{0};
  std::string serialized_cuda_handle;  // Only for GPU memory deallocation
};
91 
92 // Expose Arrow buffers as a subset of the ResultSet interface
93 // to make it work within the existing execution test framework.
95  public:
    // NOTE(review): the line that opens this class (listing line 94) is missing from
    // this listing.
    //
    // Constructs from a ResultSet and the metadata of its targets. Only declared here.
96  ArrowResultSet(const std::shared_ptr<ResultSet>& rows,
97  const std::vector<TargetMetaInfo>& targets_meta);
    // Convenience overload: delegates to the two-argument constructor with an empty
    // targets_meta vector.
98  ArrowResultSet(const std::shared_ptr<ResultSet>& rows) : ArrowResultSet(rows, {}) {}
99 
    // Returns an iterator positioned `from_index` rows past the start: it creates an
    // iterator on this result set and pre-increments it from_index times, so the cost
    // is linear in from_index. The visible body does not use translate_strings or
    // decimal_to_double.
    // NOTE(review): the first line of this signature (listing line 100) is missing from
    // this listing.
101  bool translate_strings,
102  bool decimal_to_double) const {
103  ArrowResultSetRowIterator iter(this);
104  for (size_t i = 0; i < from_index; i++) {
105  ++iter;
106  }
107 
108  return iter;
109  }
110 
    // Returns an iterator at the first row by forwarding to the overload above with
    // from_index = 0.
111  ArrowResultSetRowIterator rowIterator(bool translate_strings,
112  bool decimal_to_double) const {
113  return rowIterator(0, translate_strings, decimal_to_double);
114  }
115 
    // The accessors below are only declared in this header; their behavior is defined
    // elsewhere.
    // Returns the row at `index` as a vector of TargetValue.
116  std::vector<TargetValue> getRowAt(const size_t index) const;
117 
    // NOTE(review): presumably returns the row at crt_row_idx_ and advances it, which
    // would explain why that member is mutable — confirm against the definition.
118  std::vector<TargetValue> getNextRow(const bool translate_strings,
119  const bool decimal_to_double) const;
120 
121  size_t colCount() const;
122 
123  SQLTypeInfo getColType(const size_t col_idx) const;
124 
125  bool definitelyHasNoRows() const;
126 
127  size_t rowCount() const;
128 
    // Static helper taking an ArrowResult plus the device type, device id and data
    // manager. Only declared here.
    // NOTE(review): presumably releases the buffers referenced by `result` — confirm
    // against the definition.
129  static void deallocateArrowResultBuffer(
130  const ArrowResult& result,
131  const ExecutorDeviceType device_type,
132  const size_t device_id,
133  std::shared_ptr<Data_Namespace::DataMgr>& data_mgr);
134 
135  private:
    // Only declared here; see the definition for what the loopback does.
136  void resultSetArrowLoopback();
    // Only declared here. Takes the output row, an Arrow column, a null sentinel of type
    // Type and a row index.
137  template <typename Type, typename ArrayType>
138  void appendValue(std::vector<TargetValue>& row,
139  const arrow::Array& column,
140  const Type null_val,
141  const size_t idx) const;
142 
143  std::shared_ptr<ResultSet> rows_;
144  std::vector<TargetMetaInfo> targets_meta_;
145  std::shared_ptr<arrow::RecordBatch> record_batch_;
146  arrow::ipc::DictionaryMemo dictionary_memo_;
147 
148  // Boxed arrays from the record batch. The result of RecordBatch::column is
149  // temporary, so we cache these for better performance
150  std::vector<std::shared_ptr<arrow::Array>> columns_;
    // Declared mutable, so const member functions are allowed to modify it.
151  mutable size_t crt_row_idx_;
152  std::vector<TargetMetaInfo> column_metainfo_;
153 };
154 
157 }
158 
159 class ExecutionResult;
160 
161 // Takes results from the executor, serializes them to Arrow and then deserializes
162 // them into an ArrowResultSet, which can then be used by the existing test framework.
163 std::unique_ptr<ArrowResultSet> result_set_arrow_loopback(const ExecutionResult& results);
164 
165 // Takes results from the executor, serializes them to Arrow and then
166 // deserializes them into an ArrowResultSet, which can then be used by the existing test
167 // framework.
    // This overload takes the ExecutionResult by pointer together with a ResultSet.
    // NOTE(review): how `results` and `rows` relate, and whether `results` may be null,
    // is not visible in this header — confirm against the definition.
168 std::unique_ptr<ArrowResultSet> result_set_arrow_loopback(
169  const ExecutionResult* results,
170  const std::shared_ptr<ResultSet>& rows);
171 
173  public:
    // NOTE(review): the line that opens this class (listing line 172) is missing from
    // this listing.
    //
    // Public constructor: stores every argument in the matching member. `first_n` is
    // kept as top_n_.
174  ArrowResultSetConverter(const std::shared_ptr<ResultSet>& results,
175  const std::shared_ptr<Data_Namespace::DataMgr> data_mgr,
176  const ExecutorDeviceType device_type,
177  const int32_t device_id,
178  const std::vector<std::string>& col_names,
179  const int32_t first_n)
180  : results_(results)
181  , data_mgr_(data_mgr)
182  , device_type_(device_type)
183  , device_id_(device_id)
184  , col_names_(col_names)
185  , top_n_(first_n) {}
186 
    // Produces an ArrowResult by value. Only declared here.
187  ArrowResult getArrowResult() const;
188 
189  // TODO(adb): Proper namespacing for this set of functionality. For now, make this
190  // public and leverage the converter class as namespace
    // Pairs an Arrow field description with the array builder used for that column.
    // NOTE(review): listing lines 194-195 are missing here, so this struct may have
    // further members that are not shown.
191  struct ColumnBuilder {
192  std::shared_ptr<arrow::Field> field;
193  std::unique_ptr<arrow::ArrayBuilder> builder;
196  };
197 
198  private:
    // Private constructor that sets only results_, col_names_ and top_n_. data_mgr_ and
    // device_id_ keep their in-class defaults (nullptr and 0). ArrowResultSet is a
    // friend, so it can use this overload.
199  ArrowResultSetConverter(const std::shared_ptr<ResultSet>& results,
200  const std::vector<std::string>& col_names,
201  const int32_t first_n)
202  : results_(results), col_names_(col_names), top_n_(first_n) {}
203 
    // The member functions below are only declared in this header.
204  std::shared_ptr<arrow::RecordBatch> convertToArrow() const;
205 
206  std::shared_ptr<arrow::RecordBatch> getArrowBatch(
207  const std::shared_ptr<arrow::Schema>& schema) const;
208 
209  std::shared_ptr<arrow::Field> makeField(const std::string name,
210  const SQLTypeInfo& target_type) const;
211 
    // Two Arrow buffers, one named schema and one named records.
    // NOTE(review): the line that opens this struct (listing line 212) is missing from
    // this listing; presumably it is SerializedArrowOutput, the return type used just
    // below — confirm.
213  std::shared_ptr<arrow::Buffer> schema;
214  std::shared_ptr<arrow::Buffer> records;
215  };
216  SerializedArrowOutput getSerializedArrowOutput(arrow::ipc::DictionaryMemo* memo) const;
217 
218  void initializeColumnBuilder(ColumnBuilder& column_builder,
219  const SQLTypeInfo& col_type,
220  const std::shared_ptr<arrow::Field>& field) const;
221 
    // `values` is a ValueArray (a variant over typed vectors) and `is_valid` a shared
    // vector of bools.
    // NOTE(review): presumably is_valid marks which entries of `values` are non-null —
    // confirm against the definition.
222  void append(ColumnBuilder& column_builder,
223  const ValueArray& values,
224  const std::shared_ptr<std::vector<bool>>& is_valid) const;
225 
226  inline std::shared_ptr<arrow::Array> finishColumnBuilder(
227  ColumnBuilder& column_builder) const;
228 
229  std::shared_ptr<ResultSet> results_;
230  std::shared_ptr<Data_Namespace::DataMgr> data_mgr_ = nullptr;
    // NOTE(review): the declaration of device_type_ (listing line 231) is missing from
    // this listing.
232  int32_t device_id_ = 0;
233  std::vector<std::string> col_names_;
    // Holds the constructor's `first_n` argument. It has no in-class default; both
    // constructors set it.
234  int32_t top_n_;
235 
236  friend class ArrowResultSet;
237 };
238 
239 template <typename T>
240 constexpr auto scale_epoch_values() {
241  return std::is_same<T, arrow::Date32Builder>::value ||
242  std::is_same<T, arrow::Date64Builder>::value;
243 }
std::unique_ptr< arrow::ArrayBuilder > builder
ArrowResultSetRowIterator rowIterator(bool translate_strings, bool decimal_to_double) const
ArrowResultSetRowIterator & operator++(void)
SQLTypes
Definition: sqltypes.h:40
double decimal_to_double(const SQLTypeInfo &otype, int64_t oval)
std::vector< char > sm_handle
ExecutorDeviceType
std::shared_ptr< ResultSet > rows_
ArrowResultSet(const std::shared_ptr< ResultSet > &rows)
std::shared_ptr< arrow::Field > field
arrow::ipc::DictionaryMemo dictionary_memo_
value_type operator*() const
std::unique_ptr< ArrowResultSet > result_set_arrow_loopback(const ExecutionResult &results)
const rapidjson::Value & field(const rapidjson::Value &obj, const char field[]) noexcept
Definition: JsonAccessors.h:31
std::vector< std::string > col_names_
std::shared_ptr< arrow::Buffer > records
std::vector< TargetValue > & reference
std::vector< char > df_handle
bool operator==(const ArrowResultSetRowIterator &other) const
boost::variant< std::vector< bool >, std::vector< int8_t >, std::vector< int16_t >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, std::vector< std::string > > ValueArray
size_t append(FILE *f, const size_t size, int8_t *buf)
Appends the specified number of bytes to the end of the file f from buf.
Definition: File.cpp:140
std::vector< TargetValue > getRowAt(const size_t index) const
std::vector< TargetValue > * pointer
std::ptrdiff_t difference_type
ArrowResultSetConverter(const std::shared_ptr< ResultSet > &results, const std::vector< std::string > &col_names, const int32_t first_n)
ArrowResultSetRowIterator rowIterator(size_t from_index, bool translate_strings, bool decimal_to_double) const
std::input_iterator_tag iterator_category
std::shared_ptr< ResultSet > results_
int64_t sm_size
ArrowResultSetRowIterator(const ArrowResultSet *rs)
std::vector< TargetValue > value_type
std::string serialized_cuda_handle
std::vector< TargetMetaInfo > column_metainfo_
int64_t df_size
ArrowResultSetRowIterator operator++(int)
Basic constructors and methods of the row set interface.
std::shared_ptr< arrow::RecordBatch > record_batch_
const ArrowResultSet * result_set_
bool operator!=(const ArrowResultSetRowIterator &other) const
std::shared_ptr< arrow::Buffer > schema
std::vector< TargetMetaInfo > targets_meta_
constexpr auto scale_epoch_values()
std::vector< std::shared_ptr< arrow::Array > > columns_
ArrowResultSetConverter(const std::shared_ptr< ResultSet > &results, const std::shared_ptr< Data_Namespace::DataMgr > data_mgr, const ExecutorDeviceType device_type, const int32_t device_id, const std::vector< std::string > &col_names, const int32_t first_n)