OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
NoneEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef NONE_ENCODER_H
18 #define NONE_ENCODER_H
19 
20 #include "AbstractBuffer.h"
21 #include "Encoder.h"
22 
23 #include <Shared/DatumFetchers.h>
24 #include <Shared/Iteration.h>
25 
26 #include <tbb/parallel_for.h>
27 #include <tbb/parallel_reduce.h>
28 #include <tuple>
29 
30 template <typename T>
32  return std::is_integral<T>::value ? inline_int_null_value<T>()
33  : inline_fp_null_value<T>();
34 }
35 
36 template <typename T>
37 class NoneEncoder : public Encoder {
38  public:
41  }
42 
43  size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t* index_data,
44  const std::vector<size_t>& selected_idx,
45  const size_t byte_limit) override {
46  UNREACHABLE()
47  << "getNumElemsForBytesEncodedDataAtIndices unexpectedly called for non varlen"
48  " encoder";
49  return {};
50  }
51 
// Appends the elements of `data` picked out by `selected_idx` and returns the
// metadata produced by the most recent `appendData` call made from the lambda
// below. If the lambda is never invoked, the returned pointer is the
// default-constructed (empty) `chunk_metadata`. The unnamed first parameter is
// not used.
52  std::shared_ptr<ChunkMetadata> appendEncodedDataAtIndices(
53  const int8_t*,
54  int8_t* data,
55  const std::vector<size_t>& selected_idx) override {
56  std::shared_ptr<ChunkMetadata> chunk_metadata;
57  // NOTE: the use of `execute_over_contiguous_indices` is an optimization;
58  // it prevents having to copy or move the indexed data and instead performs
59  // an append over contiguous sections of indices.
// NOTE(review): listing line 60 is missing from this rendering (the numbering
// jumps from 59 to 61). Presumably it opened the call to
// `execute_over_contiguous_indices(` that takes `selected_idx` and the lambda
// below as arguments -- confirm against the original header.
61  selected_idx, [&](const size_t start_pos, const size_t end_pos) {
// `start_pos` / `end_pos` are positions within `selected_idx`, not element
// indices; the element index is looked up via `selected_idx[start_pos]`.
62  size_t elem_count = end_pos - start_pos;
// Byte address of the first element of this run inside `data`.
63  auto data_ptr = data + sizeof(T) * selected_idx[start_pos];
// `appendData` takes its source pointer by reference, so a local copy of the
// pointer is passed; `data` itself is never modified.
64  chunk_metadata = appendData(data_ptr, elem_count, SQLTypeInfo{}, false);
65  });
66 
67  return chunk_metadata;
68  }
69 
70  std::shared_ptr<ChunkMetadata> appendEncodedData(const int8_t*,
71  int8_t* data,
72  const size_t start_idx,
73  const size_t num_elements) override {
74  auto current_data = data + sizeof(T) * start_idx;
75  return appendValidatedOrNonValidatedData(current_data, num_elements, false, -1, true);
76  }
77 
78  std::shared_ptr<ChunkMetadata> appendData(int8_t*& src_data,
79  const size_t num_elems_to_append,
80  const SQLTypeInfo&,
81  const bool replicating = false,
82  const int64_t offset = -1) override {
84  src_data, num_elems_to_append, replicating, offset, false);
85  }
86 
87  void getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) override {
88  Encoder::getMetadata(chunkMetadata); // call on parent class
89  chunkMetadata->fillChunkStats(dataMin, dataMax, has_nulls);
90  }
91 
92  // Only called from the executor for synthesized meta-information.
93  std::shared_ptr<ChunkMetadata> getMetadata(const SQLTypeInfo& ti) override {
94  auto chunk_metadata = std::make_shared<ChunkMetadata>(ti, 0, 0, ChunkStats{});
95  chunk_metadata->fillChunkStats(dataMin, dataMax, has_nulls);
96  return chunk_metadata;
97  }
98 
99  // Only called from the executor for synthesized meta-information.
100  void updateStats(const int64_t val, const bool is_null) override {
101  if (is_null) {
102  has_nulls = true;
103  } else {
104  const auto data = static_cast<T>(val);
105  dataMin = std::min(dataMin, data);
106  dataMax = std::max(dataMax, data);
107  }
108  }
109 
110  // Only called from the executor for synthesized meta-information.
111  void updateStats(const double val, const bool is_null) override {
112  if (is_null) {
113  has_nulls = true;
114  } else {
115  const auto data = static_cast<T>(val);
116  dataMin = std::min(dataMin, data);
117  dataMax = std::max(dataMax, data);
118  }
119  }
120 
121  void updateStats(const int8_t* const src_data, const size_t num_elements) override {
122  updateStats(src_data, num_elements, false);
123  }
124 
// Scans `num_elements` values of type T stored at `dst_data` and folds them into
// a (min, max, has-nulls) triple, starting from the encoder's current
// statistics. A value equal to `none_encoded_null_value<T>()` counts as a null;
// every other value widens the min / max.
// NOTE(review): listing lines 129 and 136 are missing from this rendering (the
// numbering jumps 127 -> 130 and 135 -> 137), so the code below is incomplete
// as shown -- confirm both lines against the original header.
125  void updateStatsEncoded(const int8_t* const dst_data,
126  const size_t num_elements) override {
127  const T* data = reinterpret_cast<const T*>(dst_data);
128 
// NOTE(review): missing line 129 presumably opened the `tbb::parallel_reduce(`
// call that receives the arguments below and stored its result back into the
// member statistics -- the visible code does not show where the result goes.
130  tbb::blocked_range(size_t(0), num_elements),
// Initial value handed to the reduction: the current member statistics.
131  std::tuple(dataMin, dataMax, has_nulls),
// Per-range step: continues from `init` and folds in data[range.begin() .. range.end()).
132  [&](const auto& range, auto init) {
133  auto [min, max, nulls] = init;
134  for (size_t i = range.begin(); i < range.end(); i++) {
135  if (data[i] != none_encoded_null_value<T>()) {
// NOTE(review): missing line 136 belongs here, inside the non-null branch; its
// content cannot be determined from this listing.
137  min = std::min(min, data[i]);
138  max = std::max(max, data[i]);
139  } else {
140  nulls = true;
141  }
142  }
143  return std::tuple(min, max, nulls);
144  },
// Combine step: merges two partial triples (smaller min, larger max, OR of the null flags).
145  [&](auto lhs, auto rhs) {
146  const auto [lhs_min, lhs_max, lhs_nulls] = lhs;
147  const auto [rhs_min, rhs_max, rhs_nulls] = rhs;
148  return std::tuple(std::min(lhs_min, rhs_min),
149  std::max(lhs_max, rhs_max),
150  lhs_nulls || rhs_nulls);
151  });
152  }
153 
154  void updateStats(const std::vector<std::string>* const src_data,
155  const size_t start_idx,
156  const size_t num_elements) override {
157  UNREACHABLE();
158  }
159 
160  void updateStats(const std::vector<ArrayDatum>* const src_data,
161  const size_t start_idx,
162  const size_t num_elements) override {
163  UNREACHABLE();
164  }
165 
166  // Only called from the executor for synthesized meta-information.
167  void reduceStats(const Encoder& that) override {
168  const auto that_typed = static_cast<const NoneEncoder&>(that);
169  if (that_typed.has_nulls) {
170  has_nulls = true;
171  }
172  dataMin = std::min(dataMin, that_typed.dataMin);
173  dataMax = std::max(dataMax, that_typed.dataMax);
174  }
175 
176  void writeMetadata(FILE* f) override {
177  // assumes pointer is already in right place
178  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
179  fwrite((int8_t*)&dataMin, sizeof(T), 1, f);
180  fwrite((int8_t*)&dataMax, sizeof(T), 1, f);
181  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
182  }
183 
184  void readMetadata(FILE* f) override {
185  // assumes pointer is already in right place
186  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
187  fread((int8_t*)&dataMin, sizeof(T), 1, f);
188  fread((int8_t*)&dataMax, sizeof(T), 1, f);
189  fread((int8_t*)&has_nulls, sizeof(bool), 1, f);
190  }
191 
192  bool resetChunkStats(const ChunkStats& stats) override {
193  const auto new_min = DatumFetcher::getDatumVal<T>(stats.min);
194  const auto new_max = DatumFetcher::getDatumVal<T>(stats.max);
195 
196  if (dataMin == new_min && dataMax == new_max && has_nulls == stats.has_nulls) {
197  return false;
198  }
199 
200  dataMin = new_min;
201  dataMax = new_max;
202  has_nulls = stats.has_nulls;
203  return true;
204  }
205 
206  void copyMetadata(const Encoder* copyFromEncoder) override {
207  num_elems_ = copyFromEncoder->getNumElems();
208  auto castedEncoder = reinterpret_cast<const NoneEncoder<T>*>(copyFromEncoder);
209  dataMin = castedEncoder->dataMin;
210  dataMax = castedEncoder->dataMax;
211  has_nulls = castedEncoder->has_nulls;
212  }
213 
214  void resetChunkStats() override {
215  dataMin = std::numeric_limits<T>::max();
216  dataMax = std::numeric_limits<T>::lowest();
217  has_nulls = false;
218  }
219 
222  bool has_nulls;
223 
224  private:
225  std::shared_ptr<ChunkMetadata> appendValidatedOrNonValidatedData(
226  int8_t*& src_data,
227  const size_t num_elems_to_append,
228  const bool replicating,
229  const int64_t offset,
230  const bool is_validated_data) {
231  if (offset == 0 && num_elems_to_append >= num_elems_) {
232  resetChunkStats();
233  }
234  T* unencodedData = reinterpret_cast<T*>(src_data);
235  std::vector<T> encoded_data;
236  if (replicating) {
237  if (num_elems_to_append > 0) {
238  encoded_data.resize(num_elems_to_append);
239  T data = validateDataAndUpdateStats(unencodedData[0]);
240  std::fill(encoded_data.begin(), encoded_data.end(), data);
241  }
242  } else {
243  updateStats(src_data, num_elems_to_append, is_validated_data);
244  }
245  if (offset == -1) {
246  auto append_data_size = num_elems_to_append * sizeof(T);
247  buffer_->reserve(buffer_->size() + append_data_size);
248  num_elems_ += num_elems_to_append;
249  buffer_->append(
250  replicating ? reinterpret_cast<int8_t*>(encoded_data.data()) : src_data,
251  append_data_size);
252  if (!replicating) {
253  src_data += num_elems_to_append * sizeof(T);
254  }
255  } else {
256  num_elems_ = offset + num_elems_to_append;
257  CHECK(!replicating);
258  CHECK_GE(offset, 0);
259  buffer_->write(
260  src_data, num_elems_to_append * sizeof(T), static_cast<size_t>(offset));
261  }
262  auto chunk_metadata = std::make_shared<ChunkMetadata>();
263  getMetadata(chunk_metadata);
264  return chunk_metadata;
265  }
266 
267  T validateDataAndUpdateStats(const T& unencoded_data,
268  const bool is_validated_data = false) {
269  if (unencoded_data == none_encoded_null_value<T>()) {
270  has_nulls = true;
271  } else {
272  if (!is_validated_data) { // does not need validation
273  decimal_overflow_validator_.validate(unencoded_data);
274  }
275  dataMin = std::min(dataMin, unencoded_data);
276  dataMax = std::max(dataMax, unencoded_data);
277  }
278  return unencoded_data;
279  }
280 
281  void updateStats(const int8_t* const src_data,
282  const size_t num_elements,
283  const bool is_validated_data) {
284  const T* unencoded_data = reinterpret_cast<const T*>(src_data);
285  for (size_t i = 0; i < num_elements; ++i) {
286  validateDataAndUpdateStats(unencoded_data[i], is_validated_data);
287  }
288  }
289 }; // class NoneEncoder
290 
291 #endif // NONE_ENCODER_H
void updateStats(const int8_t *const src_data, const size_t num_elements) override
Definition: NoneEncoder.h:121
size_t num_elems_
Definition: Encoder.h:288
void writeMetadata(FILE *f) override
Definition: NoneEncoder.h:176
DecimalOverflowValidator decimal_overflow_validator_
Definition: Encoder.h:292
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
void updateStats(const int8_t *const src_data, const size_t num_elements, const bool is_validated_data)
Definition: NoneEncoder.h:281
bool has_nulls
Definition: ChunkMetadata.h:30
dictionary stats
Definition: report.py:116
void updateStats(const int64_t val, const bool is_null) override
Definition: NoneEncoder.h:100
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:231
void resetChunkStats() override
Definition: NoneEncoder.h:214
void execute_over_contiguous_indices(const std::vector< size_t > &indices, std::function< void(const size_t, const size_t)> to_execute)
Definition: Iteration.h:22
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *, int8_t *data, const std::vector< size_t > &selected_idx) override
Definition: NoneEncoder.h:52
void updateStats(const std::vector< std::string > *const src_data, const size_t start_idx, const size_t num_elements) override
Definition: NoneEncoder.h:154
DEVICE void fill(ARGS &&...args)
Definition: gpu_enabled.h:60
CONSTEXPR DEVICE bool is_null(const T &value)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:290
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &, const bool replicating=false, const int64_t offset=-1) override
Definition: NoneEncoder.h:78
void init(LogOptions const &log_opts)
Definition: Logger.cpp:364
std::shared_ptr< ChunkMetadata > appendValidatedOrNonValidatedData(int8_t *&src_data, const size_t num_elems_to_append, const bool replicating, const int64_t offset, const bool is_validated_data)
Definition: NoneEncoder.h:225
size_t getNumElems() const
Definition: Encoder.h:284
An AbstractBuffer is a unit of data management for a data manager.
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
Definition: NoneEncoder.h:87
virtual void write(int8_t *src, const size_t num_bytes, const size_t offset=0, const MemoryLevel src_buffer_type=CPU_LEVEL, const int src_device_id=-1)=0
Value parallel_reduce(const blocked_range< Int > &range, const Value &identity, const RealBody &real_body, const Reduction &reduction, const Partitioner &p=Partitioner())
Parallel iteration with reduction.
T none_encoded_null_value()
Definition: NoneEncoder.h:31
bool resetChunkStats(const ChunkStats &stats) override
: Reset chunk level stats (min, max, nulls) using new values from the argument.
Definition: NoneEncoder.h:192
size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t *index_data, const std::vector< size_t > &selected_idx, const size_t byte_limit) override
Definition: NoneEncoder.h:43
T validateDataAndUpdateStats(const T &unencoded_data, const bool is_validated_data=false)
Definition: NoneEncoder.h:267
bool has_nulls
Definition: NoneEncoder.h:222
void updateStats(const double val, const bool is_null) override
Definition: NoneEncoder.h:111
void updateStats(const std::vector< ArrayDatum > *const src_data, const size_t start_idx, const size_t num_elements) override
Definition: NoneEncoder.h:160
std::shared_ptr< ChunkMetadata > getMetadata(const SQLTypeInfo &ti) override
Definition: NoneEncoder.h:93
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *, int8_t *data, const size_t start_idx, const size_t num_elements) override
Definition: NoneEncoder.h:70
NoneEncoder(Data_Namespace::AbstractBuffer *buffer)
Definition: NoneEncoder.h:39
void updateStatsEncoded(const int8_t *const dst_data, const size_t num_elements) override
Definition: NoneEncoder.h:125
void reduceStats(const Encoder &that) override
Definition: NoneEncoder.h:167
void copyMetadata(const Encoder *copyFromEncoder) override
Definition: NoneEncoder.h:206
void validate(T value) const
Definition: Encoder.h:54
virtual void reserve(size_t num_bytes)=0
void readMetadata(FILE *f) override
Definition: NoneEncoder.h:184