OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DateDaysEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef DATE_DAYS_ENCODER_H
18 #define DATE_DAYS_ENCODER_H
19 
20 #include "Logger/Logger.h"
21 
22 #include <iostream>
23 #include <memory>
24 #include "AbstractBuffer.h"
25 #include "Encoder.h"
26 
27 #include <Shared/DatumFetchers.h>
28 #include "Shared/Iteration.h"
29 
30 template <typename T, typename V>
31 class DateDaysEncoder : public Encoder {
32  public:
35  }
36 
37  size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t* index_data,
38  const std::vector<size_t>& selected_idx,
39  const size_t byte_limit) override {
40  UNREACHABLE()
41  << "getNumElemsForBytesEncodedDataAtIndices unexpectedly called for non varlen"
42  " encoder";
43  return {};
44  }
45 
46  std::shared_ptr<ChunkMetadata> appendEncodedDataAtIndices(
47  const int8_t*,
48  int8_t* data,
49  const std::vector<size_t>& selected_idx) override {
50  std::shared_ptr<ChunkMetadata> chunk_metadata;
51  // NOTE: the use of `execute_over_contiguous_indices` is an optimization;
52  // it prevents having to copy or move the indexed data and instead performs
53  // an append over contiguous sections of indices.
55  selected_idx, [&](const size_t start_pos, const size_t end_pos) {
56  size_t elem_count = end_pos - start_pos;
57  chunk_metadata =
58  appendEncodedData(nullptr, data, selected_idx[start_pos], elem_count);
59  });
60  return chunk_metadata;
61  }
62 
63  std::shared_ptr<ChunkMetadata> appendEncodedData(const int8_t*,
64  int8_t* data,
65  const size_t start_idx,
66  const size_t num_elements) override {
67  auto current_data = data + sizeof(V) * start_idx;
69  current_data, num_elements, SQLTypeInfo{}, false, -1, true);
70  }
71 
72  std::shared_ptr<ChunkMetadata> appendData(int8_t*& src_data,
73  const size_t num_elems_to_append,
74  const SQLTypeInfo& ti,
75  const bool replicating = false,
76  const int64_t offset = -1) override {
78  src_data, num_elems_to_append, ti, replicating, offset, false);
79  }
80 
81  void getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) override {
82  Encoder::getMetadata(chunkMetadata);
83  chunkMetadata->fillChunkStats(dataMin, dataMax, has_nulls);
84  }
85 
86  // Only called from the executor for synthesized meta-information.
87  std::shared_ptr<ChunkMetadata> getMetadata(const SQLTypeInfo& ti) override {
88  auto chunk_metadata = std::make_shared<ChunkMetadata>(ti, 0, 0, ChunkStats{});
89  chunk_metadata->fillChunkStats(dataMin, dataMax, has_nulls);
90  return chunk_metadata;
91  }
92 
93  // Only called from the executor for synthesized meta-information.
94  void updateStats(const int64_t val, const bool is_null) override {
95  if (is_null) {
96  has_nulls = true;
97  } else {
98  const auto data = static_cast<T>(val);
99  dataMin = std::min(dataMin, data);
100  dataMax = std::max(dataMax, data);
101  }
102  }
103 
104  // Only called from the executor for synthesized meta-information.
105  void updateStats(const double val, const bool is_null) override {
106  if (is_null) {
107  has_nulls = true;
108  } else {
109  const auto data = static_cast<T>(val);
110  dataMin = std::min(dataMin, data);
111  dataMax = std::max(dataMax, data);
112  }
113  }
114 
115  void updateStats(const int8_t* const src_data, const size_t num_elements) override {
116  const T* unencoded_data = reinterpret_cast<const T*>(src_data);
117  for (size_t i = 0; i < num_elements; ++i) {
118  encodeDataAndUpdateStats(unencoded_data[i]);
119  }
120  }
121 
122  void updateStats(const std::vector<std::string>* const src_data,
123  const size_t start_idx,
124  const size_t num_elements) override {
125  UNREACHABLE();
126  }
127 
128  void updateStats(const std::vector<ArrayDatum>* const src_data,
129  const size_t start_idx,
130  const size_t num_elements) override {
131  UNREACHABLE();
132  }
133 
134  // Only called from the executor for synthesized meta-information.
135  void reduceStats(const Encoder& that) override {
136  const auto that_typed = static_cast<const DateDaysEncoder<T, V>&>(that);
137  if (that_typed.has_nulls) {
138  has_nulls = true;
139  }
140  dataMin = std::min(dataMin, that_typed.dataMin);
141  dataMax = std::max(dataMax, that_typed.dataMax);
142  }
143 
144  void copyMetadata(const Encoder* copyFromEncoder) override {
145  num_elems_ = copyFromEncoder->getNumElems();
146  auto castedEncoder = reinterpret_cast<const DateDaysEncoder<T, V>*>(copyFromEncoder);
147  dataMin = castedEncoder->dataMin;
148  dataMax = castedEncoder->dataMax;
149  has_nulls = castedEncoder->has_nulls;
150  }
151 
152  void writeMetadata(FILE* f) override {
153  // assumes pointer is already in right place
154  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
155  fwrite((int8_t*)&dataMin, sizeof(T), 1, f);
156  fwrite((int8_t*)&dataMax, sizeof(T), 1, f);
157  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
158  }
159 
160  void readMetadata(FILE* f) override {
161  // assumes pointer is already in right place
162  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
163  fread((int8_t*)&dataMin, 1, sizeof(T), f);
164  fread((int8_t*)&dataMax, 1, sizeof(T), f);
165  fread((int8_t*)&has_nulls, 1, sizeof(bool), f);
166  }
167 
168  bool resetChunkStats(const ChunkStats& stats) override {
169  const auto new_min = DatumFetcher::getDatumVal<T>(stats.min);
170  const auto new_max = DatumFetcher::getDatumVal<T>(stats.max);
171 
172  if (dataMin == new_min && dataMax == new_max && has_nulls == stats.has_nulls) {
173  return false;
174  }
175 
176  dataMin = new_min;
177  dataMax = new_max;
178  has_nulls = stats.has_nulls;
179  return true;
180  }
181 
182  void resetChunkStats() override {
183  dataMin = std::numeric_limits<T>::max();
184  dataMax = std::numeric_limits<T>::lowest();
185  has_nulls = false;
186  }
187 
190  bool has_nulls;
191 
192  private:
193  std::shared_ptr<ChunkMetadata> appendEncodedOrUnencodedData(
194  int8_t*& src_data,
195  const size_t num_elems_to_append,
196  const SQLTypeInfo& ti,
197  const bool replicating,
198  const int64_t offset,
199  const bool is_encoded) {
200  if (offset == 0 && num_elems_to_append >= num_elems_) {
201  resetChunkStats();
202  }
203 
204  CHECK(!is_encoded || !replicating); // do not support replicating of encoded data
205 
206  T* unencoded_data = reinterpret_cast<T*>(src_data);
207  std::vector<V> encoded_data;
208  V* data_to_write = nullptr;
209  if (!is_encoded) {
210  encoded_data.resize(num_elems_to_append);
211  data_to_write = encoded_data.data();
212  for (size_t i = 0; i < num_elems_to_append; ++i) {
213  size_t ri = replicating ? 0 : i;
214  encoded_data[i] = encodeDataAndUpdateStats(unencoded_data[ri]);
215  }
216  } else {
217  data_to_write = reinterpret_cast<V*>(src_data);
218  for (size_t i = 0; i < num_elems_to_append; ++i) {
219  updateStatsWithAlreadyEncoded(data_to_write[i]);
220  }
221  }
222 
223  if (offset == -1) {
224  auto append_data_size = num_elems_to_append * sizeof(V);
225  buffer_->reserve(buffer_->size() + append_data_size);
226  num_elems_ += num_elems_to_append;
227  buffer_->append(reinterpret_cast<int8_t*>(data_to_write), append_data_size);
228  if (!replicating) {
229  src_data += num_elems_to_append * sizeof(T);
230  }
231  } else {
232  num_elems_ = offset + num_elems_to_append;
233  CHECK(!replicating);
234  CHECK_GE(offset, 0);
235  buffer_->write(reinterpret_cast<int8_t*>(data_to_write),
236  num_elems_to_append * sizeof(V),
237  static_cast<size_t>(offset));
238  }
239 
240  auto chunk_metadata = std::make_shared<ChunkMetadata>();
241  getMetadata(chunk_metadata);
242  return chunk_metadata;
243  }
244 
245  void updateStatsWithAlreadyEncoded(const V& encoded_data) {
246  if (encoded_data == std::numeric_limits<V>::min()) {
247  has_nulls = true;
248  } else {
249  const T data = DateConverters::get_epoch_seconds_from_days(encoded_data);
250  dataMax = std::max(dataMax, data);
251  dataMin = std::min(dataMin, data);
252  }
253  }
254 
255  V encodeDataAndUpdateStats(const T& unencoded_data) {
256  V encoded_data;
257  if (unencoded_data == std::numeric_limits<V>::min()) {
258  has_nulls = true;
259  encoded_data = static_cast<V>(unencoded_data);
260  } else {
261  date_days_overflow_validator_.validate(unencoded_data);
262  encoded_data = DateConverters::get_epoch_days_from_seconds(unencoded_data);
263  const T data = DateConverters::get_epoch_seconds_from_days(encoded_data);
264  dataMax = std::max(dataMax, data);
265  dataMin = std::min(dataMin, data);
266  }
267  return encoded_data;
268  }
269 }; // DateDaysEncoder
270 
271 #endif // DATE_DAYS_ENCODER_H
size_t num_elems_
Definition: Encoder.h:288
void updateStats(const int8_t *const src_data, const size_t num_elements) override
void updateStats(const std::vector< ArrayDatum > *const src_data, const size_t start_idx, const size_t num_elements) override
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
void updateStats(const int64_t val, const bool is_null) override
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
bool has_nulls
Definition: ChunkMetadata.h:30
dictionary stats
Definition: report.py:116
void resetChunkStats() override
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:231
void updateStats(const std::vector< std::string > *const src_data, const size_t start_idx, const size_t num_elements) override
void execute_over_contiguous_indices(const std::vector< size_t > &indices, std::function< void(const size_t, const size_t)> to_execute)
Definition: Iteration.h:22
void readMetadata(FILE *f) override
int64_t get_epoch_seconds_from_days(const int64_t days)
CONSTEXPR DEVICE bool is_null(const T &value)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:290
V encodeDataAndUpdateStats(const T &unencoded_data)
void writeMetadata(FILE *f) override
size_t getNumElems() const
Definition: Encoder.h:284
size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t *index_data, const std::vector< size_t > &selected_idx, const size_t byte_limit) override
void updateStats(const double val, const bool is_null) override
void validate(T value)
Definition: Encoder.h:122
An AbstractBuffer is a unit of data management for a data manager.
virtual void write(int8_t *src, const size_t num_bytes, const size_t offset=0, const MemoryLevel src_buffer_type=CPU_LEVEL, const int src_device_id=-1)=0
DateDaysOverflowValidator date_days_overflow_validator_
Definition: Encoder.h:293
std::shared_ptr< ChunkMetadata > getMetadata(const SQLTypeInfo &ti) override
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *, int8_t *data, const size_t start_idx, const size_t num_elements) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
void reduceStats(const Encoder &that) override
void copyMetadata(const Encoder *copyFromEncoder) override
#define CHECK(condition)
Definition: Logger.h:291
bool resetChunkStats(const ChunkStats &stats) override
Reset chunk level stats (min, max, nulls) using new values from the argument.
void updateStatsWithAlreadyEncoded(const V &encoded_data)
int64_t get_epoch_days_from_seconds(const int64_t seconds)
std::shared_ptr< ChunkMetadata > appendEncodedOrUnencodedData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating, const int64_t offset, const bool is_encoded)
virtual void reserve(size_t num_bytes)=0
DateDaysEncoder(Data_Namespace::AbstractBuffer *buffer)
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *, int8_t *data, const std::vector< size_t > &selected_idx) override