OmniSciDB  91042dcc5b
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FixedLengthArrayNoneEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2018 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #ifndef FIXED_LENGTH_ARRAY_NONE_ENCODER_H
25 #define FIXED_LENGTH_ARRAY_NONE_ENCODER_H
26 
27 #include "Logger/Logger.h"
28 
29 #include <cassert>
30 #include <cstring>
31 #include <memory>
32 #include <mutex>
33 #include <string>
34 #include <vector>
35 #include "AbstractBuffer.h"
36 #include "ChunkMetadata.h"
37 #include "Encoder.h"
38 
40 
42  public:
44  : Encoder(buffer), has_nulls(false), initialized(false), array_size(as) {}
45 
46  size_t getNumElemsForBytesEncodedData(const int8_t* index_data,
47  const int start_idx,
48  const size_t num_elements,
49  const size_t byte_limit) override {
50  size_t data_size = num_elements * array_size;
51  if (data_size > byte_limit) {
52  data_size = byte_limit;
53  }
54  return data_size / array_size;
55  }
56 
57  size_t getNumElemsForBytesInsertData(const std::vector<ArrayDatum>* srcData,
58  const int start_idx,
59  const size_t numAppendElems,
60  const size_t byteLimit,
61  const bool replicating = false) {
62  size_t dataSize = numAppendElems * array_size;
63  if (dataSize > byteLimit) {
64  dataSize = byteLimit;
65  }
66  return dataSize / array_size;
67  }
68 
69  std::shared_ptr<ChunkMetadata> appendEncodedDataAtIndices(
70  const int8_t* index_data,
71  int8_t* data,
72  const std::vector<size_t>& selected_idx) override {
73  std::vector<ArrayDatum> data_subset;
74  data_subset.reserve(selected_idx.size());
75  for (const auto& index : selected_idx) {
76  auto current_data = data + array_size * (index);
77  data_subset.emplace_back(
78  ArrayDatum(array_size, current_data, false, DoNothingDeleter{}));
79  }
80  return appendData(&data_subset, 0, selected_idx.size(), false);
81  }
82 
83  std::shared_ptr<ChunkMetadata> appendEncodedData(const int8_t* index_data,
84  int8_t* data,
85  const size_t start_idx,
86  const size_t num_elements) override {
87  std::vector<ArrayDatum> data_subset;
88  data_subset.reserve(num_elements);
89  for (size_t count = 0; count < num_elements; ++count) {
90  auto current_data = data + array_size * (start_idx + count);
91  data_subset.emplace_back(
92  ArrayDatum(array_size, current_data, false, DoNothingDeleter{}));
93  }
94  return appendData(&data_subset, 0, num_elements, false);
95  }
96 
97  std::shared_ptr<ChunkMetadata> appendData(int8_t*& src_data,
98  const size_t num_elems_to_append,
99  const SQLTypeInfo& ti,
100  const bool replicating = false,
101  const int64_t offset = -1) override {
102  UNREACHABLE(); // should never be called for arrays
103  return nullptr;
104  }
105 
106  std::shared_ptr<ChunkMetadata> appendData(const std::vector<ArrayDatum>* srcData,
107  const int start_idx,
108  const size_t numAppendElems,
109  const bool replicating = false) {
110  // Todo: The reserve call was changed to take into account the existing data size,
111  // but in other encoders (like ArrayNoneEncoder) we only reserve the append size,
112  // which will be a no-op likely after the first append on a chunk. This probably
113  // won't matter for disk writes as we just have static (default 2MB) page sizes, but
114  // could be an issue for temporary in-memory tables, as buffers for multi-column
115  // imports will likely need to be repeatedly migrated to grow them if they are
116  // "landlocked" amidst other buffers. We should follow-up with work to call reserve
117  // properly, accounting for both the new append size and existing size, for that
118  // reason and just for overall semantic correctness.
119 
120  const size_t existing_data_size = num_elems_ * array_size;
121  const size_t append_data_size = array_size * numAppendElems;
122  buffer_->reserve(existing_data_size + append_data_size);
123  std::vector<int8_t> append_buffer(append_data_size);
124  int8_t* append_ptr = append_buffer.data();
125 
126  // There was some worry about the change implemented to write the append data to an
127  // intermediate buffer, but testing on import and ctas of 20M points, we never append
128  // more than 1.6MB and 1MB of data at a time, respectively, so at least for fixed
129  // length types this should not be an issue (varlen types, which can be massive even
130  // for a single field/row, are a different story however)
131 
132  if (replicating) {
133  const size_t len = (*srcData)[0].length;
134  CHECK_EQ(len, array_size);
135  const int8_t* replicated_ptr = (*srcData)[0].pointer;
136  for (size_t i = 0; i < numAppendElems; ++i) {
137  std::memcpy(append_ptr + i * array_size, replicated_ptr, array_size);
138  }
139  } else {
140  for (size_t i = 0; i < numAppendElems; ++i) {
141  // Length of the appended array should be equal to the fixed length,
142  // all others should have been discarded, assert if something slips through
143  const size_t source_idx = start_idx + i;
144  const size_t len = (*srcData)[source_idx].length;
145  CHECK_EQ(len, array_size);
146  // NULL arrays have been filled with subtype's NULL sentinels,
147  // should be appended as regular data, same size
148  std::memcpy(
149  append_ptr + i * array_size, (*srcData)[source_idx].pointer, array_size);
150  }
151  }
152 
153  buffer_->append(append_ptr, append_data_size);
154 
155  if (replicating) {
156  updateStats(srcData, 0, 1);
157  } else {
158  updateStats(srcData, start_idx, numAppendElems);
159  }
160 
161  // make sure buffer_ is flushed even if no new data is appended to it
162  // (e.g. empty strings) because the metadata needs to be flushed.
163  if (!buffer_->isDirty()) {
164  buffer_->setDirty();
165  }
166 
167  num_elems_ += numAppendElems;
168  auto chunk_metadata = std::make_shared<ChunkMetadata>();
169  getMetadata(chunk_metadata);
170  return chunk_metadata;
171  }
172 
173  void getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) override {
174  Encoder::getMetadata(chunkMetadata); // call on parent class
175  chunkMetadata->fillChunkStats(elem_min, elem_max, has_nulls);
176  }
177 
178  // Only called from the executor for synthesized meta-information.
179  std::shared_ptr<ChunkMetadata> getMetadata(const SQLTypeInfo& ti) override {
180  auto chunk_metadata = std::make_shared<ChunkMetadata>(
181  ti, 0, 0, ChunkStats{elem_min, elem_max, has_nulls});
182  return chunk_metadata;
183  }
184 
185  void updateStats(const int64_t, const bool) override { CHECK(false); }
186 
187  void updateStats(const double, const bool) override { CHECK(false); }
188 
189  void reduceStats(const Encoder&) override { CHECK(false); }
190 
191  void updateStats(const int8_t* const src_data, const size_t num_elements) override {
192  UNREACHABLE();
193  }
194 
195  void updateStats(const std::vector<std::string>* const src_data,
196  const size_t start_idx,
197  const size_t num_elements) override {
198  UNREACHABLE();
199  }
200 
201  void updateStats(const std::vector<ArrayDatum>* const src_data,
202  const size_t start_idx,
203  const size_t num_elements) override {
204  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
205  update_elem_stats((*src_data)[n]);
206  }
207  }
208 
209  void writeMetadata(FILE* f) override {
210  // assumes pointer is already in right place
211  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
212  fwrite((int8_t*)&elem_min, sizeof(Datum), 1, f);
213  fwrite((int8_t*)&elem_max, sizeof(Datum), 1, f);
214  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
215  fwrite((int8_t*)&initialized, sizeof(bool), 1, f);
216  }
217 
218  void readMetadata(FILE* f) override {
219  // assumes pointer is already in right place
220  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
221  fread((int8_t*)&elem_min, sizeof(Datum), 1, f);
222  fread((int8_t*)&elem_max, sizeof(Datum), 1, f);
223  fread((int8_t*)&has_nulls, sizeof(bool), 1, f);
224  fread((int8_t*)&initialized, sizeof(bool), 1, f);
225  }
226 
227  void copyMetadata(const Encoder* copyFromEncoder) override {
228  num_elems_ = copyFromEncoder->getNumElems();
229  auto array_encoder =
230  dynamic_cast<const FixedLengthArrayNoneEncoder*>(copyFromEncoder);
231  elem_min = array_encoder->elem_min;
232  elem_max = array_encoder->elem_max;
233  has_nulls = array_encoder->has_nulls;
234  initialized = array_encoder->initialized;
235  }
236 
// Takes a pointer to a raw array; presumably meant to refresh the element
// statistics from it — TODO confirm.
// NOTE(review): this listing jumps from source line 237 straight to 239, so the
// statement on line 238 (the body) is not visible here. Check the original
// header before assuming this function is a no-op.
237  void updateMetadata(int8_t* array) {
239  }
240 
241  static bool is_null(const SQLTypeInfo& type, int8_t* array) {
242  if (type.get_notnull()) {
243  return false;
244  }
245  switch (type.get_subtype()) {
246  case kBOOLEAN: {
247  return (array[0] == NULL_ARRAY_BOOLEAN);
248  }
249  case kINT: {
250  const int32_t* int_array = (int32_t*)array;
251  return (int_array[0] == NULL_ARRAY_INT);
252  }
253  case kSMALLINT: {
254  const int16_t* smallint_array = (int16_t*)array;
255  return (smallint_array[0] == NULL_ARRAY_SMALLINT);
256  }
257  case kTINYINT: {
258  const int8_t* tinyint_array = (int8_t*)array;
259  return (tinyint_array[0] == NULL_ARRAY_TINYINT);
260  }
261  case kBIGINT:
262  case kNUMERIC:
263  case kDECIMAL: {
264  const int64_t* bigint_array = (int64_t*)array;
265  return (bigint_array[0] == NULL_ARRAY_BIGINT);
266  }
267  case kFLOAT: {
268  const float* flt_array = (float*)array;
269  return (flt_array[0] == NULL_ARRAY_FLOAT);
270  }
271  case kDOUBLE: {
272  const double* dbl_array = (double*)array;
273  return (dbl_array[0] == NULL_ARRAY_DOUBLE);
274  }
275  case kTIME:
276  case kTIMESTAMP:
277  case kDATE: {
278  const int64_t* tm_array = reinterpret_cast<int64_t*>(array);
279  return (tm_array[0] == NULL_ARRAY_BIGINT);
280  }
281  case kCHAR:
282  case kVARCHAR:
283  case kTEXT: {
285  const int32_t* int_array = (int32_t*)array;
286  return (int_array[0] == NULL_ARRAY_INT);
287  }
288  default:
289  UNREACHABLE();
290  }
291  return false;
292  }
293 
294  bool resetChunkStats(const ChunkStats& stats) override {
295  auto elem_type = buffer_->getSqlType().get_elem_type();
296  if (initialized && DatumEqual(elem_min, stats.min, elem_type) &&
297  DatumEqual(elem_max, stats.max, elem_type) && has_nulls == stats.has_nulls) {
298  return false;
299  }
300  elem_min = stats.min;
301  elem_max = stats.max;
302  has_nulls = stats.has_nulls;
303  return true;
304  }
305 
306  void resetChunkStats() override {
307  has_nulls = false;
308  initialized = false;
309  }
310 
// True once a NULL array or a NULL element has been observed while updating the
// element statistics; reported as part of the chunk stats.
// NOTE(review): the listing skips source lines 311, 312 and 314 around this
// member, so neighbouring declarations are not visible here.
313  bool has_nulls;
315 
316  private:
// Neither mutex is referenced by any code visible in this listing.
317  std::mutex EncoderMutex_;
318  std::mutex print_mutex_;
// Size in bytes of every array handled by this encoder; set from the
// constructor's `as` argument.
319  size_t array_size;
320 
321  bool is_null(int8_t* array) { return is_null(buffer_->getSqlType(), array); }
322 
323  void update_elem_stats(const ArrayDatum& array) {
324  if (array.is_null) {
325  has_nulls = true;
326  }
327  switch (buffer_->getSqlType().get_subtype()) {
328  case kBOOLEAN: {
329  if (!initialized) {
330  elem_min.boolval = true;
331  elem_max.boolval = false;
332  }
333  if (array.is_null) {
334  break;
335  }
336  const int8_t* bool_array = array.pointer;
337  for (size_t i = 0; i < array.length / sizeof(bool); i++) {
338  if (bool_array[i] == NULL_BOOLEAN) {
339  has_nulls = true;
340  } else if (initialized) {
341  elem_min.boolval = std::min(elem_min.boolval, bool_array[i]);
342  elem_max.boolval = std::max(elem_max.boolval, bool_array[i]);
343  } else {
344  elem_min.boolval = bool_array[i];
345  elem_max.boolval = bool_array[i];
346  initialized = true;
347  }
348  }
349  break;
350  }
351  case kINT: {
352  if (!initialized) {
353  elem_min.intval = 1;
354  elem_max.intval = 0;
355  }
356  if (array.is_null) {
357  break;
358  }
359  const int32_t* int_array = (int32_t*)array.pointer;
360  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
361  if (int_array[i] == NULL_INT) {
362  has_nulls = true;
363  } else if (initialized) {
364  elem_min.intval = std::min(elem_min.intval, int_array[i]);
365  elem_max.intval = std::max(elem_max.intval, int_array[i]);
366  } else {
367  elem_min.intval = int_array[i];
368  elem_max.intval = int_array[i];
369  initialized = true;
370  }
371  }
372  break;
373  }
374  case kSMALLINT: {
375  if (!initialized) {
376  elem_min.smallintval = 1;
377  elem_max.smallintval = 0;
378  }
379  if (array.is_null) {
380  break;
381  }
382  const int16_t* smallint_array = (int16_t*)array.pointer;
383  for (size_t i = 0; i < array.length / sizeof(int16_t); i++) {
384  if (smallint_array[i] == NULL_SMALLINT) {
385  has_nulls = true;
386  } else if (initialized) {
387  elem_min.smallintval = std::min(elem_min.smallintval, smallint_array[i]);
388  elem_max.smallintval = std::max(elem_max.smallintval, smallint_array[i]);
389  } else {
390  elem_min.smallintval = smallint_array[i];
391  elem_max.smallintval = smallint_array[i];
392  initialized = true;
393  }
394  }
395  break;
396  }
397  case kTINYINT: {
398  if (!initialized) {
399  elem_min.tinyintval = 1;
400  elem_max.tinyintval = 0;
401  }
402  if (array.is_null) {
403  break;
404  }
405  const int8_t* tinyint_array = (int8_t*)array.pointer;
406  for (size_t i = 0; i < array.length / sizeof(int8_t); i++) {
407  if (tinyint_array[i] == NULL_TINYINT) {
408  has_nulls = true;
409  } else if (initialized) {
410  elem_min.tinyintval = std::min(elem_min.tinyintval, tinyint_array[i]);
411  elem_max.tinyintval = std::max(elem_max.tinyintval, tinyint_array[i]);
412  } else {
413  elem_min.tinyintval = tinyint_array[i];
414  elem_max.tinyintval = tinyint_array[i];
415  initialized = true;
416  }
417  }
418  break;
419  }
420  case kBIGINT:
421  case kNUMERIC:
422  case kDECIMAL: {
423  if (!initialized) {
424  elem_min.bigintval = 1;
425  elem_max.bigintval = 0;
426  }
427  if (array.is_null) {
428  break;
429  }
430  const int64_t* bigint_array = (int64_t*)array.pointer;
431  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
432  if (bigint_array[i] == NULL_BIGINT) {
433  has_nulls = true;
434  } else if (initialized) {
435  decimal_overflow_validator_.validate(bigint_array[i]);
436  elem_min.bigintval = std::min(elem_min.bigintval, bigint_array[i]);
437  elem_max.bigintval = std::max(elem_max.bigintval, bigint_array[i]);
438  } else {
439  decimal_overflow_validator_.validate(bigint_array[i]);
440  elem_min.bigintval = bigint_array[i];
441  elem_max.bigintval = bigint_array[i];
442  initialized = true;
443  }
444  }
445  break;
446  }
447  case kFLOAT: {
448  if (!initialized) {
449  elem_min.floatval = 1.0;
450  elem_max.floatval = 0.0;
451  }
452  if (array.is_null) {
453  break;
454  }
455  const float* flt_array = (float*)array.pointer;
456  for (size_t i = 0; i < array.length / sizeof(float); i++) {
457  if (flt_array[i] == NULL_FLOAT) {
458  has_nulls = true;
459  } else if (initialized) {
460  elem_min.floatval = std::min(elem_min.floatval, flt_array[i]);
461  elem_max.floatval = std::max(elem_max.floatval, flt_array[i]);
462  } else {
463  elem_min.floatval = flt_array[i];
464  elem_max.floatval = flt_array[i];
465  initialized = true;
466  }
467  }
468  break;
469  }
470  case kDOUBLE: {
471  if (!initialized) {
472  elem_min.doubleval = 1.0;
473  elem_max.doubleval = 0.0;
474  }
475  if (array.is_null) {
476  break;
477  }
478  const double* dbl_array = (double*)array.pointer;
479  for (size_t i = 0; i < array.length / sizeof(double); i++) {
480  if (dbl_array[i] == NULL_DOUBLE) {
481  has_nulls = true;
482  } else if (initialized) {
483  elem_min.doubleval = std::min(elem_min.doubleval, dbl_array[i]);
484  elem_max.doubleval = std::max(elem_max.doubleval, dbl_array[i]);
485  } else {
486  elem_min.doubleval = dbl_array[i];
487  elem_max.doubleval = dbl_array[i];
488  initialized = true;
489  }
490  }
491  break;
492  }
493  case kTIME:
494  case kTIMESTAMP:
495  case kDATE: {
496  if (!initialized) {
497  elem_min.bigintval = 1;
498  elem_max.bigintval = 0;
499  }
500  if (array.is_null) {
501  break;
502  }
503  const int64_t* tm_array = reinterpret_cast<int64_t*>(array.pointer);
504  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
505  if (tm_array[i] == NULL_BIGINT) {
506  has_nulls = true;
507  } else if (initialized) {
508  elem_min.bigintval = std::min(elem_min.bigintval, tm_array[i]);
509  elem_max.bigintval = std::max(elem_max.bigintval, tm_array[i]);
510  } else {
511  elem_min.bigintval = tm_array[i];
512  elem_max.bigintval = tm_array[i];
513  initialized = true;
514  }
515  }
516  break;
517  }
518  case kCHAR:
519  case kVARCHAR:
520  case kTEXT: {
522  if (!initialized) {
523  elem_min.intval = 1;
524  elem_max.intval = 0;
525  }
526  if (array.is_null) {
527  break;
528  }
529  const int32_t* int_array = (int32_t*)array.pointer;
530  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
531  if (int_array[i] == NULL_INT) {
532  has_nulls = true;
533  } else if (initialized) {
534  elem_min.intval = std::min(elem_min.intval, int_array[i]);
535  elem_max.intval = std::max(elem_max.intval, int_array[i]);
536  } else {
537  elem_min.intval = int_array[i];
538  elem_max.intval = int_array[i];
539  initialized = true;
540  }
541  }
542  break;
543  }
544  default:
545  UNREACHABLE();
546  }
547  };
548 
549 }; // class FixedLengthArrayNoneEncoder
550 
551 #endif // FIXED_LENGTH_ARRAY_NONE_ENCODER_H
int8_t tinyintval
Definition: sqltypes.h:212
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:330
#define CHECK_EQ(x, y)
Definition: Logger.h:219
#define NULL_DOUBLE
void updateStats(const int8_t *const src_data, const size_t num_elements) override
size_t num_elems_
Definition: Encoder.h:289
Definition: sqltypes.h:49
#define NULL_ARRAY_INT
#define NULL_FLOAT
DecimalOverflowValidator decimal_overflow_validator_
Definition: Encoder.h:293
#define NULL_BIGINT
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *index_data, int8_t *data, const size_t start_idx, const size_t num_elements) override
size_t getNumElemsForBytesEncodedData(const int8_t *index_data, const int start_idx, const size_t num_elements, const size_t byte_limit) override
#define NULL_ARRAY_SMALLINT
int8_t boolval
Definition: sqltypes.h:211
#define UNREACHABLE()
Definition: Logger.h:255
bool has_nulls
Definition: ChunkMetadata.h:28
#define NULL_ARRAY_TINYINT
int32_t intval
Definition: sqltypes.h:214
#define NULL_INT
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
void updateStats(const double, const bool) override
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:208
float floatval
Definition: sqltypes.h:216
FixedLengthArrayNoneEncoder(AbstractBuffer *buffer, size_t as)
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:291
std::shared_ptr< ChunkMetadata > getMetadata(const SQLTypeInfo &ti) override
bool DatumEqual(const Datum a, const Datum b, const SQLTypeInfo &ti)
Definition: Datum.cpp:342
int count
void copyMetadata(const Encoder *copyFromEncoder) override
int64_t bigintval
Definition: sqltypes.h:215
size_t getNumElems() const
Definition: Encoder.h:285
#define NULL_ARRAY_FLOAT
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *index_data, int8_t *data, const std::vector< size_t > &selected_idx) override
int16_t smallintval
Definition: sqltypes.h:213
An AbstractBuffer is a unit of data management for a data manager.
#define NULL_BOOLEAN
Definition: sqltypes.h:52
Definition: sqltypes.h:53
void updateStats(const std::vector< std::string > *const src_data, const size_t start_idx, const size_t num_elements) override
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
Definition: sqltypes.h:41
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
SQLTypeInfo getSqlType() const
#define NULL_TINYINT
#define NULL_ARRAY_DOUBLE
void update_elem_stats(const ArrayDatum &array)
void reduceStats(const Encoder &) override
bool g_enable_watchdog false
Definition: Execute.cpp:77
#define CHECK(condition)
Definition: Logger.h:211
void updateStats(const int64_t, const bool) override
#define NULL_SMALLINT
char * f
#define NULL_ARRAY_BIGINT
static bool is_null(const SQLTypeInfo &type, int8_t *array)
Definition: sqltypes.h:45
void updateStats(const std::vector< ArrayDatum > *const src_data, const size_t start_idx, const size_t num_elements) override
#define NULL_ARRAY_BOOLEAN
constexpr double n
Definition: Utm.h:38
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:336
size_t getNumElemsForBytesInsertData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
void validate(T value) const
Definition: Encoder.h:54
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:861
virtual void reserve(size_t num_bytes)=0
double doubleval
Definition: sqltypes.h:217
std::shared_ptr< ChunkMetadata > appendData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const bool replicating=false)
bool resetChunkStats(const ChunkStats &stats) override
: Reset chunk level stats (min, max, nulls) using new values from the argument.