OmniSciDB  85c2d10cdc
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ArrayNoneEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #ifndef ARRAY_NONE_ENCODER_H
25 #define ARRAY_NONE_ENCODER_H
26 
27 #include "Logger/Logger.h"
28 
29 #include <cassert>
30 #include <cstring>
31 #include <memory>
32 #include <mutex>
33 #include <string>
34 #include <vector>
35 #include "AbstractBuffer.h"
36 #include "ChunkMetadata.h"
37 #include "Encoder.h"
38 
40 
41 class ArrayNoneEncoder : public Encoder {
42  public:
44  : Encoder(buffer)
45  , has_nulls(false)
47  , index_buf(nullptr)
48  , last_offset(-1) {}
49 
50  size_t getNumElemsForBytesInsertData(const std::vector<ArrayDatum>* srcData,
51  const int start_idx,
52  const size_t numAppendElems,
53  const size_t byteLimit,
54  const bool replicating = false) {
55  size_t dataSize = 0;
56 
57  size_t n = start_idx;
58  for (; n < start_idx + numAppendElems; n++) {
59  size_t len = (*srcData)[replicating ? 0 : n].length;
60  if (dataSize + len > byteLimit) {
61  break;
62  }
63  dataSize += len;
64  }
65  return n - start_idx;
66  }
67 
68  std::shared_ptr<ChunkMetadata> appendData(int8_t*& src_data,
69  const size_t num_elems_to_append,
70  const SQLTypeInfo& ti,
71  const bool replicating = false,
72  const int64_t offset = -1) override {
73  UNREACHABLE(); // should never be called for arrays
74  return nullptr;
75  }
76 
// Appends numAppendElems arrays from srcData, beginning at start_idx, to this chunk.
//
// Two buffers are written:
//  - index_buf receives one ArrayOffsetT per appended array holding the running end
//    offset (last_offset + length); the value is negated when the array is NULL.
//    When the chunk is still empty (num_elems_ == 0) an initial offset is written first.
//  - buffer_ receives the array payload bytes, staged through a scratch buffer of at
//    most MAX_INPUT_BUF_SIZE bytes; an array longer than the scratch buffer is appended
//    directly from its own pointer.
//
// When replicating is true, element 0 of srcData is used for every appended position.
// Afterwards buffer_ is marked dirty, the element statistics are updated, num_elems_
// is advanced, and a freshly filled ChunkMetadata is returned.
//
// NOTE(review): this listing is an extract in which listing lines 108, 113 and 149 are
// absent; the affected spots are marked below and must be restored from the real header.
77  std::shared_ptr<ChunkMetadata> appendData(const std::vector<ArrayDatum>* srcData,
78  const int start_idx,
79  const size_t numAppendElems,
80  const bool replicating) {
81  CHECK(index_buf != nullptr); // index_buf must be set before this.
82  size_t index_size = numAppendElems * sizeof(ArrayOffsetT);
83  if (num_elems_ == 0) {
84  index_size += sizeof(ArrayOffsetT); // plus one for the initial offset
85  }
// NOTE(review): only the size of the newly appended portion is passed to reserve() —
// confirm whether reserve() expects an increment or a total capacity.
86  index_buf->reserve(index_size);
87 
88  bool first_elem_is_null = false;
89  ArrayOffsetT initial_offset = 0;
90  if (num_elems_ == 0) {
91  // If the very first ArrayDatum is NULL, the initial offset is set to
92  // DEFAULT_NULL_PADDING_SIZE (8) so that it can be negated and written to the index
// buffer to convey NULLness.
// NOTE(review): element 0 is inspected here even when start_idx != 0 — confirm that
// callers always begin an empty chunk at index 0, or that this is intended.
93  if ((*srcData)[0].is_null) {
94  initial_offset = DEFAULT_NULL_PADDING_SIZE;
95  first_elem_is_null = true;
96  }
97  index_buf->append((int8_t*)&initial_offset,
98  sizeof(ArrayOffsetT)); // write the initial offset
99  last_offset = initial_offset;
100  } else {
101  // Valid last_offset is never negative
102  // always need to read a valid last offset from buffer/disk
103  // b/c now due to vacuum "last offset" may go backward and if
104  // index chunk was not reloaded last_offset would go way off!
105  index_buf->read((int8_t*)&last_offset,
106  sizeof(ArrayOffsetT),
107  index_buf->size() - sizeof(ArrayOffsetT),
// NOTE(review): listing line 108 (the remainder of this read() call) is missing from
// this extract.
109  CHECK(last_offset != -1);
110  // If the loaded offset is negative it means the last value was a NULL array,
111  // convert to a valid last offset
112  if (last_offset < 0) {
// NOTE(review): listing line 113 (the body of this branch) is missing from this
// extract; per the comment above it presumably turns the negative offset back into a
// valid one — confirm against the real header.
114  }
115  }
116  // Need to start data from 8 byte offset if first array encoded is a NULL array
117  size_t data_size = (first_elem_is_null) ? DEFAULT_NULL_PADDING_SIZE : 0;
118  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
119  // NULL arrays don't take any space so don't add to the data size
120  if ((*srcData)[replicating ? 0 : n].is_null) {
121  continue;
122  }
123  data_size += (*srcData)[replicating ? 0 : n].length;
124  }
125  buffer_->reserve(data_size);
126 
// Scratch buffer shared by the offset pass and the payload pass below; it is capped at
// MAX_INPUT_BUF_SIZE, so both passes work in batches.
127  size_t inbuf_size =
128  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
129  auto gc_inbuf = std::make_unique<int8_t[]>(inbuf_size);
130  auto inbuf = gc_inbuf.get();
// Pass 1: fill the scratch buffer with end offsets and flush each batch to index_buf.
131  for (size_t num_appended = 0; num_appended < numAppendElems;) {
132  ArrayOffsetT* p = (ArrayOffsetT*)inbuf;
133  size_t i;
134  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(ArrayOffsetT);
135  i++, num_appended++) {
136  p[i] =
137  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length;
// last_offset keeps the non-negated value so that the running total stays valid.
138  last_offset = p[i];
139  if ((*srcData)[replicating ? 0 : num_appended + start_idx].is_null) {
140  // Record array NULLness in the index buffer
141  p[i] = -p[i];
142  }
143  }
144  index_buf->append(inbuf, i * sizeof(ArrayOffsetT));
145  }
146 
147  // Pad buffer_ with 8 bytes if first encoded array is a NULL array
148  if (first_elem_is_null) {
// NOTE(review): listing line 149 (the statement that performs the padding) is missing
// from this extract.
150  }
// Pass 2: copy array payloads into the scratch buffer and flush each batch to buffer_.
151  for (size_t num_appended = 0; num_appended < numAppendElems;) {
152  size_t size = 0;
153  for (int i = start_idx + num_appended;
154  num_appended < numAppendElems && size < inbuf_size;
155  i++, num_appended++) {
156  if ((*srcData)[replicating ? 0 : i].is_null) {
157  continue; // NULL arrays don't take up any space in the data buffer
158  }
159  size_t len = (*srcData)[replicating ? 0 : i].length;
160  if (len > inbuf_size) {
161  // An array larger than the whole scratch buffer: flush what is staged so far,
// then append the array directly from its own storage.
162  if (size > 0) {
163  buffer_->append(inbuf, size);
164  }
165  size = 0;
166  buffer_->append((*srcData)[replicating ? 0 : i].pointer, len);
// The break below skips the loop increment, so count this array here.
167  num_appended++;
168  break;
169  } else if (size + len > inbuf_size) {
// Does not fit into the remaining scratch space: flush first and retry this array
// in the next outer iteration (num_appended is deliberately not advanced).
170  break;
171  }
172  char* dest = (char*)inbuf + size;
173  if (len > 0) {
174  std::memcpy((void*)dest, (void*)(*srcData)[replicating ? 0 : i].pointer, len);
175  size += len;
176  }
177  }
178  if (size > 0) {
179  buffer_->append(inbuf, size);
180  }
181  }
182  // make sure buffer_ is flushed even if no new data is appended to it
183  // (e.g. empty strings) because the metadata needs to be flushed.
184  if (!buffer_->isDirty()) {
185  buffer_->setDirty();
186  }
187 
188  // keep Chunk statistics with array elements
189  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
190  update_elem_stats((*srcData)[replicating ? 0 : n]);
191  }
192  num_elems_ += numAppendElems;
193  auto chunk_metadata = std::make_shared<ChunkMetadata>();
194  getMetadata(chunk_metadata);
195  return chunk_metadata;
196  }
197 
198  void getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) override {
199  Encoder::getMetadata(chunkMetadata); // call on parent class
200  chunkMetadata->fillChunkStats(elem_min, elem_max, has_nulls);
201  }
202 
203  // Only called from the executor for synthesized meta-information.
204  std::shared_ptr<ChunkMetadata> getMetadata(const SQLTypeInfo& ti) override {
205  auto chunk_metadata = std::make_shared<ChunkMetadata>(
206  ti, 0, 0, ChunkStats{elem_min, elem_max, has_nulls});
207  return chunk_metadata;
208  }
209 
210  void updateStats(const int64_t, const bool) override { CHECK(false); }
211 
212  void updateStats(const double, const bool) override { CHECK(false); }
213 
214  void updateStats(const int8_t* const src_data, const size_t num_elements) override {
215  CHECK(false);
216  }
217 
218  void updateStats(const std::vector<std::string>* const src_data,
219  const size_t start_idx,
220  const size_t num_elements) override {
221  UNREACHABLE();
222  }
223 
224  void updateStats(const std::vector<ArrayDatum>* const src_data,
225  const size_t start_idx,
226  const size_t num_elements) override {
227  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
228  update_elem_stats((*src_data)[n]);
229  }
230  }
231 
232  void reduceStats(const Encoder&) override { CHECK(false); }
233 
234  void writeMetadata(FILE* f) override {
235  // assumes pointer is already in right place
236  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
237  fwrite((int8_t*)&elem_min, sizeof(Datum), 1, f);
238  fwrite((int8_t*)&elem_max, sizeof(Datum), 1, f);
239  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
240  fwrite((int8_t*)&initialized, sizeof(bool), 1, f);
241  }
242 
243  void readMetadata(FILE* f) override {
244  // assumes pointer is already in right place
245  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
246  fread((int8_t*)&elem_min, sizeof(Datum), 1, f);
247  fread((int8_t*)&elem_max, sizeof(Datum), 1, f);
248  fread((int8_t*)&has_nulls, sizeof(bool), 1, f);
249  fread((int8_t*)&initialized, sizeof(bool), 1, f);
250  }
251 
252  void copyMetadata(const Encoder* copyFromEncoder) override {
253  num_elems_ = copyFromEncoder->getNumElems();
254  auto array_encoder = dynamic_cast<const ArrayNoneEncoder*>(copyFromEncoder);
255  elem_min = array_encoder->elem_min;
256  elem_max = array_encoder->elem_max;
257  has_nulls = array_encoder->has_nulls;
258  initialized = array_encoder->initialized;
259  }
260 
261  AbstractBuffer* getIndexBuf() const { return index_buf; }
262 
263  bool resetChunkStats(const ChunkStats& stats) override {
264  auto elem_type = buffer_->getSqlType().get_elem_type();
265  if (DatumEqual(elem_min, stats.min, elem_type) &&
266  DatumEqual(elem_max, stats.max, elem_type) && has_nulls == stats.has_nulls) {
267  return false;
268  }
269  elem_min = stats.min;
270  elem_max = stats.max;
271  has_nulls = stats.has_nulls;
272  return true;
273  }
274 
275  void resetChunkStats() override {
276  has_nulls = false;
277  initialized = false;
278  }
279 
282  bool has_nulls;
285  std::unique_lock<std::mutex> lock(EncoderMutex_);
286  index_buf = buf;
287  }
288 
// Byte count that appendData uses as the initial offset and as the starting data
// size when the first array written to an empty chunk is NULL.
289  static constexpr size_t DEFAULT_NULL_PADDING_SIZE{8};
290 
291  private:
292  std::mutex EncoderMutex_;
295 
// Folds one array into the chunk-level element statistics (elem_min, elem_max,
// has_nulls, initialized).
//
// A NULL array sets has_nulls. The switch then dispatches on the subtype of the
// buffer's SQL type and every case follows the same pattern:
//  - while no value has been recorded yet (!initialized), min/max are set to an
//    inverted pair (min greater than max);
//  - NULL or zero-length arrays are not scanned any further;
//  - otherwise array.pointer is reinterpreted as the element type and each element is
//    examined: the type's NULL sentinel sets has_nulls, the first real value seeds
//    both min and max and sets initialized, and later values widen the range.
// Subtypes without a case hit UNREACHABLE().
296  void update_elem_stats(const ArrayDatum& array) {
297  if (array.is_null) {
298  has_nulls = true;
299  }
300  switch (buffer_->getSqlType().get_subtype()) {
301  case kBOOLEAN: {
302  if (!initialized) {
303  elem_min.boolval = true;
304  elem_max.boolval = false;
305  }
306  if (array.is_null || array.length == 0) {
307  break;
308  }
309  const bool* bool_array = (bool*)array.pointer;
310  for (size_t i = 0; i < array.length / sizeof(bool); i++) {
// The element is compared as int8_t because NULL_BOOLEAN is checked on the raw byte.
311  if ((int8_t)bool_array[i] == NULL_BOOLEAN) {
312  has_nulls = true;
313  } else if (initialized) {
314  elem_min.boolval = std::min(elem_min.boolval, bool_array[i]);
315  elem_max.boolval = std::max(elem_max.boolval, bool_array[i]);
316  } else {
317  elem_min.boolval = bool_array[i];
318  elem_max.boolval = bool_array[i];
319  initialized = true;
320  }
321  }
322  break;
323  }
324  case kINT: {
325  if (!initialized) {
326  elem_min.intval = 1;
327  elem_max.intval = 0;
328  }
329  if (array.is_null || array.length == 0) {
330  break;
331  }
332  const int32_t* int_array = (int32_t*)array.pointer;
333  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
334  if (int_array[i] == NULL_INT) {
335  has_nulls = true;
336  } else if (initialized) {
337  elem_min.intval = std::min(elem_min.intval, int_array[i]);
338  elem_max.intval = std::max(elem_max.intval, int_array[i]);
339  } else {
340  elem_min.intval = int_array[i];
341  elem_max.intval = int_array[i];
342  initialized = true;
343  }
344  }
345  break;
346  }
347  case kSMALLINT: {
348  if (!initialized) {
349  elem_min.smallintval = 1;
350  elem_max.smallintval = 0;
351  }
352  if (array.is_null || array.length == 0) {
353  break;
354  }
355  const int16_t* int_array = (int16_t*)array.pointer;
356  for (size_t i = 0; i < array.length / sizeof(int16_t); i++) {
357  if (int_array[i] == NULL_SMALLINT) {
358  has_nulls = true;
359  } else if (initialized) {
360  elem_min.smallintval = std::min(elem_min.smallintval, int_array[i]);
361  elem_max.smallintval = std::max(elem_max.smallintval, int_array[i]);
362  } else {
363  elem_min.smallintval = int_array[i];
364  elem_max.smallintval = int_array[i];
365  initialized = true;
366  }
367  }
368  break;
369  }
370  case kTINYINT: {
371  if (!initialized) {
372  elem_min.tinyintval = 1;
373  elem_max.tinyintval = 0;
374  }
375  if (array.is_null || array.length == 0) {
376  break;
377  }
378  const int8_t* int_array = (int8_t*)array.pointer;
379  for (size_t i = 0; i < array.length / sizeof(int8_t); i++) {
380  if (int_array[i] == NULL_TINYINT) {
381  has_nulls = true;
382  } else if (initialized) {
383  elem_min.tinyintval = std::min(elem_min.tinyintval, int_array[i]);
384  elem_max.tinyintval = std::max(elem_max.tinyintval, int_array[i]);
385  } else {
386  elem_min.tinyintval = int_array[i];
387  elem_max.tinyintval = int_array[i];
388  initialized = true;
389  }
390  }
391  break;
392  }
// Numeric and decimal elements are read as 64-bit integers, same as kBIGINT.
393  case kBIGINT:
394  case kNUMERIC:
395  case kDECIMAL: {
396  if (!initialized) {
397  elem_min.bigintval = 1;
398  elem_max.bigintval = 0;
399  }
400  if (array.is_null || array.length == 0) {
401  break;
402  }
403  const int64_t* int_array = (int64_t*)array.pointer;
404  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
405  if (int_array[i] == NULL_BIGINT) {
406  has_nulls = true;
407  } else if (initialized) {
408  elem_min.bigintval = std::min(elem_min.bigintval, int_array[i]);
409  elem_max.bigintval = std::max(elem_max.bigintval, int_array[i]);
410  } else {
411  elem_min.bigintval = int_array[i];
412  elem_max.bigintval = int_array[i];
413  initialized = true;
414  }
415  }
416  break;
417  }
418  case kFLOAT: {
419  if (!initialized) {
420  elem_min.floatval = 1.0;
421  elem_max.floatval = 0.0;
422  }
423  if (array.is_null || array.length == 0) {
424  break;
425  }
426  const float* flt_array = (float*)array.pointer;
427  for (size_t i = 0; i < array.length / sizeof(float); i++) {
428  if (flt_array[i] == NULL_FLOAT) {
429  has_nulls = true;
430  } else if (initialized) {
431  elem_min.floatval = std::min(elem_min.floatval, flt_array[i]);
432  elem_max.floatval = std::max(elem_max.floatval, flt_array[i]);
433  } else {
434  elem_min.floatval = flt_array[i];
435  elem_max.floatval = flt_array[i];
436  initialized = true;
437  }
438  }
439  break;
440  }
441  case kDOUBLE: {
442  if (!initialized) {
443  elem_min.doubleval = 1.0;
444  elem_max.doubleval = 0.0;
445  }
446  if (array.is_null || array.length == 0) {
447  break;
448  }
449  const double* dbl_array = (double*)array.pointer;
450  for (size_t i = 0; i < array.length / sizeof(double); i++) {
451  if (dbl_array[i] == NULL_DOUBLE) {
452  has_nulls = true;
453  } else if (initialized) {
454  elem_min.doubleval = std::min(elem_min.doubleval, dbl_array[i]);
455  elem_max.doubleval = std::max(elem_max.doubleval, dbl_array[i]);
456  } else {
457  elem_min.doubleval = dbl_array[i];
458  elem_max.doubleval = dbl_array[i];
459  initialized = true;
460  }
461  }
462  break;
463  }
// Time, timestamp and date elements are read as 64-bit integers and tracked in bigintval.
464  case kTIME:
465  case kTIMESTAMP:
466  case kDATE: {
467  if (!initialized) {
468  elem_min.bigintval = 1;
469  elem_max.bigintval = 0;
470  }
471  if (array.is_null || array.length == 0) {
472  break;
473  }
474  const auto tm_array = reinterpret_cast<int64_t*>(array.pointer);
475  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
476  if (tm_array[i] == NULL_BIGINT) {
477  has_nulls = true;
478  } else if (initialized) {
479  elem_min.bigintval = std::min(elem_min.bigintval, tm_array[i]);
480  elem_max.bigintval = std::max(elem_max.bigintval, tm_array[i]);
481  } else {
482  elem_min.bigintval = tm_array[i];
483  elem_max.bigintval = tm_array[i];
484  initialized = true;
485  }
486  }
487  break;
488  }
// String-typed elements are read as 32-bit integers and tracked in intval.
// NOTE(review): listing line 492 is missing from this extract right after the case
// labels; presumably it asserts that the strings are stored as integer ids — restore it
// from the real header and confirm.
489  case kCHAR:
490  case kVARCHAR:
491  case kTEXT: {
493  if (!initialized) {
494  elem_min.intval = 1;
495  elem_max.intval = 0;
496  }
497  if (array.is_null || array.length == 0) {
498  break;
499  }
500  const int32_t* int_array = (int32_t*)array.pointer;
501  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
502  if (int_array[i] == NULL_INT) {
503  has_nulls = true;
504  } else if (initialized) {
505  elem_min.intval = std::min(elem_min.intval, int_array[i]);
506  elem_max.intval = std::max(elem_max.intval, int_array[i]);
507  } else {
508  elem_min.intval = int_array[i];
509  elem_max.intval = int_array[i];
510  initialized = true;
511  }
512  }
513  break;
514  }
515  default:
516  UNREACHABLE();
517  }
518  };
519 
520 }; // class ArrayNoneEncoder
521 
522 #endif // ARRAY_NONE_ENCODER_H
int8_t tinyintval
Definition: sqltypes.h:206
void update_elem_stats(const ArrayDatum &array)
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:315
#define CHECK_EQ(x, y)
Definition: Logger.h:205
size_t getNumElemsForBytesInsertData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
#define NULL_DOUBLE
size_t num_elems_
Definition: Encoder.h:237
Definition: sqltypes.h:48
std::shared_ptr< ChunkMetadata > appendData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const bool replicating)
#define NULL_FLOAT
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
void updateStats(const double, const bool) override
#define NULL_BIGINT
std::mutex EncoderMutex_
bool boolval
Definition: sqltypes.h:205
void updateStats(const int8_t *const src_data, const size_t num_elements) override
#define UNREACHABLE()
Definition: Logger.h:241
bool has_nulls
Definition: ChunkMetadata.h:28
int32_t intval
Definition: sqltypes.h:208
#define NULL_INT
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:202
float floatval
Definition: sqltypes.h:210
CONSTEXPR DEVICE bool is_null(const T &value)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:239
bool DatumEqual(const Datum a, const Datum b, const SQLTypeInfo &ti)
Definition: Datum.cpp:306
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
void updateStats(const std::vector< ArrayDatum > *const src_data, const size_t start_idx, const size_t num_elements) override
void resetChunkStats() override
int64_t bigintval
Definition: sqltypes.h:209
size_t getNumElems() const
Definition: Encoder.h:233
bool resetChunkStats(const ChunkStats &stats) override
Reset chunk-level stats (min, max, nulls) using the new values from the argument.
int16_t smallintval
Definition: sqltypes.h:207
An AbstractBuffer is a unit of data management for a data manager.
ArrayNoneEncoder(AbstractBuffer *buffer)
#define NULL_BOOLEAN
void setIndexBuffer(AbstractBuffer *buf)
AbstractBuffer * index_buf
Definition: sqltypes.h:51
Definition: sqltypes.h:52
void readMetadata(FILE *f) override
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:322
static constexpr size_t DEFAULT_NULL_PADDING_SIZE
int32_t ArrayOffsetT
Definition: sqltypes.h:937
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
std::shared_ptr< ChunkMetadata > getMetadata(const SQLTypeInfo &ti) override
Definition: sqltypes.h:40
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
SQLTypeInfo getSqlType() const
#define NULL_TINYINT
void reduceStats(const Encoder &) override
bool g_enable_watchdog false
Definition: Execute.cpp:76
ArrayOffsetT last_offset
#define CHECK(condition)
Definition: Logger.h:197
#define NULL_SMALLINT
char * f
void writeMetadata(FILE *f) override
Definition: sqltypes.h:44
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:712
virtual void reserve(size_t num_bytes)=0
AbstractBuffer * getIndexBuf() const
void updateStats(const int64_t, const bool) override
void copyMetadata(const Encoder *copyFromEncoder) override
void updateStats(const std::vector< std::string > *const src_data, const size_t start_idx, const size_t num_elements) override
double doubleval
Definition: sqltypes.h:211