OmniSciDB  04ee39c94c
ArrayNoneEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #ifndef ARRAY_NONE_ENCODER_H
25 #define ARRAY_NONE_ENCODER_H
26 
27 #include "Shared/Logger.h"
28 
29 #include <cassert>
30 #include <cstring>
31 #include <memory>
32 #include <mutex>
33 #include <string>
34 #include <vector>
35 #include "AbstractBuffer.h"
36 #include "ChunkMetadata.h"
37 #include "Encoder.h"
38 
40 
41 class ArrayNoneEncoder : public Encoder {
42  public:
44  : Encoder(buffer)
45  , has_nulls(false)
46  , initialized(false)
47  , index_buf(nullptr)
48  , last_offset(-1) {}
49 
50  size_t getNumElemsForBytesInsertData(const std::vector<ArrayDatum>* srcData,
51  const int start_idx,
52  const size_t numAppendElems,
53  const size_t byteLimit,
54  const bool replicating = false) {
55  size_t dataSize = 0;
56 
57  size_t n = start_idx;
58  for (; n < start_idx + numAppendElems; n++) {
59  size_t len = (*srcData)[replicating ? 0 : n].length;
60  if (dataSize + len > byteLimit) {
61  break;
62  }
63  dataSize += len;
64  }
65  return n - start_idx;
66  }
67 
68  ChunkMetadata appendData(int8_t*& srcData,
69  const size_t numAppendElems,
70  const SQLTypeInfo&,
71  const bool replicating = false) override {
72  assert(false); // should never be called for arrays
73  return ChunkMetadata{};
74  }
75 
76  ChunkMetadata appendData(const std::vector<ArrayDatum>* srcData,
77  const int start_idx,
78  const size_t numAppendElems,
79  const bool replicating) {
80  assert(index_buf != nullptr); // index_buf must be set before this.
81  size_t index_size = numAppendElems * sizeof(ArrayOffsetT);
82  if (num_elems_ == 0) {
83  index_size += sizeof(ArrayOffsetT); // plus one for the initial offset
84  }
85  index_buf->reserve(index_size);
86 
87  bool first_elem_is_null = false;
88  ArrayOffsetT initial_offset = 0;
89  if (num_elems_ == 0) {
90  // If the very first ArrayDatum is NULL, initial offset will be set to 4
91  // so we could negate it and write it out to index buffer to convey NULLness
92  if ((*srcData)[0].is_null) {
93  initial_offset = 4;
94  first_elem_is_null = true;
95  }
96  index_buf->append((int8_t*)&initial_offset,
97  sizeof(ArrayOffsetT)); // write the inital offset
98  last_offset = initial_offset;
99  } else {
100  // Valid last_offset is never negative
101  // always need to read a valid last offset from buffer/disk
102  // b/c now due to vacuum "last offset" may go backward and if
103  // index chunk was not reloaded last_offset would go way off!
104  index_buf->read((int8_t*)&last_offset,
105  sizeof(ArrayOffsetT),
106  index_buf->size() - sizeof(ArrayOffsetT),
108  assert(last_offset != -1);
109  // If the loaded offset is negative it means the last value was a NULL array,
110  // convert to a valid last offset
111  if (last_offset < 0) {
112  last_offset = -last_offset;
113  }
114  }
115  // Need to start data from 4 byte offset if first array encoded is a NULL array
116  size_t data_size = (first_elem_is_null) ? 4 : 0;
117  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
118  // NULL arrays don't take any space so don't add to the data size
119  if ((*srcData)[replicating ? 0 : n].is_null) {
120  continue;
121  }
122  data_size += (*srcData)[replicating ? 0 : n].length;
123  }
124  buffer_->reserve(data_size);
125 
126  size_t inbuf_size =
127  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
128  auto inbuf = new int8_t[inbuf_size];
129  std::unique_ptr<int8_t[]> gc_inbuf(inbuf);
130  for (size_t num_appended = 0; num_appended < numAppendElems;) {
131  ArrayOffsetT* p = (ArrayOffsetT*)inbuf;
132  size_t i;
133  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(ArrayOffsetT);
134  i++, num_appended++) {
135  p[i] =
136  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length;
137  last_offset = p[i];
138  if ((*srcData)[replicating ? 0 : num_appended + start_idx].is_null) {
139  // Record array NULLness in the index buffer
140  p[i] = -p[i];
141  }
142  }
143  index_buf->append(inbuf, i * sizeof(ArrayOffsetT));
144  }
145 
146  // Pad buffer_ with 4 bytes if first encoded array is a NULL array
147  if (first_elem_is_null) {
148  buffer_->append(inbuf, 4);
149  }
150  for (size_t num_appended = 0; num_appended < numAppendElems;) {
151  size_t size = 0;
152  for (int i = start_idx + num_appended;
153  num_appended < numAppendElems && size < inbuf_size;
154  i++, num_appended++) {
155  if ((*srcData)[replicating ? 0 : i].is_null) {
156  continue; // NULL arrays don't take up any space in the data buffer
157  }
158  size_t len = (*srcData)[replicating ? 0 : i].length;
159  if (len > inbuf_size) {
160  // for large strings, append on its own
161  if (size > 0) {
162  buffer_->append(inbuf, size);
163  }
164  size = 0;
165  buffer_->append((*srcData)[replicating ? 0 : i].pointer, len);
166  num_appended++;
167  break;
168  } else if (size + len > inbuf_size) {
169  break;
170  }
171  char* dest = (char*)inbuf + size;
172  if (len > 0) {
173  std::memcpy((void*)dest, (void*)(*srcData)[replicating ? 0 : i].pointer, len);
174  size += len;
175  }
176  }
177  if (size > 0) {
178  buffer_->append(inbuf, size);
179  }
180  }
181  // make sure buffer_ is flushed even if no new data is appended to it
182  // (e.g. empty strings) because the metadata needs to be flushed.
183  if (!buffer_->isDirty()) {
184  buffer_->setDirty();
185  }
186 
187  // keep Chunk statistics with array elements
188  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
189  update_elem_stats((*srcData)[replicating ? 0 : n]);
190  }
191  num_elems_ += numAppendElems;
192  ChunkMetadata chunkMetadata;
193  getMetadata(chunkMetadata);
194  return chunkMetadata;
195  }
196 
197  void getMetadata(ChunkMetadata& chunkMetadata) override {
198  Encoder::getMetadata(chunkMetadata); // call on parent class
199  chunkMetadata.fillChunkStats(elem_min, elem_max, has_nulls);
200  }
201 
202  // Only called from the executor for synthesized meta-information.
203  ChunkMetadata getMetadata(const SQLTypeInfo& ti) override {
204  ChunkMetadata chunk_metadata{ti, 0, 0, ChunkStats{elem_min, elem_max, has_nulls}};
205  return chunk_metadata;
206  }
207 
208  void updateStats(const int64_t, const bool) override { CHECK(false); }
209 
210  void updateStats(const double, const bool) override { CHECK(false); }
211 
212  void reduceStats(const Encoder&) override { CHECK(false); }
213 
214  void writeMetadata(FILE* f) override {
215  // assumes pointer is already in right place
216  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
217  fwrite((int8_t*)&elem_min, sizeof(Datum), 1, f);
218  fwrite((int8_t*)&elem_max, sizeof(Datum), 1, f);
219  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
220  fwrite((int8_t*)&initialized, sizeof(bool), 1, f);
221  }
222 
223  void readMetadata(FILE* f) override {
224  // assumes pointer is already in right place
225  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
226  fread((int8_t*)&elem_min, sizeof(Datum), 1, f);
227  fread((int8_t*)&elem_max, sizeof(Datum), 1, f);
228  fread((int8_t*)&has_nulls, sizeof(bool), 1, f);
229  fread((int8_t*)&initialized, sizeof(bool), 1, f);
230  }
231 
232  void copyMetadata(const Encoder* copyFromEncoder) override {
233  num_elems_ = copyFromEncoder->getNumElems();
234  auto array_encoder = dynamic_cast<const ArrayNoneEncoder*>(copyFromEncoder);
235  elem_min = array_encoder->elem_min;
236  elem_max = array_encoder->elem_max;
237  has_nulls = array_encoder->has_nulls;
238  initialized = array_encoder->initialized;
239  }
240 
242 
245  bool has_nulls;
248  std::unique_lock<std::mutex> lock(EncoderMutex_);
249  index_buf = buf;
250  }
251 
252  private:
253  std::mutex EncoderMutex_;
256 
257  void update_elem_stats(const ArrayDatum& array) {
258  if (array.is_null) {
259  has_nulls = true;
260  }
261  switch (buffer_->sqlType.get_subtype()) {
262  case kBOOLEAN: {
263  if (!initialized) {
264  elem_min.boolval = true;
265  elem_max.boolval = false;
266  }
267  if (array.is_null || array.length == 0) {
268  break;
269  }
270  const bool* bool_array = (bool*)array.pointer;
271  for (size_t i = 0; i < array.length / sizeof(bool); i++) {
272  if ((int8_t)bool_array[i] == NULL_BOOLEAN) {
273  has_nulls = true;
274  } else if (initialized) {
275  elem_min.boolval = std::min(elem_min.boolval, bool_array[i]);
276  elem_max.boolval = std::max(elem_max.boolval, bool_array[i]);
277  } else {
278  elem_min.boolval = bool_array[i];
279  elem_max.boolval = bool_array[i];
280  initialized = true;
281  }
282  }
283  break;
284  }
285  case kINT: {
286  if (!initialized) {
287  elem_min.intval = 1;
288  elem_max.intval = 0;
289  }
290  if (array.is_null || array.length == 0) {
291  break;
292  }
293  const int32_t* int_array = (int32_t*)array.pointer;
294  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
295  if (int_array[i] == NULL_INT) {
296  has_nulls = true;
297  } else if (initialized) {
298  elem_min.intval = std::min(elem_min.intval, int_array[i]);
299  elem_max.intval = std::max(elem_max.intval, int_array[i]);
300  } else {
301  elem_min.intval = int_array[i];
302  elem_max.intval = int_array[i];
303  initialized = true;
304  }
305  }
306  break;
307  }
308  case kSMALLINT: {
309  if (!initialized) {
310  elem_min.smallintval = 1;
311  elem_max.smallintval = 0;
312  }
313  if (array.is_null || array.length == 0) {
314  break;
315  }
316  const int16_t* int_array = (int16_t*)array.pointer;
317  for (size_t i = 0; i < array.length / sizeof(int16_t); i++) {
318  if (int_array[i] == NULL_SMALLINT) {
319  has_nulls = true;
320  } else if (initialized) {
321  elem_min.smallintval = std::min(elem_min.smallintval, int_array[i]);
322  elem_max.smallintval = std::max(elem_max.smallintval, int_array[i]);
323  } else {
324  elem_min.smallintval = int_array[i];
325  elem_max.smallintval = int_array[i];
326  initialized = true;
327  }
328  }
329  break;
330  }
331  case kTINYINT: {
332  if (!initialized) {
333  elem_min.tinyintval = 1;
334  elem_max.tinyintval = 0;
335  }
336  if (array.is_null || array.length == 0) {
337  break;
338  }
339  const int8_t* int_array = (int8_t*)array.pointer;
340  for (size_t i = 0; i < array.length / sizeof(int8_t); i++) {
341  if (int_array[i] == NULL_TINYINT) {
342  has_nulls = true;
343  } else if (initialized) {
344  elem_min.tinyintval = std::min(elem_min.tinyintval, int_array[i]);
345  elem_max.tinyintval = std::max(elem_max.tinyintval, int_array[i]);
346  } else {
347  elem_min.tinyintval = int_array[i];
348  elem_max.tinyintval = int_array[i];
349  initialized = true;
350  }
351  }
352  break;
353  }
354  case kBIGINT:
355  case kNUMERIC:
356  case kDECIMAL: {
357  if (!initialized) {
358  elem_min.bigintval = 1;
359  elem_max.bigintval = 0;
360  }
361  if (array.is_null || array.length == 0) {
362  break;
363  }
364  const int64_t* int_array = (int64_t*)array.pointer;
365  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
366  if (int_array[i] == NULL_BIGINT) {
367  has_nulls = true;
368  } else if (initialized) {
369  elem_min.bigintval = std::min(elem_min.bigintval, int_array[i]);
370  elem_max.bigintval = std::max(elem_max.bigintval, int_array[i]);
371  } else {
372  elem_min.bigintval = int_array[i];
373  elem_max.bigintval = int_array[i];
374  initialized = true;
375  }
376  }
377  break;
378  }
379  case kFLOAT: {
380  if (!initialized) {
381  elem_min.floatval = 1.0;
382  elem_max.floatval = 0.0;
383  }
384  if (array.is_null || array.length == 0) {
385  break;
386  }
387  const float* flt_array = (float*)array.pointer;
388  for (size_t i = 0; i < array.length / sizeof(float); i++) {
389  if (flt_array[i] == NULL_FLOAT) {
390  has_nulls = true;
391  } else if (initialized) {
392  elem_min.floatval = std::min(elem_min.floatval, flt_array[i]);
393  elem_max.floatval = std::max(elem_max.floatval, flt_array[i]);
394  } else {
395  elem_min.floatval = flt_array[i];
396  elem_max.floatval = flt_array[i];
397  initialized = true;
398  }
399  }
400  break;
401  }
402  case kDOUBLE: {
403  if (!initialized) {
404  elem_min.doubleval = 1.0;
405  elem_max.doubleval = 0.0;
406  }
407  if (array.is_null || array.length == 0) {
408  break;
409  }
410  const double* dbl_array = (double*)array.pointer;
411  for (size_t i = 0; i < array.length / sizeof(double); i++) {
412  if (dbl_array[i] == NULL_DOUBLE) {
413  has_nulls = true;
414  } else if (initialized) {
415  elem_min.doubleval = std::min(elem_min.doubleval, dbl_array[i]);
416  elem_max.doubleval = std::max(elem_max.doubleval, dbl_array[i]);
417  } else {
418  elem_min.doubleval = dbl_array[i];
419  elem_max.doubleval = dbl_array[i];
420  initialized = true;
421  }
422  }
423  break;
424  }
425  case kTIME:
426  case kTIMESTAMP:
427  case kDATE: {
428  if (!initialized) {
429  elem_min.bigintval = 1;
430  elem_max.bigintval = 0;
431  }
432  if (array.is_null || array.length == 0) {
433  break;
434  }
435  const auto tm_array = reinterpret_cast<int64_t*>(array.pointer);
436  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
437  if (tm_array[i] == NULL_BIGINT) {
438  has_nulls = true;
439  } else if (initialized) {
440  elem_min.bigintval = std::min(elem_min.bigintval, tm_array[i]);
441  elem_max.bigintval = std::max(elem_max.bigintval, tm_array[i]);
442  } else {
443  elem_min.bigintval = tm_array[i];
444  elem_max.bigintval = tm_array[i];
445  initialized = true;
446  }
447  }
448  break;
449  }
450  case kCHAR:
451  case kVARCHAR:
452  case kTEXT: {
454  if (!initialized) {
455  elem_min.intval = 1;
456  elem_max.intval = 0;
457  }
458  if (array.is_null || array.length == 0) {
459  break;
460  }
461  const int32_t* int_array = (int32_t*)array.pointer;
462  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
463  if (int_array[i] == NULL_INT) {
464  has_nulls = true;
465  } else if (initialized) {
466  elem_min.intval = std::min(elem_min.intval, int_array[i]);
467  elem_max.intval = std::max(elem_max.intval, int_array[i]);
468  } else {
469  elem_min.intval = int_array[i];
470  elem_max.intval = int_array[i];
471  initialized = true;
472  }
473  }
474  break;
475  }
476  default:
477  assert(false);
478  }
479  };
480 
481 }; // class ArrayNoneEncoder
482 
483 #endif // ARRAY_NONE_ENCODER_H
int8_t tinyintval
Definition: sqltypes.h:123
void update_elem_stats(const ArrayDatum &array)
size_t getNumElemsForBytesInsertData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
#define NULL_DOUBLE
Definition: sqltypes.h:176
size_t num_elems_
Definition: Encoder.h:179
Definition: sqltypes.h:51
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
void updateStats(const double, const bool) override
#define NULL_BIGINT
Definition: sqltypes.h:174
std::mutex EncoderMutex_
bool boolval
Definition: sqltypes.h:122
virtual size_t size() const =0
void fillChunkStats(const T min, const T max, const bool has_nulls)
Definition: ChunkMetadata.h:38
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:331
int32_t intval
Definition: sqltypes.h:125
AbstractBuffer * get_index_buf() const
void set_index_buf(AbstractBuffer *buf)
virtual void reserve(size_t numBytes)=0
ChunkMetadata appendData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const bool replicating)
float floatval
Definition: sqltypes.h:127
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:181
#define NULL_TINYINT
Definition: sqltypes.h:171
virtual void getMetadata(ChunkMetadata &chunkMetadata)
Definition: Encoder.cpp:227
virtual void append(int8_t *src, const size_t numBytes, const MemoryLevel srcBufferType=CPU_LEVEL, const int deviceId=-1)=0
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:324
int64_t bigintval
Definition: sqltypes.h:126
#define NULL_FLOAT
Definition: sqltypes.h:175
int16_t smallintval
Definition: sqltypes.h:124
An AbstractBuffer is a unit of data management for a data manager.
ArrayNoneEncoder(AbstractBuffer *buffer)
#define NULL_INT
Definition: sqltypes.h:173
AbstractBuffer * index_buf
Definition: sqltypes.h:54
Definition: sqltypes.h:55
void readMetadata(FILE *f) override
bool is_null(const T &v, const SQLTypeInfo &t)
int32_t ArrayOffsetT
Definition: sqltypes.h:878
virtual void read(int8_t *const dst, const size_t numBytes, const size_t offset=0, const MemoryLevel dstBufferType=CPU_LEVEL, const int dstDeviceId=-1)=0
ChunkMetadata appendData(int8_t *&srcData, const size_t numAppendElems, const SQLTypeInfo &, const bool replicating=false) override
Definition: sqltypes.h:43
void reduceStats(const Encoder &) override
#define NULL_SMALLINT
Definition: sqltypes.h:172
size_t getNumElems() const
Definition: Encoder.h:175
ArrayOffsetT last_offset
#define CHECK(condition)
Definition: Logger.h:187
ChunkMetadata getMetadata(const SQLTypeInfo &ti) override
virtual bool isDirty() const
void getMetadata(ChunkMetadata &chunkMetadata) override
void writeMetadata(FILE *f) override
Definition: sqltypes.h:47
void updateStats(const int64_t, const bool) override
#define NULL_BOOLEAN
Definition: sqltypes.h:170
void copyMetadata(const Encoder *copyFromEncoder) override
std::conditional_t< isCudaCC(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:119
double doubleval
Definition: sqltypes.h:128