OmniSciDB  72180abbfe
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ArrayNoneEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #ifndef ARRAY_NONE_ENCODER_H
25 #define ARRAY_NONE_ENCODER_H
26 
27 #include "Shared/Logger.h"
28 
29 #include <cassert>
30 #include <cstring>
31 #include <memory>
32 #include <mutex>
33 #include <string>
34 #include <vector>
35 #include "AbstractBuffer.h"
36 #include "ChunkMetadata.h"
37 #include "Encoder.h"
38 
40 
41 class ArrayNoneEncoder : public Encoder {
42  public:
44  : Encoder(buffer)
45  , has_nulls(false)
47  , index_buf(nullptr)
48  , last_offset(-1) {}
49 
50  size_t getNumElemsForBytesInsertData(const std::vector<ArrayDatum>* srcData,
51  const int start_idx,
52  const size_t numAppendElems,
53  const size_t byteLimit,
54  const bool replicating = false) {
55  size_t dataSize = 0;
56 
57  size_t n = start_idx;
58  for (; n < start_idx + numAppendElems; n++) {
59  size_t len = (*srcData)[replicating ? 0 : n].length;
60  if (dataSize + len > byteLimit) {
61  break;
62  }
63  dataSize += len;
64  }
65  return n - start_idx;
66  }
67 
68  std::shared_ptr<ChunkMetadata> appendData(int8_t*& src_data,
69  const size_t num_elems_to_append,
70  const SQLTypeInfo& ti,
71  const bool replicating = false,
72  const int64_t offset = -1) override {
73  UNREACHABLE(); // should never be called for arrays
74  return nullptr;
75  }
76 
// Appends numAppendElems ArrayDatums from srcData (starting at start_idx) to this
// encoder and returns freshly built chunk metadata.
//
// Two buffers are written:
//  * index_buf receives one ArrayOffsetT per appended array (plus one initial offset
//    when the encoder is still empty). Each entry is the running end offset
//    (previous offset + datum length); the entry is negated when the datum is NULL.
//  * buffer_ receives the concatenated payload bytes of the non-NULL arrays.
//
// When `replicating` is true, datum 0 of srcData is used for every appended element.
// index_buf must have been set before calling (enforced by the CHECK below).
//
// NOTE(review): this listing is missing two original lines inside this function (the
// end of the index_buf->read(...) call and the body of `if (last_offset < 0)`); the
// code below is reproduced exactly as shown - confirm against the upstream file.
77  std::shared_ptr<ChunkMetadata> appendData(const std::vector<ArrayDatum>* srcData,
78  const int start_idx,
79  const size_t numAppendElems,
80  const bool replicating) {
81  CHECK(index_buf != nullptr); // index_buf must be set before this.
// Bytes of index entries this call will add: one offset per appended array.
82  size_t index_size = numAppendElems * sizeof(ArrayOffsetT);
83  if (num_elems_ == 0) {
84  index_size += sizeof(ArrayOffsetT); // plus one for the initial offset
85  }
86  index_buf->reserve(index_size);
87 
88  bool first_elem_is_null = false;
89  ArrayOffsetT initial_offset = 0;
90  if (num_elems_ == 0) {
91  // If the very first ArrayDatum is NULL, initial offset will be set to 8
92  // so we could negate it and write it out to index buffer to convey NULLness
// NOTE(review): this tests element 0 of srcData, not the element at start_idx -
// presumably callers pass start_idx == 0 for the first append; verify.
93  if ((*srcData)[0].is_null) {
94  initial_offset = 8;
95  first_elem_is_null = true;
96  }
97  index_buf->append((int8_t*)&initial_offset,
98  sizeof(ArrayOffsetT)); // write the initial offset
99  last_offset = initial_offset;
100  } else {
101  // Valid last_offset is never negative
102  // always need to read a valid last offset from buffer/disk
103  // b/c now due to vacuum "last offset" may go backward and if
104  // index chunk was not reloaded last_offset would go way off!
// Reads the final ArrayOffsetT currently stored in index_buf into last_offset.
// NOTE(review): the closing of this call is not visible in this listing.
105  index_buf->read((int8_t*)&last_offset,
106  sizeof(ArrayOffsetT),
107  index_buf->size() - sizeof(ArrayOffsetT),
109  CHECK(last_offset != -1);
110  // If the loaded offset is negative it means the last value was a NULL array,
111  // convert to a valid last offset
// NOTE(review): the body of this `if` is not visible in this listing; per the
// comment above it presumably turns the negative offset positive - confirm.
112  if (last_offset < 0) {
114  }
115  }
116  // Need to start data from 8 byte offset if first array encoded is a NULL array
117  size_t data_size = (first_elem_is_null) ? 8 : 0;
// First pass over the input: total payload bytes, used to reserve buffer_ space and
// to size the staging buffer.
118  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
119  // NULL arrays don't take any space so don't add to the data size
120  if ((*srcData)[replicating ? 0 : n].is_null) {
121  continue;
122  }
123  data_size += (*srcData)[replicating ? 0 : n].length;
124  }
125  buffer_->reserve(data_size);
126 
// One staging buffer is shared by the index pass and the data pass; it is capped at
// MAX_INPUT_BUF_SIZE, so both passes work in batches.
127  size_t inbuf_size =
128  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
129  auto gc_inbuf = std::make_unique<int8_t[]>(inbuf_size);
130  auto inbuf = gc_inbuf.get();
// Index pass: fill the staging buffer with as many offsets as fit, append, repeat.
131  for (size_t num_appended = 0; num_appended < numAppendElems;) {
132  ArrayOffsetT* p = (ArrayOffsetT*)inbuf;
133  size_t i;
134  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(ArrayOffsetT);
135  i++, num_appended++) {
// The running offset advances by the datum's length before the NULL test, so
// last_offset always holds the non-negated value.
136  p[i] =
137  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length;
138  last_offset = p[i];
139  if ((*srcData)[replicating ? 0 : num_appended + start_idx].is_null) {
140  // Record array NULLness in the index buffer
141  p[i] = -p[i];
142  }
143  }
144  index_buf->append(inbuf, i * sizeof(ArrayOffsetT));
145  }
146 
147  // Pad buffer_ with 8 bytes if first encoded array is a NULL array
// The pad bytes are whatever the staging buffer currently holds.
148  if (first_elem_is_null) {
149  buffer_->append(inbuf, 8);
150  }
// Data pass: pack payloads into the staging buffer and append it whenever the next
// payload would not fit; a payload larger than the whole staging buffer is appended
// directly from its source pointer.
151  for (size_t num_appended = 0; num_appended < numAppendElems;) {
152  size_t size = 0;
153  for (int i = start_idx + num_appended;
154  num_appended < numAppendElems && size < inbuf_size;
155  i++, num_appended++) {
156  if ((*srcData)[replicating ? 0 : i].is_null) {
157  continue; // NULL arrays don't take up any space in the data buffer
158  }
159  size_t len = (*srcData)[replicating ? 0 : i].length;
160  if (len > inbuf_size) {
161  // for large strings, append on its own
// Flush what is already staged first so payload order is preserved.
162  if (size > 0) {
163  buffer_->append(inbuf, size);
164  }
165  size = 0;
166  buffer_->append((*srcData)[replicating ? 0 : i].pointer, len);
// Count this element by hand: the break below skips the loop's own increment.
167  num_appended++;
168  break;
169  } else if (size + len > inbuf_size) {
// Does not fit in the current batch; leave it for the next outer iteration.
170  break;
171  }
172  char* dest = (char*)inbuf + size;
173  if (len > 0) {
174  std::memcpy((void*)dest, (void*)(*srcData)[replicating ? 0 : i].pointer, len);
175  size += len;
176  }
177  }
178  if (size > 0) {
179  buffer_->append(inbuf, size);
180  }
181  }
182  // make sure buffer_ is flushed even if no new data is appended to it
183  // (e.g. empty strings) because the metadata needs to be flushed.
184  if (!buffer_->isDirty()) {
185  buffer_->setDirty();
186  }
187 
188  // keep Chunk statistics with array elements
189  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
190  update_elem_stats((*srcData)[replicating ? 0 : n]);
191  }
192  num_elems_ += numAppendElems;
// Build the returned metadata from the encoder state just updated.
193  auto chunk_metadata = std::make_shared<ChunkMetadata>();
194  getMetadata(chunk_metadata);
195  return chunk_metadata;
196  }
197 
198  void getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) override {
199  Encoder::getMetadata(chunkMetadata); // call on parent class
200  chunkMetadata->fillChunkStats(elem_min, elem_max, has_nulls);
201  }
202 
203  // Only called from the executor for synthesized meta-information.
204  std::shared_ptr<ChunkMetadata> getMetadata(const SQLTypeInfo& ti) override {
205  auto chunk_metadata = std::make_shared<ChunkMetadata>(
206  ti, 0, 0, ChunkStats{elem_min, elem_max, has_nulls});
207  return chunk_metadata;
208  }
209 
210  void updateStats(const int64_t, const bool) override { CHECK(false); }
211 
212  void updateStats(const double, const bool) override { CHECK(false); }
213 
214  void updateStats(const int8_t* const src_data, const size_t num_elements) override {
215  CHECK(false);
216  }
217 
218  void updateStats(const std::vector<std::string>* const src_data,
219  const size_t start_idx,
220  const size_t num_elements) override {
221  UNREACHABLE();
222  }
223 
224  void updateStats(const std::vector<ArrayDatum>* const src_data,
225  const size_t start_idx,
226  const size_t num_elements) override {
227  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
228  update_elem_stats((*src_data)[n]);
229  }
230  }
231 
232  void reduceStats(const Encoder&) override { CHECK(false); }
233 
234  void writeMetadata(FILE* f) override {
235  // assumes pointer is already in right place
236  fwrite((int8_t*)&num_elems_, sizeof(size_t), 1, f);
237  fwrite((int8_t*)&elem_min, sizeof(Datum), 1, f);
238  fwrite((int8_t*)&elem_max, sizeof(Datum), 1, f);
239  fwrite((int8_t*)&has_nulls, sizeof(bool), 1, f);
240  fwrite((int8_t*)&initialized, sizeof(bool), 1, f);
241  }
242 
243  void readMetadata(FILE* f) override {
244  // assumes pointer is already in right place
245  fread((int8_t*)&num_elems_, sizeof(size_t), 1, f);
246  fread((int8_t*)&elem_min, sizeof(Datum), 1, f);
247  fread((int8_t*)&elem_max, sizeof(Datum), 1, f);
248  fread((int8_t*)&has_nulls, sizeof(bool), 1, f);
249  fread((int8_t*)&initialized, sizeof(bool), 1, f);
250  }
251 
252  void copyMetadata(const Encoder* copyFromEncoder) override {
253  num_elems_ = copyFromEncoder->getNumElems();
254  auto array_encoder = dynamic_cast<const ArrayNoneEncoder*>(copyFromEncoder);
255  elem_min = array_encoder->elem_min;
256  elem_max = array_encoder->elem_max;
257  has_nulls = array_encoder->has_nulls;
258  initialized = array_encoder->initialized;
259  }
260 
261  AbstractBuffer* getIndexBuf() const { return index_buf; }
262 
265  bool has_nulls;
268  std::unique_lock<std::mutex> lock(EncoderMutex_);
269  index_buf = buf;
270  }
271 
272  private:
273  std::mutex EncoderMutex_;
276 
277  void update_elem_stats(const ArrayDatum& array) {
278  if (array.is_null) {
279  has_nulls = true;
280  }
281  switch (buffer_->sql_type.get_subtype()) {
282  case kBOOLEAN: {
283  if (!initialized) {
284  elem_min.boolval = true;
285  elem_max.boolval = false;
286  }
287  if (array.is_null || array.length == 0) {
288  break;
289  }
290  const bool* bool_array = (bool*)array.pointer;
291  for (size_t i = 0; i < array.length / sizeof(bool); i++) {
292  if ((int8_t)bool_array[i] == NULL_BOOLEAN) {
293  has_nulls = true;
294  } else if (initialized) {
295  elem_min.boolval = std::min(elem_min.boolval, bool_array[i]);
296  elem_max.boolval = std::max(elem_max.boolval, bool_array[i]);
297  } else {
298  elem_min.boolval = bool_array[i];
299  elem_max.boolval = bool_array[i];
300  initialized = true;
301  }
302  }
303  break;
304  }
305  case kINT: {
306  if (!initialized) {
307  elem_min.intval = 1;
308  elem_max.intval = 0;
309  }
310  if (array.is_null || array.length == 0) {
311  break;
312  }
313  const int32_t* int_array = (int32_t*)array.pointer;
314  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
315  if (int_array[i] == NULL_INT) {
316  has_nulls = true;
317  } else if (initialized) {
318  elem_min.intval = std::min(elem_min.intval, int_array[i]);
319  elem_max.intval = std::max(elem_max.intval, int_array[i]);
320  } else {
321  elem_min.intval = int_array[i];
322  elem_max.intval = int_array[i];
323  initialized = true;
324  }
325  }
326  break;
327  }
328  case kSMALLINT: {
329  if (!initialized) {
330  elem_min.smallintval = 1;
331  elem_max.smallintval = 0;
332  }
333  if (array.is_null || array.length == 0) {
334  break;
335  }
336  const int16_t* int_array = (int16_t*)array.pointer;
337  for (size_t i = 0; i < array.length / sizeof(int16_t); i++) {
338  if (int_array[i] == NULL_SMALLINT) {
339  has_nulls = true;
340  } else if (initialized) {
341  elem_min.smallintval = std::min(elem_min.smallintval, int_array[i]);
342  elem_max.smallintval = std::max(elem_max.smallintval, int_array[i]);
343  } else {
344  elem_min.smallintval = int_array[i];
345  elem_max.smallintval = int_array[i];
346  initialized = true;
347  }
348  }
349  break;
350  }
351  case kTINYINT: {
352  if (!initialized) {
353  elem_min.tinyintval = 1;
354  elem_max.tinyintval = 0;
355  }
356  if (array.is_null || array.length == 0) {
357  break;
358  }
359  const int8_t* int_array = (int8_t*)array.pointer;
360  for (size_t i = 0; i < array.length / sizeof(int8_t); i++) {
361  if (int_array[i] == NULL_TINYINT) {
362  has_nulls = true;
363  } else if (initialized) {
364  elem_min.tinyintval = std::min(elem_min.tinyintval, int_array[i]);
365  elem_max.tinyintval = std::max(elem_max.tinyintval, int_array[i]);
366  } else {
367  elem_min.tinyintval = int_array[i];
368  elem_max.tinyintval = int_array[i];
369  initialized = true;
370  }
371  }
372  break;
373  }
374  case kBIGINT:
375  case kNUMERIC:
376  case kDECIMAL: {
377  if (!initialized) {
378  elem_min.bigintval = 1;
379  elem_max.bigintval = 0;
380  }
381  if (array.is_null || array.length == 0) {
382  break;
383  }
384  const int64_t* int_array = (int64_t*)array.pointer;
385  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
386  if (int_array[i] == NULL_BIGINT) {
387  has_nulls = true;
388  } else if (initialized) {
389  elem_min.bigintval = std::min(elem_min.bigintval, int_array[i]);
390  elem_max.bigintval = std::max(elem_max.bigintval, int_array[i]);
391  } else {
392  elem_min.bigintval = int_array[i];
393  elem_max.bigintval = int_array[i];
394  initialized = true;
395  }
396  }
397  break;
398  }
399  case kFLOAT: {
400  if (!initialized) {
401  elem_min.floatval = 1.0;
402  elem_max.floatval = 0.0;
403  }
404  if (array.is_null || array.length == 0) {
405  break;
406  }
407  const float* flt_array = (float*)array.pointer;
408  for (size_t i = 0; i < array.length / sizeof(float); i++) {
409  if (flt_array[i] == NULL_FLOAT) {
410  has_nulls = true;
411  } else if (initialized) {
412  elem_min.floatval = std::min(elem_min.floatval, flt_array[i]);
413  elem_max.floatval = std::max(elem_max.floatval, flt_array[i]);
414  } else {
415  elem_min.floatval = flt_array[i];
416  elem_max.floatval = flt_array[i];
417  initialized = true;
418  }
419  }
420  break;
421  }
422  case kDOUBLE: {
423  if (!initialized) {
424  elem_min.doubleval = 1.0;
425  elem_max.doubleval = 0.0;
426  }
427  if (array.is_null || array.length == 0) {
428  break;
429  }
430  const double* dbl_array = (double*)array.pointer;
431  for (size_t i = 0; i < array.length / sizeof(double); i++) {
432  if (dbl_array[i] == NULL_DOUBLE) {
433  has_nulls = true;
434  } else if (initialized) {
435  elem_min.doubleval = std::min(elem_min.doubleval, dbl_array[i]);
436  elem_max.doubleval = std::max(elem_max.doubleval, dbl_array[i]);
437  } else {
438  elem_min.doubleval = dbl_array[i];
439  elem_max.doubleval = dbl_array[i];
440  initialized = true;
441  }
442  }
443  break;
444  }
445  case kTIME:
446  case kTIMESTAMP:
447  case kDATE: {
448  if (!initialized) {
449  elem_min.bigintval = 1;
450  elem_max.bigintval = 0;
451  }
452  if (array.is_null || array.length == 0) {
453  break;
454  }
455  const auto tm_array = reinterpret_cast<int64_t*>(array.pointer);
456  for (size_t i = 0; i < array.length / sizeof(int64_t); i++) {
457  if (tm_array[i] == NULL_BIGINT) {
458  has_nulls = true;
459  } else if (initialized) {
460  elem_min.bigintval = std::min(elem_min.bigintval, tm_array[i]);
461  elem_max.bigintval = std::max(elem_max.bigintval, tm_array[i]);
462  } else {
463  elem_min.bigintval = tm_array[i];
464  elem_max.bigintval = tm_array[i];
465  initialized = true;
466  }
467  }
468  break;
469  }
470  case kCHAR:
471  case kVARCHAR:
472  case kTEXT: {
474  if (!initialized) {
475  elem_min.intval = 1;
476  elem_max.intval = 0;
477  }
478  if (array.is_null || array.length == 0) {
479  break;
480  }
481  const int32_t* int_array = (int32_t*)array.pointer;
482  for (size_t i = 0; i < array.length / sizeof(int32_t); i++) {
483  if (int_array[i] == NULL_INT) {
484  has_nulls = true;
485  } else if (initialized) {
486  elem_min.intval = std::min(elem_min.intval, int_array[i]);
487  elem_max.intval = std::max(elem_max.intval, int_array[i]);
488  } else {
489  elem_min.intval = int_array[i];
490  elem_max.intval = int_array[i];
491  initialized = true;
492  }
493  }
494  break;
495  }
496  default:
497  UNREACHABLE();
498  }
499  };
500 
501 }; // class ArrayNoneEncoder
502 
503 #endif // ARRAY_NONE_ENCODER_H
int8_t tinyintval
Definition: sqltypes.h:133
void update_elem_stats(const ArrayDatum &array)
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:258
#define CHECK_EQ(x, y)
Definition: Logger.h:205
size_t getNumElemsForBytesInsertData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
#define NULL_DOUBLE
Definition: sqltypes.h:184
size_t num_elems_
Definition: Encoder.h:213
Definition: sqltypes.h:50
std::shared_ptr< ChunkMetadata > appendData(const std::vector< ArrayDatum > *srcData, const int start_idx, const size_t numAppendElems, const bool replicating)
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
void updateStats(const double, const bool) override
#define NULL_BIGINT
Definition: sqltypes.h:182
std::mutex EncoderMutex_
bool boolval
Definition: sqltypes.h:132
virtual size_t size() const =0
void updateStats(const int8_t *const src_data, const size_t num_elements) override
#define UNREACHABLE()
Definition: Logger.h:241
int32_t intval
Definition: sqltypes.h:135
virtual bool isDirty() const
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:129
float floatval
Definition: sqltypes.h:137
CHECK(cgen_state)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:215
#define NULL_TINYINT
Definition: sqltypes.h:179
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
void updateStats(const std::vector< ArrayDatum > *const src_data, const size_t start_idx, const size_t num_elements) override
int64_t bigintval
Definition: sqltypes.h:136
#define NULL_FLOAT
Definition: sqltypes.h:183
size_t getNumElems() const
Definition: Encoder.h:209
int16_t smallintval
Definition: sqltypes.h:134
An AbstractBuffer is a unit of data management for a data manager.
ArrayNoneEncoder(AbstractBuffer *buffer)
#define NULL_INT
Definition: sqltypes.h:181
void setIndexBuffer(AbstractBuffer *buf)
AbstractBuffer * index_buf
Definition: sqltypes.h:53
Definition: sqltypes.h:54
void readMetadata(FILE *f) override
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:265
bool is_null(const T &v, const SQLTypeInfo &t)
int32_t ArrayOffsetT
Definition: sqltypes.h:856
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
std::shared_ptr< ChunkMetadata > getMetadata(const SQLTypeInfo &ti) override
Definition: sqltypes.h:42
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
void reduceStats(const Encoder &) override
#define NULL_SMALLINT
Definition: sqltypes.h:180
bool g_enable_watchdog false
Definition: Execute.cpp:74
ArrayOffsetT last_offset
void writeMetadata(FILE *f) override
Definition: sqltypes.h:46
virtual void reserve(size_t num_bytes)=0
AbstractBuffer * getIndexBuf() const
void updateStats(const int64_t, const bool) override
#define NULL_BOOLEAN
Definition: sqltypes.h:178
void copyMetadata(const Encoder *copyFromEncoder) override
void updateStats(const std::vector< std::string > *const src_data, const size_t start_idx, const size_t num_elements) override
double doubleval
Definition: sqltypes.h:138