OmniSciDB  06b3bd477c
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringNoneEncoder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #include "StringNoneEncoder.h"
26 #include <algorithm>
27 #include <cstdlib>
28 #include <memory>
29 #include "MemoryLevel.h"
30 
32 
34  const std::vector<std::string>* srcData,
35  const int start_idx,
36  const size_t numAppendElems,
37  const size_t byteLimit,
38  const bool replicating) {
39  size_t dataSize = 0;
40  size_t n = start_idx;
41  for (; n < start_idx + numAppendElems; n++) {
42  size_t len = (*srcData)[replicating ? 0 : n].length();
43  if (dataSize + len > byteLimit) {
44  break;
45  }
46  dataSize += len;
47  }
48  return n - start_idx;
49 }
50 
51 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData(
52  const std::vector<std::string>* srcData,
53  const int start_idx,
54  const size_t numAppendElems,
55  const bool replicating) {
56  CHECK(index_buf); // index_buf must be set before this.
57  size_t index_size = numAppendElems * sizeof(StringOffsetT);
58  if (num_elems_ == 0) {
59  index_size += sizeof(StringOffsetT); // plus one for the initial offset of 0.
60  }
61  index_buf->reserve(index_size);
62  StringOffsetT offset = 0;
63  if (num_elems_ == 0) {
64  index_buf->append((int8_t*)&offset,
65  sizeof(StringOffsetT)); // write the inital 0 offset
66  last_offset = 0;
67  } else {
68  // always need to read a valid last offset from buffer/disk
69  // b/c now due to vacuum "last offset" may go backward and if
70  // index chunk was not reloaded last_offset would go way off!
71  index_buf->read((int8_t*)&last_offset,
72  sizeof(StringOffsetT),
73  index_buf->size() - sizeof(StringOffsetT),
76  }
77  size_t data_size = 0;
78  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
79  size_t len = (*srcData)[replicating ? 0 : n].length();
80  data_size += len;
81  }
82  buffer_->reserve(data_size);
83 
84  size_t inbuf_size =
85  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
86  auto inbuf = std::make_unique<int8_t[]>(inbuf_size);
87  for (size_t num_appended = 0; num_appended < numAppendElems;) {
88  StringOffsetT* p = reinterpret_cast<StringOffsetT*>(inbuf.get());
89  size_t i;
90  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(StringOffsetT);
91  i++, num_appended++) {
92  p[i] =
93  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length();
94  last_offset = p[i];
95  }
96  index_buf->append(inbuf.get(), i * sizeof(StringOffsetT));
97  }
98 
99  for (size_t num_appended = 0; num_appended < numAppendElems;) {
100  size_t size = 0;
101  for (int i = start_idx + num_appended;
102  num_appended < numAppendElems && size < inbuf_size;
103  i++, num_appended++) {
104  size_t len = (*srcData)[replicating ? 0 : i].length();
105  if (len > inbuf_size) {
106  // for large strings, append on its own
107  if (size > 0) {
108  buffer_->append(inbuf.get(), size);
109  }
110  size = 0;
111  buffer_->append((int8_t*)(*srcData)[replicating ? 0 : i].data(), len);
112  num_appended++;
113  break;
114  } else if (size + len > inbuf_size) {
115  break;
116  }
117  char* dest = reinterpret_cast<char*>(inbuf.get()) + size;
118  if (len > 0) {
119  (*srcData)[replicating ? 0 : i].copy(dest, len);
120  size += len;
121  }
122  update_elem_stats((*srcData)[replicating ? 0 : i]);
123  }
124  if (size > 0) {
125  buffer_->append(inbuf.get(), size);
126  }
127  }
128  // make sure buffer_ is flushed even if no new data is appended to it
129  // (e.g. empty strings) because the metadata needs to be flushed.
130  if (!buffer_->isDirty()) {
131  buffer_->setDirty();
132  }
133 
134  num_elems_ += numAppendElems;
135  auto chunk_metadata = std::make_shared<ChunkMetadata>();
136  getMetadata(chunk_metadata);
137  return chunk_metadata;
138 }
139 
140 void StringNoneEncoder::updateStats(const std::vector<std::string>* const src_data,
141  const size_t start_idx,
142  const size_t num_elements) {
143  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
144  update_elem_stats((*src_data)[n]);
145  if (has_nulls) {
146  break;
147  }
148  }
149 }
150 
151 void StringNoneEncoder::update_elem_stats(const std::string& elem) {
152  if (!has_nulls && elem.empty()) {
153  has_nulls = true;
154  }
155 }
size_t num_elems_
Definition: Encoder.h:213
void updateStats(const int64_t, const bool) override
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
virtual size_t size() const =0
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define CHECK_GE(x, y)
Definition: Logger.h:210
virtual bool isDirty() const
size_t getNumElemsForBytesInsertData(const std::vector< std::string > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
int32_t StringOffsetT
Definition: sqltypes.h:866
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
AbstractBuffer * index_buf
CHECK(cgen_state)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:215
An AbstractBuffer is a unit of data management for a data manager.
void update_elem_stats(const std::string &elem)
StringOffsetT last_offset
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
For unencoded strings.
virtual void reserve(size_t num_bytes)=0