OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringNoneEncoder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
23 #include "StringNoneEncoder.h"
24 #include <algorithm>
25 #include <cstdlib>
26 #include <memory>
27 #include "MemoryLevel.h"
28 
30 
32  const std::vector<std::string>* srcData,
33  const int start_idx,
34  const size_t numAppendElems,
35  const size_t byteLimit,
36  const bool replicating) {
37  size_t dataSize = 0;
38  size_t n = start_idx;
39  for (; n < start_idx + numAppendElems; n++) {
40  size_t len = (*srcData)[replicating ? 0 : n].length();
41  if (dataSize + len > byteLimit) {
42  break;
43  }
44  dataSize += len;
45  }
46  return n - start_idx;
47 }
48 
50  const int8_t* index_data,
51  const std::vector<size_t>& selected_idx,
52  const size_t byte_limit) {
53  size_t num_elements = 0;
54  size_t data_size = 0;
55  for (const auto& offset_index : selected_idx) {
56  auto element_size = getStringSizeAtIndex(index_data, offset_index);
57  if (data_size + element_size > byte_limit) {
58  break;
59  }
60  data_size += element_size;
61  num_elements++;
62  }
63  return num_elements;
64 }
65 
66 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedDataAtIndices(
67  const int8_t* index_data,
68  int8_t* data,
69  const std::vector<size_t>& selected_idx) {
70  std::vector<std::string_view> data_subset;
71  data_subset.reserve(selected_idx.size());
72  for (const auto& offset_index : selected_idx) {
73  data_subset.emplace_back(getStringAtIndex(index_data, data, offset_index));
74  }
75  return appendData(&data_subset, 0, selected_idx.size(), false);
76 }
77 
78 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedData(
79  const int8_t* index_data,
80  int8_t* data,
81  const size_t start_idx,
82  const size_t num_elements) {
83  std::vector<std::string_view> data_subset;
84  data_subset.reserve(num_elements);
85  for (size_t count = 0; count < num_elements; ++count) {
86  auto current_index = start_idx + count;
87  data_subset.emplace_back(getStringAtIndex(index_data, data, current_index));
88  }
89  return appendData(&data_subset, 0, num_elements, false);
90 }
91 
92 template <typename StringType>
93 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData(
94  const std::vector<StringType>* srcData,
95  const int start_idx,
96  const size_t numAppendElems,
97  const bool replicating) {
98  CHECK(index_buf); // index_buf must be set before this.
99  size_t index_size = numAppendElems * sizeof(StringOffsetT);
100  if (num_elems_ == 0) {
101  index_size += sizeof(StringOffsetT); // plus one for the initial offset of 0.
102  }
103  index_buf->reserve(index_size);
104  StringOffsetT offset = 0;
105  if (num_elems_ == 0) {
106  index_buf->append((int8_t*)&offset,
107  sizeof(StringOffsetT)); // write the inital 0 offset
108  last_offset = 0;
109  } else {
110  // always need to read a valid last offset from buffer/disk
111  // b/c now due to vacuum "last offset" may go backward and if
112  // index chunk was not reloaded last_offset would go way off!
113  index_buf->read((int8_t*)&last_offset,
114  sizeof(StringOffsetT),
115  index_buf->size() - sizeof(StringOffsetT),
117  CHECK_GE(last_offset, 0);
118  }
119  size_t data_size = 0;
120  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
121  size_t len = (*srcData)[replicating ? 0 : n].length();
122  data_size += len;
123  }
124  buffer_->reserve(data_size);
125 
126  size_t inbuf_size =
127  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
128  auto inbuf = std::make_unique<int8_t[]>(inbuf_size);
129  for (size_t num_appended = 0; num_appended < numAppendElems;) {
130  StringOffsetT* p = reinterpret_cast<StringOffsetT*>(inbuf.get());
131  size_t i;
132  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(StringOffsetT);
133  i++, num_appended++) {
134  p[i] =
135  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length();
136  last_offset = p[i];
137  }
138  index_buf->append(inbuf.get(), i * sizeof(StringOffsetT));
139  }
140 
141  for (size_t num_appended = 0; num_appended < numAppendElems;) {
142  size_t size = 0;
143  for (int i = start_idx + num_appended;
144  num_appended < numAppendElems && size < inbuf_size;
145  i++, num_appended++) {
146  size_t len = (*srcData)[replicating ? 0 : i].length();
147  if (len > inbuf_size) {
148  // for large strings, append on its own
149  if (size > 0) {
150  buffer_->append(inbuf.get(), size);
151  }
152  size = 0;
153  buffer_->append((int8_t*)(*srcData)[replicating ? 0 : i].data(), len);
154  num_appended++;
155  break;
156  } else if (size + len > inbuf_size) {
157  break;
158  }
159  char* dest = reinterpret_cast<char*>(inbuf.get()) + size;
160  if (len > 0) {
161  (*srcData)[replicating ? 0 : i].copy(dest, len);
162  size += len;
163  }
164  update_elem_stats((*srcData)[replicating ? 0 : i]);
165  }
166  if (size > 0) {
167  buffer_->append(inbuf.get(), size);
168  }
169  }
170  // make sure buffer_ is flushed even if no new data is appended to it
171  // (e.g. empty strings) because the metadata needs to be flushed.
172  if (!buffer_->isDirty()) {
173  buffer_->setDirty();
174  }
175 
176  num_elems_ += numAppendElems;
177  auto chunk_metadata = std::make_shared<ChunkMetadata>();
178  getMetadata(chunk_metadata);
179  return chunk_metadata;
180 }
181 
182 void StringNoneEncoder::updateStats(const std::vector<std::string>* const src_data,
183  const size_t start_idx,
184  const size_t num_elements) {
185  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
186  update_elem_stats((*src_data)[n]);
187  if (has_nulls) {
188  break;
189  }
190  }
191 }
192 
193 template <typename StringType>
194 void StringNoneEncoder::update_elem_stats(const StringType& elem) {
195  if (!has_nulls && elem.empty()) {
196  has_nulls = true;
197  }
198 }
199 
200 std::pair<StringOffsetT, StringOffsetT> StringNoneEncoder::getStringOffsets(
201  const int8_t* index_data,
202  size_t index) {
203  auto string_offsets = reinterpret_cast<const StringOffsetT*>(index_data);
204  auto current_index = index + 1;
205  auto offset = string_offsets[current_index];
206  CHECK(offset >= 0);
207  int64_t last_offset = string_offsets[current_index - 1];
208  CHECK(last_offset >= 0 && last_offset <= offset);
209  return {offset, last_offset};
210 }
211 
212 size_t StringNoneEncoder::getStringSizeAtIndex(const int8_t* index_data, size_t index) {
213  auto [offset, last_offset] = getStringOffsets(index_data, index);
214  size_t string_byte_size = offset - last_offset;
215  return string_byte_size;
216 }
217 
218 std::string_view StringNoneEncoder::getStringAtIndex(const int8_t* index_data,
219  const int8_t* data,
220  size_t index) {
221  auto [offset, last_offset] = getStringOffsets(index_data, index);
222  size_t string_byte_size = offset - last_offset;
223  auto current_data = reinterpret_cast<const char*>(data + last_offset);
224  return std::string_view{current_data, string_byte_size};
225 }
226 
227 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string>(
228  const std::vector<std::string>* srcData,
229  const int start_idx,
230  const size_t numAppendElems,
231  const bool replicating);
232 
233 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string_view>(
234  const std::vector<std::string_view>* srcData,
235  const int start_idx,
236  const size_t numAppendElems,
237  const bool replicating);
238 
239 template void StringNoneEncoder::update_elem_stats<std::string>(const std::string& elem);
240 template void StringNoneEncoder::update_elem_stats<std::string_view>(
241  const std::string_view& elem);
242 
243 void StringNoneEncoder::getMetadata(const std::shared_ptr<ChunkMetadata>& chunkMetadata) {
244  Encoder::getMetadata(chunkMetadata); // call on parent class
245  chunkMetadata->chunkStats.min.stringval = nullptr;
246  chunkMetadata->chunkStats.max.stringval = nullptr;
247  chunkMetadata->chunkStats.has_nulls = has_nulls;
248 }
249 
250 // Only called from the executor for synthesized meta-information.
251 std::shared_ptr<ChunkMetadata> StringNoneEncoder::getMetadata(const SQLTypeInfo& ti) {
252  auto chunk_stats = ChunkStats{};
253  chunk_stats.min.stringval = nullptr;
254  chunk_stats.max.stringval = nullptr;
255  chunk_stats.has_nulls = has_nulls;
256  return std::make_shared<ChunkMetadata>(ti, 0, 0, chunk_stats);
257 }
std::string_view getStringAtIndex(const int8_t *index_data, const int8_t *data, size_t index)
size_t num_elems_
Definition: Encoder.h:288
void updateStats(const int64_t, const bool) override
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define CHECK_GE(x, y)
Definition: Logger.h:235
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *index_data, int8_t *data, const std::vector< size_t > &selected_idx) override
size_t getNumElemsForBytesInsertData(const std::vector< std::string > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
int32_t StringOffsetT
Definition: sqltypes.h:1113
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
AbstractBuffer * index_buf
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:290
size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t *index_data, const std::vector< size_t > &selected_idx, const size_t byte_limit) override
An AbstractBuffer is a unit of data management for a data manager.
size_t getStringSizeAtIndex(const int8_t *index_data, size_t index)
std::string * stringval
Definition: sqltypes.h:220
void update_elem_stats(const StringType &elem)
StringOffsetT last_offset
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *index_data, int8_t *data, const size_t start_idx, const size_t num_elements) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:222
For unencoded strings.
constexpr double n
Definition: Utm.h:38
virtual void reserve(size_t num_bytes)=0
std::pair< StringOffsetT, StringOffsetT > getStringOffsets(const int8_t *index_data, size_t index)