OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringNoneEncoder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #include "StringNoneEncoder.h"
26 #include <algorithm>
27 #include <cstdlib>
28 #include <memory>
29 #include "MemoryLevel.h"
30 
32 
34  const std::vector<std::string>* srcData,
35  const int start_idx,
36  const size_t numAppendElems,
37  const size_t byteLimit,
38  const bool replicating) {
39  size_t dataSize = 0;
40  size_t n = start_idx;
41  for (; n < start_idx + numAppendElems; n++) {
42  size_t len = (*srcData)[replicating ? 0 : n].length();
43  if (dataSize + len > byteLimit) {
44  break;
45  }
46  dataSize += len;
47  }
48  return n - start_idx;
49 }
50 
52  const int8_t* index_data,
53  const std::vector<size_t>& selected_idx,
54  const size_t byte_limit) {
55  size_t num_elements = 0;
56  size_t data_size = 0;
57  for (const auto& offset_index : selected_idx) {
58  auto element_size = getStringSizeAtIndex(index_data, offset_index);
59  if (data_size + element_size > byte_limit) {
60  break;
61  }
62  data_size += element_size;
63  num_elements++;
64  }
65  return num_elements;
66 }
67 
68 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedDataAtIndices(
69  const int8_t* index_data,
70  int8_t* data,
71  const std::vector<size_t>& selected_idx) {
72  std::vector<std::string_view> data_subset;
73  data_subset.reserve(selected_idx.size());
74  for (const auto& offset_index : selected_idx) {
75  data_subset.emplace_back(getStringAtIndex(index_data, data, offset_index));
76  }
77  return appendData(&data_subset, 0, selected_idx.size(), false);
78 }
79 
80 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendEncodedData(
81  const int8_t* index_data,
82  int8_t* data,
83  const size_t start_idx,
84  const size_t num_elements) {
85  std::vector<std::string_view> data_subset;
86  data_subset.reserve(num_elements);
87  for (size_t count = 0; count < num_elements; ++count) {
88  auto current_index = start_idx + count;
89  data_subset.emplace_back(getStringAtIndex(index_data, data, current_index));
90  }
91  return appendData(&data_subset, 0, num_elements, false);
92 }
93 
94 template <typename StringType>
95 std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData(
96  const std::vector<StringType>* srcData,
97  const int start_idx,
98  const size_t numAppendElems,
99  const bool replicating) {
100  CHECK(index_buf); // index_buf must be set before this.
101  size_t index_size = numAppendElems * sizeof(StringOffsetT);
102  if (num_elems_ == 0) {
103  index_size += sizeof(StringOffsetT); // plus one for the initial offset of 0.
104  }
105  index_buf->reserve(index_size);
106  StringOffsetT offset = 0;
107  if (num_elems_ == 0) {
108  index_buf->append((int8_t*)&offset,
109  sizeof(StringOffsetT)); // write the inital 0 offset
110  last_offset = 0;
111  } else {
112  // always need to read a valid last offset from buffer/disk
113  // b/c now due to vacuum "last offset" may go backward and if
114  // index chunk was not reloaded last_offset would go way off!
115  index_buf->read((int8_t*)&last_offset,
116  sizeof(StringOffsetT),
117  index_buf->size() - sizeof(StringOffsetT),
119  CHECK_GE(last_offset, 0);
120  }
121  size_t data_size = 0;
122  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
123  size_t len = (*srcData)[replicating ? 0 : n].length();
124  data_size += len;
125  }
126  buffer_->reserve(data_size);
127 
128  size_t inbuf_size =
129  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
130  auto inbuf = std::make_unique<int8_t[]>(inbuf_size);
131  for (size_t num_appended = 0; num_appended < numAppendElems;) {
132  StringOffsetT* p = reinterpret_cast<StringOffsetT*>(inbuf.get());
133  size_t i;
134  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(StringOffsetT);
135  i++, num_appended++) {
136  p[i] =
137  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length();
138  last_offset = p[i];
139  }
140  index_buf->append(inbuf.get(), i * sizeof(StringOffsetT));
141  }
142 
143  for (size_t num_appended = 0; num_appended < numAppendElems;) {
144  size_t size = 0;
145  for (int i = start_idx + num_appended;
146  num_appended < numAppendElems && size < inbuf_size;
147  i++, num_appended++) {
148  size_t len = (*srcData)[replicating ? 0 : i].length();
149  if (len > inbuf_size) {
150  // for large strings, append on its own
151  if (size > 0) {
152  buffer_->append(inbuf.get(), size);
153  }
154  size = 0;
155  buffer_->append((int8_t*)(*srcData)[replicating ? 0 : i].data(), len);
156  num_appended++;
157  break;
158  } else if (size + len > inbuf_size) {
159  break;
160  }
161  char* dest = reinterpret_cast<char*>(inbuf.get()) + size;
162  if (len > 0) {
163  (*srcData)[replicating ? 0 : i].copy(dest, len);
164  size += len;
165  }
166  update_elem_stats((*srcData)[replicating ? 0 : i]);
167  }
168  if (size > 0) {
169  buffer_->append(inbuf.get(), size);
170  }
171  }
172  // make sure buffer_ is flushed even if no new data is appended to it
173  // (e.g. empty strings) because the metadata needs to be flushed.
174  if (!buffer_->isDirty()) {
175  buffer_->setDirty();
176  }
177 
178  num_elems_ += numAppendElems;
179  auto chunk_metadata = std::make_shared<ChunkMetadata>();
180  getMetadata(chunk_metadata);
181  return chunk_metadata;
182 }
183 
184 void StringNoneEncoder::updateStats(const std::vector<std::string>* const src_data,
185  const size_t start_idx,
186  const size_t num_elements) {
187  for (size_t n = start_idx; n < start_idx + num_elements; n++) {
188  update_elem_stats((*src_data)[n]);
189  if (has_nulls) {
190  break;
191  }
192  }
193 }
194 
195 template <typename StringType>
196 void StringNoneEncoder::update_elem_stats(const StringType& elem) {
197  if (!has_nulls && elem.empty()) {
198  has_nulls = true;
199  }
200 }
201 
202 std::pair<StringOffsetT, StringOffsetT> StringNoneEncoder::getStringOffsets(
203  const int8_t* index_data,
204  size_t index) {
205  auto string_offsets = reinterpret_cast<const StringOffsetT*>(index_data);
206  auto current_index = index + 1;
207  auto offset = string_offsets[current_index];
208  CHECK(offset >= 0);
209  int64_t last_offset = string_offsets[current_index - 1];
210  CHECK(last_offset >= 0 && last_offset <= offset);
211  return {offset, last_offset};
212 }
213 
214 size_t StringNoneEncoder::getStringSizeAtIndex(const int8_t* index_data, size_t index) {
215  auto [offset, last_offset] = getStringOffsets(index_data, index);
216  size_t string_byte_size = offset - last_offset;
217  return string_byte_size;
218 }
219 
220 std::string_view StringNoneEncoder::getStringAtIndex(const int8_t* index_data,
221  const int8_t* data,
222  size_t index) {
223  auto [offset, last_offset] = getStringOffsets(index_data, index);
224  size_t string_byte_size = offset - last_offset;
225  auto current_data = reinterpret_cast<const char*>(data + last_offset);
226  return std::string_view{current_data, string_byte_size};
227 }
228 
229 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string>(
230  const std::vector<std::string>* srcData,
231  const int start_idx,
232  const size_t numAppendElems,
233  const bool replicating);
234 
235 template std::shared_ptr<ChunkMetadata> StringNoneEncoder::appendData<std::string_view>(
236  const std::vector<std::string_view>* srcData,
237  const int start_idx,
238  const size_t numAppendElems,
239  const bool replicating);
240 
241 template void StringNoneEncoder::update_elem_stats<std::string>(const std::string& elem);
242 template void StringNoneEncoder::update_elem_stats<std::string_view>(
243  const std::string_view& elem);
std::string_view getStringAtIndex(const int8_t *index_data, const int8_t *data, size_t index)
size_t num_elems_
Definition: Encoder.h:288
void updateStats(const int64_t, const bool) override
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define CHECK_GE(x, y)
Definition: Logger.h:236
std::shared_ptr< ChunkMetadata > appendEncodedDataAtIndices(const int8_t *index_data, int8_t *data, const std::vector< size_t > &selected_idx) override
size_t getNumElemsForBytesInsertData(const std::vector< std::string > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
int32_t StringOffsetT
Definition: sqltypes.h:1113
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
AbstractBuffer * index_buf
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:290
size_t getNumElemsForBytesEncodedDataAtIndices(const int8_t *index_data, const std::vector< size_t > &selected_idx, const size_t byte_limit) override
An AbstractBuffer is a unit of data management for a data manager.
size_t getStringSizeAtIndex(const int8_t *index_data, size_t index)
void update_elem_stats(const StringType &elem)
StringOffsetT last_offset
void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata) override
std::shared_ptr< ChunkMetadata > appendEncodedData(const int8_t *index_data, int8_t *data, const size_t start_idx, const size_t num_elements) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
#define CHECK(condition)
Definition: Logger.h:223
For unencoded strings.
constexpr double n
Definition: Utm.h:38
virtual void reserve(size_t num_bytes)=0
std::pair< StringOffsetT, StringOffsetT > getStringOffsets(const int8_t *index_data, size_t index)