OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringNoneEncoder.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #include "StringNoneEncoder.h"
26 #include <algorithm>
27 #include <cstdlib>
28 #include <memory>
29 #include "MemoryLevel.h"
30 
32 
// Counts how many of the numAppendElems strings, taken in order starting at
// start_idx, fit within byteLimit bytes of string payload.
// When replicating is true every element is measured as (*srcData)[0].
// Returns the number of leading elements whose summed lengths stay <= byteLimit.
// NOTE(review): the line that names this function is absent from this listing.
34  const std::vector<std::string>* srcData,
35  const int start_idx,
36  const size_t numAppendElems,
37  const size_t byteLimit,
38  const bool replicating) {
39  size_t dataSize = 0;
40  size_t n = start_idx;
41  for (; n < start_idx + numAppendElems; n++) {
42  size_t len = (*srcData)[replicating ? 0 : n].length();
 // Stop at the first string that would push the running total past the limit;
 // later (possibly shorter) strings are not considered.
43  if (dataSize + len > byteLimit) {
44  break;
45  }
46  dataSize += len;
47  }
 // n is the index of the first element that did not fit (or one past the end).
48  return n - start_idx;
49 }
50 
// Appends numAppendElems strings from srcData, starting at start_idx, to this
// encoder. One running end offset per string is appended to index_buf and the
// string bytes are appended to buffer_. Afterwards num_elems_ is increased by
// numAppendElems and the metadata filled in by getMetadata() is returned.
// When replicating is true, (*srcData)[0] is used for every appended element.
51 ChunkMetadata StringNoneEncoder::appendData(const std::vector<std::string>* srcData,
52  const int start_idx,
53  const size_t numAppendElems,
54  const bool replicating) {
55  CHECK(index_buf); // index_buf must be set before this.
 // Index bytes to be written: one StringOffsetT per appended element.
56  size_t index_size = numAppendElems * sizeof(StringOffsetT);
57  if (num_elems_ == 0) {
58  index_size += sizeof(StringOffsetT); // plus one for the initial offset of 0.
59  }
60  index_buf->reserve(index_size);
61  StringOffsetT offset = 0;
62  if (num_elems_ == 0) {
63  index_buf->append((int8_t*)&offset,
64  sizeof(StringOffsetT)); // write the initial 0 offset
65  last_offset = 0;
66  } else {
67  // always need to read a valid last offset from buffer/disk
68  // b/c now due to vacuum "last offset" may go backward and if
69  // index chunk was not reloaded last_offset would go way off!
 // The read targets the final StringOffsetT-sized slot of index_buf.
70  index_buf->read((int8_t*)&last_offset,
71  sizeof(StringOffsetT),
72  index_buf->size() - sizeof(StringOffsetT),
 // NOTE(review): listing lines 73-74 are missing here, so the remainder of this
 // read() call and anything else before the closing brace is not visible.
75  }
 // Total payload bytes of the strings being appended, used to reserve buffer_.
76  size_t data_size = 0;
77  for (size_t n = start_idx; n < start_idx + numAppendElems; n++) {
78  size_t len = (*srcData)[replicating ? 0 : n].length();
79  data_size += len;
80  }
81  buffer_->reserve(data_size);
82 
 // One staging buffer is shared by both passes below. It is sized for the larger
 // of the index and data payloads, capped at MAX_INPUT_BUF_SIZE.
83  size_t inbuf_size =
84  std::min(std::max(index_size, data_size), (size_t)MAX_INPUT_BUF_SIZE);
85  auto inbuf = new int8_t[inbuf_size];
 // gc_inbuf owns the allocation and releases it when it goes out of scope.
86  std::unique_ptr<int8_t[]> gc_inbuf(inbuf);
 // Pass 1: write the offsets. Each outer iteration fills the staging buffer with
 // at most inbuf_size / sizeof(StringOffsetT) running offsets and appends that
 // batch to index_buf.
87  for (size_t num_appended = 0; num_appended < numAppendElems;) {
88  StringOffsetT* p = (StringOffsetT*)inbuf;
89  size_t i;
90  for (i = 0; num_appended < numAppendElems && i < inbuf_size / sizeof(StringOffsetT);
91  i++, num_appended++) {
 // Each entry is the previous offset plus this string's length.
92  p[i] =
93  last_offset + (*srcData)[replicating ? 0 : num_appended + start_idx].length();
94  last_offset = p[i];
95  }
96  index_buf->append(inbuf, i * sizeof(StringOffsetT));
97  }
98 
 // Pass 2: write the string bytes. Strings are packed into the staging buffer
 // and appended to buffer_ whenever the next string would not fit.
99  for (size_t num_appended = 0; num_appended < numAppendElems;) {
100  size_t size = 0;
101  for (int i = start_idx + num_appended;
102  num_appended < numAppendElems && size < inbuf_size;
103  i++, num_appended++) {
104  size_t len = (*srcData)[replicating ? 0 : i].length();
105  if (len > inbuf_size) {
106  // for large strings, append on its own
 // Flush what is already staged first so the bytes stay in element order.
107  if (size > 0) {
108  buffer_->append(inbuf, size);
109  }
110  size = 0;
111  buffer_->append((int8_t*)(*srcData)[replicating ? 0 : i].data(), len);
 // break skips the loop's own increment, so count this element by hand.
112  num_appended++;
113  break;
114  } else if (size + len > inbuf_size) {
 // Does not fit in the remaining space: leave it for the next outer iteration.
115  break;
116  }
117  char* dest = (char*)inbuf + size;
118  if (len > 0) {
119  (*srcData)[replicating ? 0 : i].copy(dest, len);
120  size += len;
121  } else {
 // A zero-length string contributes no bytes and sets has_nulls.
122  has_nulls = true;
123  }
124  }
 // Flush whatever was staged during this outer iteration.
125  if (size > 0) {
126  buffer_->append(inbuf, size);
127  }
128  }
129  // make sure buffer_ is flushed even if no new data is appended to it
130  // (e.g. empty strings) because the metadata needs to be flushed.
131  if (!buffer_->isDirty()) {
132  buffer_->setDirty();
133  }
134 
135  num_elems_ += numAppendElems;
136  ChunkMetadata chunkMetadata;
137  getMetadata(chunkMetadata);
138  return chunkMetadata;
139 }
size_t num_elems_
Definition: Encoder.h:179
#define MAX_INPUT_BUF_SIZE
Definition: Encoder.h:36
virtual size_t size() const =0
#define CHECK_GE(x, y)
Definition: Logger.h:203
virtual bool isDirty() const
size_t getNumElemsForBytesInsertData(const std::vector< std::string > *srcData, const int start_idx, const size_t numAppendElems, const size_t byteLimit, const bool replicating=false)
int32_t StringOffsetT
Definition: sqltypes.h:912
virtual void read(int8_t *const dst, const size_t num_bytes, const size_t offset=0, const MemoryLevel dst_buffer_type=CPU_LEVEL, const int dst_device_id=-1)=0
AbstractBuffer * index_buf
CHECK(cgen_state)
Data_Namespace::AbstractBuffer * buffer_
Definition: Encoder.h:181
An AbstractBuffer is a unit of data management for a data manager.
StringOffsetT last_offset
void getMetadata(ChunkMetadata &chunkMetadata) override
virtual void append(int8_t *src, const size_t num_bytes, const MemoryLevel src_buffer_type=CPU_LEVEL, const int device_id=-1)=0
For unencoded strings.
virtual void reserve(size_t num_bytes)=0
ChunkMetadata appendData(int8_t *&srcData, const size_t numAppendElems, const SQLTypeInfo &, const bool replicating=false) override