OmniSciDB  c07336695a
SortedOrderFragmenter.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <cstring>
17 #include <numeric>
18 
19 #include "../Catalog/Catalog.h"
20 #include "SortedOrderFragmenter.h"
21 
22 namespace Fragmenter_Namespace {
23 
24 template <typename T>
25 void shuffleByIndexesImpl(const std::vector<size_t>& indexes, T* buffer) {
26  std::vector<T> new_buffer;
27  new_buffer.reserve(indexes.size());
28  for (const auto i : indexes) {
29  new_buffer.push_back(buffer[i]);
30  }
31  std::memcpy(buffer, new_buffer.data(), indexes.size() * sizeof(T));
32 }
33 
34 template <typename T>
35 void shuffleByIndexesImpl(const std::vector<size_t>& indexes, std::vector<T>& buffer) {
36  std::vector<T> new_buffer;
37  new_buffer.reserve(indexes.size());
38  for (const auto i : indexes) {
39  new_buffer.push_back(buffer[i]);
40  }
41  buffer.swap(new_buffer);
42 }
43 
45  const std::vector<size_t>& indexes,
46  DataBlockPtr& data) {
47  const auto& ti = cd->columnType;
48  switch (ti.get_type()) {
49  case kBOOLEAN:
50  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
51  break;
52  case kTINYINT:
53  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
54  break;
55  case kSMALLINT:
56  shuffleByIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
57  break;
58  case kINT:
59  shuffleByIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
60  break;
61  case kBIGINT:
62  case kNUMERIC:
63  case kDECIMAL:
64  shuffleByIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
65  break;
66  case kFLOAT:
67  shuffleByIndexesImpl(indexes, reinterpret_cast<float*>(data.numbersPtr));
68  break;
69  case kDOUBLE:
70  shuffleByIndexesImpl(indexes, reinterpret_cast<double*>(data.numbersPtr));
71  break;
72  case kTEXT:
73  case kVARCHAR:
74  case kCHAR:
75  if (ti.is_varlen()) {
76  shuffleByIndexesImpl(indexes, *data.stringsPtr);
77  } else {
78  switch (ti.get_size()) {
79  case 1:
80  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
81  break;
82  case 2:
83  shuffleByIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
84  break;
85  case 4:
86  shuffleByIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
87  break;
88  default:
89  CHECK(false);
90  }
91  }
92  break;
93  case kDATE:
94  case kTIME:
95  case kTIMESTAMP:
96  shuffleByIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
97  break;
98  case kARRAY:
99  shuffleByIndexesImpl(indexes, *data.arraysPtr);
100  break;
101  case kPOINT:
102  case kLINESTRING:
103  case kPOLYGON:
104  case kMULTIPOLYGON:
105  shuffleByIndexesImpl(indexes, *data.stringsPtr);
106  break;
107  default:
108  CHECK(false);
109  }
110 }
111 
112 template <typename T>
113 void sortIndexesImpl(std::vector<size_t>& indexes, const T* buffer) {
114  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
115  return buffer[a] < buffer[b];
116  });
117 }
118 
119 void sortIndexesImpl(std::vector<size_t>& indexes,
120  const std::vector<std::string>& buffer) {
121  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
122  return buffer[a].size() < buffer[b].size() ||
123  (buffer[a].size() == buffer[b].size() && buffer[a] < buffer[b]);
124  });
125 }
126 
127 void sortIndexesImpl(std::vector<size_t>& indexes,
128  const std::vector<ArrayDatum>& buffer) {
129  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
130  return buffer[a].is_null || buffer[a].length < buffer[b].length ||
131  (!buffer[b].is_null && buffer[a].length == buffer[b].length &&
132  memcmp(buffer[a].pointer, buffer[b].pointer, buffer[a].length) < 0);
133  });
134 }
135 
137  std::vector<size_t>& indexes,
138  const DataBlockPtr& data) {
139  const auto& ti = cd->columnType;
140  switch (ti.get_type()) {
141  case kBOOLEAN:
142  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
143  break;
144  case kTINYINT:
145  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
146  break;
147  case kSMALLINT:
148  sortIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
149  break;
150  case kINT:
151  sortIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
152  break;
153  case kBIGINT:
154  case kNUMERIC:
155  case kDECIMAL:
156  sortIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
157  break;
158  case kFLOAT:
159  sortIndexesImpl(indexes, reinterpret_cast<float*>(data.numbersPtr));
160  break;
161  case kDOUBLE:
162  sortIndexesImpl(indexes, reinterpret_cast<double*>(data.numbersPtr));
163  break;
164  case kTEXT:
165  case kVARCHAR:
166  case kCHAR:
167  if (ti.is_varlen()) {
168  sortIndexesImpl(indexes, *data.stringsPtr);
169  } else {
170  switch (ti.get_size()) {
171  case 1:
172  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
173  break;
174  case 2:
175  sortIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
176  break;
177  case 4:
178  sortIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
179  break;
180  default:
181  CHECK(false);
182  }
183  }
184  break;
185  case kDATE:
186  case kTIME:
187  case kTIMESTAMP:
188  sortIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
189  break;
190  case kARRAY:
191  sortIndexesImpl(indexes, *data.arraysPtr);
192  break;
193  default:
194  CHECK(false) << "invalid type '" << ti.get_type() << "' to sort";
195  }
196 }
197 
199  // coming here table must have defined a sort_column for mini sort
200  const auto table_desc = catalog_->getMetadataForTable(physicalTableId_);
201  CHECK(table_desc);
202  CHECK_GT(table_desc->sortedColumnId, 0);
203  const auto logical_cd =
204  catalog_->getMetadataForColumn(table_desc->tableId, table_desc->sortedColumnId);
205  CHECK(logical_cd);
206  const auto physical_cd = catalog_->getMetadataForColumn(
207  table_desc->tableId,
208  table_desc->sortedColumnId + (logical_cd->columnType.is_geometry() ? 1 : 0));
209  const auto it = std::find(insertDataStruct.columnIds.begin(),
210  insertDataStruct.columnIds.end(),
211  physical_cd->columnId);
212  CHECK(it != insertDataStruct.columnIds.end());
213  // sort row indexes of the sort column
214  std::vector<size_t> indexes(insertDataStruct.numRows);
215  std::iota(indexes.begin(), indexes.end(), 0);
216  const auto dist = std::distance(insertDataStruct.columnIds.begin(), it);
217  CHECK_LT(static_cast<size_t>(dist), insertDataStruct.data.size());
218  sortIndexes(physical_cd, indexes, insertDataStruct.data[dist]);
219  // shuffle rows of all columns
220  for (size_t i = 0; i < insertDataStruct.columnIds.size(); ++i) {
221  const auto cd = catalog_->getMetadataForColumn(table_desc->tableId,
222  insertDataStruct.columnIds[i]);
223  shuffleByIndexes(cd, indexes, insertDataStruct.data[i]);
224  }
225 }
226 
227 } // namespace Fragmenter_Namespace
Definition: sqltypes.h:51
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:138
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:139
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
void sortIndexesImpl(std::vector< size_t > &indexes, const T *buffer)
void shuffleByIndexes(const ColumnDescriptor *cd, const std::vector< size_t > &indexes, DataBlockPtr &data)
#define CHECK_GT(x, y)
Definition: Logger.h:199
virtual void sortData(InsertData &insertDataStruct)
size_t numRows
a vector of column ids for the row(s) being inserted
Definition: Fragmenter.h:63
specifies the content in-memory of a row in the column metadata table
void sortIndexes(const ColumnDescriptor *cd, std::vector< size_t > &indexes, const DataBlockPtr &data)
#define CHECK_LT(x, y)
Definition: Logger.h:197
Definition: sqltypes.h:54
Definition: sqltypes.h:55
std::vector< DataBlockPtr > data
the number of rows being inserted
Definition: Fragmenter.h:64
Definition: sqltypes.h:43
#define CHECK(condition)
Definition: Logger.h:187
The data to be inserted using the fragment manager.
Definition: Fragmenter.h:59
Definition: sqltypes.h:47
SQLTypeInfo columnType
int8_t * numbersPtr
Definition: sqltypes.h:137
std::vector< int > columnIds
identifies the table into which the data is being inserted
Definition: Fragmenter.h:62
void shuffleByIndexesImpl(const std::vector< size_t > &indexes, T *buffer)