OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SortedOrderFragmenter.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <cstring>
18 #include <numeric>
19 
20 #include "../Catalog/Catalog.h"
21 #include "SortedOrderFragmenter.h"
22 
23 namespace Fragmenter_Namespace {
24 
25 template <typename T>
26 void shuffleByIndexesImpl(const std::vector<size_t>& indexes, T* buffer) {
27  std::vector<T> new_buffer;
28  new_buffer.reserve(indexes.size());
29  for (const auto i : indexes) {
30  new_buffer.push_back(buffer[i]);
31  }
32  std::memcpy(buffer, new_buffer.data(), indexes.size() * sizeof(T));
33 }
34 
35 template <typename T>
36 void shuffleByIndexesImpl(const std::vector<size_t>& indexes, std::vector<T>& buffer) {
37  std::vector<T> new_buffer;
38  new_buffer.reserve(indexes.size());
39  for (const auto i : indexes) {
40  new_buffer.push_back(buffer[i]);
41  }
42  buffer.swap(new_buffer);
43 }
44 
46  const std::vector<size_t>& indexes,
47  DataBlockPtr& data) {
48  const auto& ti = cd->columnType;
49  switch (ti.get_type()) {
50  case kBOOLEAN:
51  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
52  break;
53  case kTINYINT:
54  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
55  break;
56  case kSMALLINT:
57  shuffleByIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
58  break;
59  case kINT:
60  shuffleByIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
61  break;
62  case kBIGINT:
63  case kNUMERIC:
64  case kDECIMAL:
65  shuffleByIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
66  break;
67  case kFLOAT:
68  shuffleByIndexesImpl(indexes, reinterpret_cast<float*>(data.numbersPtr));
69  break;
70  case kDOUBLE:
71  shuffleByIndexesImpl(indexes, reinterpret_cast<double*>(data.numbersPtr));
72  break;
73  case kTEXT:
74  case kVARCHAR:
75  case kCHAR:
76  if (ti.is_varlen()) {
77  shuffleByIndexesImpl(indexes, *data.stringsPtr);
78  } else {
79  switch (ti.get_size()) {
80  case 1:
81  shuffleByIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
82  break;
83  case 2:
84  shuffleByIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
85  break;
86  case 4:
87  shuffleByIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
88  break;
89  default:
90  CHECK(false);
91  }
92  }
93  break;
94  case kDATE:
95  case kTIME:
96  case kTIMESTAMP:
97  shuffleByIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
98  break;
99  case kARRAY:
100  shuffleByIndexesImpl(indexes, *data.arraysPtr);
101  break;
102  case kPOINT:
103  case kMULTIPOINT:
104  case kLINESTRING:
105  case kMULTILINESTRING:
106  case kPOLYGON:
107  case kMULTIPOLYGON:
108  shuffleByIndexesImpl(indexes, *data.stringsPtr);
109  break;
110  default:
111  CHECK(false);
112  }
113 }
114 
115 template <typename T>
116 void sortIndexesImpl(std::vector<size_t>& indexes, const T* buffer) {
117  CHECK(buffer);
118  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
119  return buffer[a] < buffer[b];
120  });
121 }
122 
123 void sortIndexesImpl(std::vector<size_t>& indexes,
124  const std::vector<std::string>& buffer) {
125  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
126  return buffer[a].size() < buffer[b].size() ||
127  (buffer[a].size() == buffer[b].size() && buffer[a] < buffer[b]);
128  });
129 }
130 
131 void sortIndexesImpl(std::vector<size_t>& indexes,
132  const std::vector<ArrayDatum>& buffer) {
133  std::sort(indexes.begin(), indexes.end(), [&](const auto a, const auto b) {
134  return buffer[a].is_null || buffer[a].length < buffer[b].length ||
135  (!buffer[b].is_null && buffer[a].length == buffer[b].length &&
136  memcmp(buffer[a].pointer, buffer[b].pointer, buffer[a].length) < 0);
137  });
138 }
139 
141  std::vector<size_t>& indexes,
142  const DataBlockPtr& data) {
143  const auto& ti = cd->columnType;
144  switch (ti.get_type()) {
145  case kBOOLEAN:
146  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
147  break;
148  case kTINYINT:
149  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
150  break;
151  case kSMALLINT:
152  sortIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
153  break;
154  case kINT:
155  sortIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
156  break;
157  case kBIGINT:
158  case kNUMERIC:
159  case kDECIMAL:
160  sortIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
161  break;
162  case kFLOAT:
163  sortIndexesImpl(indexes, reinterpret_cast<float*>(data.numbersPtr));
164  break;
165  case kDOUBLE:
166  sortIndexesImpl(indexes, reinterpret_cast<double*>(data.numbersPtr));
167  break;
168  case kTEXT:
169  case kVARCHAR:
170  case kCHAR:
171  if (ti.is_varlen()) {
172  sortIndexesImpl(indexes, *data.stringsPtr);
173  } else {
174  switch (ti.get_size()) {
175  case 1:
176  sortIndexesImpl(indexes, reinterpret_cast<int8_t*>(data.numbersPtr));
177  break;
178  case 2:
179  sortIndexesImpl(indexes, reinterpret_cast<int16_t*>(data.numbersPtr));
180  break;
181  case 4:
182  sortIndexesImpl(indexes, reinterpret_cast<int32_t*>(data.numbersPtr));
183  break;
184  default:
185  CHECK(false);
186  }
187  }
188  break;
189  case kDATE:
190  case kTIME:
191  case kTIMESTAMP:
192  sortIndexesImpl(indexes, reinterpret_cast<int64_t*>(data.numbersPtr));
193  break;
194  case kARRAY:
195  sortIndexesImpl(indexes, *data.arraysPtr);
196  break;
197  default:
198  CHECK(false) << "invalid type '" << ti.get_type() << "' to sort";
199  }
200 }
201 
203  // coming here table must have defined a sort_column for mini sort
204  const auto table_desc = catalog_->getMetadataForTable(physicalTableId_);
205  CHECK(table_desc);
206  CHECK_GT(table_desc->sortedColumnId, 0);
207  const auto logical_cd =
208  catalog_->getMetadataForColumn(table_desc->tableId, table_desc->sortedColumnId);
209  CHECK(logical_cd);
210  const auto physical_cd = catalog_->getMetadataForColumn(
211  table_desc->tableId,
212  table_desc->sortedColumnId + (logical_cd->columnType.is_geometry() ? 1 : 0));
213  const auto it = std::find(insertDataStruct.columnIds.begin(),
214  insertDataStruct.columnIds.end(),
215  physical_cd->columnId);
216  CHECK(it != insertDataStruct.columnIds.end());
217  // sort row indexes of the sort column
218  const auto dist = std::distance(insertDataStruct.columnIds.begin(), it);
219  if (!insertDataStruct.is_default[dist]) {
220  std::vector<size_t> indexes(insertDataStruct.numRows);
221  std::iota(indexes.begin(), indexes.end(), 0);
222  CHECK_LT(static_cast<size_t>(dist), insertDataStruct.data.size());
223  sortIndexes(physical_cd, indexes, insertDataStruct.data[dist]);
224  // shuffle rows of all columns
225  for (size_t i = 0; i < insertDataStruct.columnIds.size(); ++i) {
226  if (insertDataStruct.is_default[i]) {
227  continue;
228  }
229  const auto cd = catalog_->getMetadataForColumn(table_desc->tableId,
230  insertDataStruct.columnIds[i]);
231  shuffleByIndexes(cd, indexes, insertDataStruct.data[i]);
232  }
233  } else {
234  // nothing to shuffle, the column has the same value across all rows
235  return;
236  }
237 }
238 
239 } // namespace Fragmenter_Namespace
Definition: sqltypes.h:76
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:234
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:235
DEVICE void sort(ARGS &&...args)
Definition: gpu_enabled.h:105
std::vector< bool > is_default
Definition: Fragmenter.h:75
void sortIndexesImpl(std::vector< size_t > &indexes, const T *buffer)
void shuffleByIndexes(const ColumnDescriptor *cd, const std::vector< size_t > &indexes, DataBlockPtr &data)
#define CHECK_GT(x, y)
Definition: Logger.h:305
constexpr double a
Definition: Utm.h:32
virtual void sortData(InsertData &insertDataStruct)
size_t numRows
a vector of column ids for the row(s) being inserted
Definition: Fragmenter.h:72
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
specifies the content in-memory of a row in the column metadata table
void sortIndexes(const ColumnDescriptor *cd, std::vector< size_t > &indexes, const DataBlockPtr &data)
#define CHECK_LT(x, y)
Definition: Logger.h:303
Definition: sqltypes.h:79
Definition: sqltypes.h:80
std::vector< DataBlockPtr > data
the number of rows being inserted
Definition: Fragmenter.h:73
DEVICE void iota(ARGS &&...args)
Definition: gpu_enabled.h:69
Definition: sqltypes.h:68
#define CHECK(condition)
Definition: Logger.h:291
The data to be inserted using the fragment manager.
Definition: Fragmenter.h:68
Definition: sqltypes.h:72
SQLTypeInfo columnType
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
int8_t * numbersPtr
Definition: sqltypes.h:233
std::vector< int > columnIds
identifies the table into which the data is being inserted
Definition: Fragmenter.h:71
void shuffleByIndexesImpl(const std::vector< size_t > &indexes, T *buffer)