OmniSciDB  5ade3759e0
PopulateTableRandom.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #include <boost/functional/hash.hpp>
26 #include <cfloat>
27 #include <cstdint>
28 #include <cstdlib>
29 #include <cstring>
30 #include <exception>
31 #include <iostream>
32 #include <memory>
33 #include <random>
34 #include <string>
35 
36 #include "../Catalog/Catalog.h"
37 #include "../DataMgr/DataMgr.h"
38 #include "../Fragmenter/Fragmenter.h"
39 #include "../Shared/DateConverters.h"
40 #include "../Shared/measure.h"
41 #include "../Shared/sqltypes.h"
42 #include "Shared/Logger.h"
43 
44 using namespace Catalog_Namespace;
45 using namespace Fragmenter_Namespace;
46 
47 size_t random_fill_int16(int8_t* buf, size_t num_elems) {
48  std::default_random_engine gen;
49  std::uniform_int_distribution<int16_t> dist(INT16_MIN, INT16_MAX);
50  auto p = reinterpret_cast<int16_t*>(buf);
51  size_t hash = 0;
52  for (size_t i = 0; i < num_elems; i++) {
53  p[i] = dist(gen);
54  boost::hash_combine(hash, p[i]);
55  }
56  return hash;
57 }
58 
59 size_t random_fill_int32(int8_t* buf, size_t num_elems) {
60  std::default_random_engine gen;
61  std::uniform_int_distribution<int32_t> dist(INT32_MIN, INT32_MAX);
62  auto p = reinterpret_cast<int32_t*>(buf);
63  size_t hash = 0;
64  for (size_t i = 0; i < num_elems; i++) {
65  p[i] = dist(gen);
66  boost::hash_combine(hash, p[i]);
67  }
68  return hash;
69 }
70 
71 size_t random_fill_int64(int8_t* buf, size_t num_elems, int64_t min, int64_t max) {
72  std::default_random_engine gen;
73  std::uniform_int_distribution<int64_t> dist(min, max);
74  auto p = reinterpret_cast<int64_t*>(buf);
75  size_t hash = 0;
76  for (size_t i = 0; i < num_elems; i++) {
77  p[i] = dist(gen);
78  boost::hash_combine(hash, p[i]);
79  }
80  return hash;
81 }
82 
83 size_t random_fill_int64(int8_t* buf, size_t num_elems) {
84  return random_fill_int64(buf, num_elems, INT64_MIN, INT64_MAX);
85 }
86 
87 size_t random_fill_float(int8_t* buf, size_t num_elems) {
88  std::default_random_engine gen;
89  std::uniform_real_distribution<float> dist(FLT_MIN, FLT_MAX);
90  auto p = reinterpret_cast<float*>(buf);
91  size_t hash = 0;
92  for (size_t i = 0; i < num_elems; i++) {
93  p[i] = dist(gen);
94  boost::hash_combine(hash, p[i]);
95  }
96  return hash;
97 }
98 
99 size_t random_fill_double(int8_t* buf, size_t num_elems) {
100  std::default_random_engine gen;
101  std::uniform_real_distribution<double> dist(DBL_MIN, DBL_MAX);
102  auto p = reinterpret_cast<double*>(buf);
103  size_t hash = 0;
104  for (size_t i = 0; i < num_elems; i++) {
105  p[i] = dist(gen);
106  boost::hash_combine(hash, p[i]);
107  }
108  return hash;
109 }
110 
111 size_t random_fill_string(std::vector<std::string>& stringVec,
112  size_t num_elems,
113  int max_len,
114  size_t& data_volumn) {
115  std::string chars("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890");
116  std::default_random_engine gen;
117  std::uniform_int_distribution<> char_dist(0, chars.size() - 1);
118  std::uniform_int_distribution<> len_dist(0, max_len);
119  size_t hash = 0;
120  std::hash<std::string> string_hash;
121  for (size_t n = 0; n < num_elems; n++) {
122  int len = len_dist(gen);
123  std::string s(len, ' ');
124  for (int i = 0; i < len; i++) {
125  {
126  s[i] = chars[char_dist(gen)];
127  }
128  }
129  stringVec[n] = s;
130  boost::hash_combine(hash, string_hash(s));
131  data_volumn += len;
132  }
133  return hash;
134 }
135 
136 size_t random_fill_int8array(std::vector<std::vector<int8_t>>& stringVec,
137  size_t num_elems,
138  int max_len,
139  size_t& data_volumn) {
140  std::default_random_engine gen;
141  std::uniform_int_distribution<int8_t> dist(INT8_MIN, INT8_MAX);
142  std::uniform_int_distribution<> len_dist(0, max_len);
143  size_t hash = 0;
144  for (size_t n = 0; n < num_elems; n++) {
145  int len = len_dist(gen);
146  std::vector<int8_t> s(len);
147  for (int i = 0; i < len; i++) {
148  s[i] = dist(gen);
149  boost::hash_combine(hash, s[i]);
150  }
151  stringVec[n] = s;
152  data_volumn += len * sizeof(int8_t);
153  }
154  return hash;
155 }
156 
157 size_t random_fill_int16array(std::vector<std::vector<int16_t>>& stringVec,
158  size_t num_elems,
159  int max_len,
160  size_t& data_volumn) {
161  std::default_random_engine gen;
162  std::uniform_int_distribution<int16_t> dist(INT16_MIN, INT16_MAX);
163  std::uniform_int_distribution<> len_dist(0, max_len / 2);
164  size_t hash = 0;
165  for (size_t n = 0; n < num_elems; n++) {
166  int len = len_dist(gen);
167  std::vector<int16_t> s(len);
168  for (int i = 0; i < len; i++) {
169  s[i] = dist(gen);
170  boost::hash_combine(hash, s[i]);
171  }
172  stringVec[n] = s;
173  data_volumn += len * sizeof(int16_t);
174  }
175  return hash;
176 }
177 
178 size_t random_fill_int32array(std::vector<std::vector<int32_t>>& stringVec,
179  size_t num_elems,
180  int max_len,
181  size_t& data_volumn) {
182  std::default_random_engine gen;
183  std::uniform_int_distribution<int32_t> dist(INT32_MIN, INT32_MAX);
184  std::uniform_int_distribution<> len_dist(0, max_len / 4);
185  size_t hash = 0;
186  for (size_t n = 0; n < num_elems; n++) {
187  int len = len_dist(gen);
188  std::vector<int32_t> s(len);
189  for (int i = 0; i < len; i++) {
190  s[i] = dist(gen);
191  boost::hash_combine(hash, s[i]);
192  }
193  stringVec[n] = s;
194  data_volumn += len * sizeof(int32_t);
195  }
196  return hash;
197 }
198 
199 size_t random_fill_dates(int8_t* buf, size_t num_elems) {
200  constexpr int64_t kDateMin = -185542587187200;
201  constexpr int64_t kDateMax = 185542587100800;
202  std::default_random_engine gen;
203  std::uniform_int_distribution<int64_t> dist(kDateMin, kDateMax);
204  auto p = reinterpret_cast<int64_t*>(buf);
205  size_t hash = 0;
206  for (size_t i = 0; i < num_elems; i++) {
207  p[i] = dist(gen);
209  }
210  return hash;
211 }
212 
213 #define MAX_TEXT_LEN 255
214 
215 size_t random_fill(const ColumnDescriptor* cd,
216  DataBlockPtr p,
217  size_t num_elems,
218  size_t& data_volumn) {
219  size_t hash = 0;
220  switch (cd->columnType.get_type()) {
221  case kSMALLINT:
222  hash = random_fill_int16(p.numbersPtr, num_elems);
223  data_volumn += num_elems * sizeof(int16_t);
224  break;
225  case kINT:
226  hash = random_fill_int32(p.numbersPtr, num_elems);
227  data_volumn += num_elems * sizeof(int32_t);
228  break;
229  case kBIGINT:
230  hash = random_fill_int64(p.numbersPtr, num_elems, INT64_MIN, INT64_MAX);
231  data_volumn += num_elems * sizeof(int64_t);
232  break;
233  case kNUMERIC:
234  case kDECIMAL: {
235  int64_t max = std::pow((double)10, cd->columnType.get_precision());
236  int64_t min = -max;
237  hash = random_fill_int64(p.numbersPtr, num_elems, min, max);
238  data_volumn += num_elems * sizeof(int64_t);
239  break;
240  }
241  case kFLOAT:
242  hash = random_fill_float(p.numbersPtr, num_elems);
243  data_volumn += num_elems * sizeof(float);
244  break;
245  case kDOUBLE:
246  hash = random_fill_double(p.numbersPtr, num_elems);
247  data_volumn += num_elems * sizeof(double);
248  break;
249  case kVARCHAR:
250  case kCHAR:
252  {
253  hash = random_fill_string(
254  *p.stringsPtr, num_elems, cd->columnType.get_dimension(), data_volumn);
255  }
256  }
257  break;
258  case kTEXT:
260  {
261  hash = random_fill_string(*p.stringsPtr, num_elems, MAX_TEXT_LEN, data_volumn);
262  }
263  }
264  break;
265  case kDATE:
266  case kTIME:
267  case kTIMESTAMP:
268  hash = cd->columnType.get_type() == kDATE
269  ? random_fill_dates(p.numbersPtr, num_elems)
270  : random_fill_int64(p.numbersPtr, num_elems);
271  data_volumn += num_elems * sizeof(int64_t);
272  break;
273  default:
274  assert(false);
275  }
276  return hash;
277 }
278 
279 std::vector<size_t> populate_table_random(const std::string& table_name,
280  const size_t num_rows,
281  const Catalog& cat) {
282  const TableDescriptor* td = cat.getMetadataForTable(table_name);
283  const auto cds = cat.getAllColumnMetadataForTable(td->tableId, false, false, false);
284  InsertData insert_data;
285  insert_data.databaseId = cat.getCurrentDB().dbId;
286  insert_data.tableId = td->tableId;
287  for (const auto& cd : cds) {
288  insert_data.columnIds.push_back(cd->columnId);
289  }
290  insert_data.numRows = num_rows;
291  std::vector<std::vector<int8_t>> numbers_vec;
292  std::vector<std::unique_ptr<std::vector<std::string>>> strings_vec;
293 
294  DataBlockPtr p{0};
295  // now allocate space for insert data
296  for (auto cd : cds) {
297  if (cd->columnType.is_varlen()) {
298  if (cd->columnType.get_compression() == kENCODING_NONE) {
299  strings_vec.push_back(std::make_unique<std::vector<std::string>>(num_rows));
300  p.stringsPtr = strings_vec.back().get();
301  } else {
302  CHECK(false);
303  }
304  } else {
305  numbers_vec.emplace_back(num_rows * cd->columnType.get_logical_size());
306  p.numbersPtr = numbers_vec.back().data();
307  }
308  insert_data.data.push_back(p);
309  }
310 
311  // fill InsertData with random data
312  std::vector<size_t> col_hashs(
313  cds.size()); // compute one hash per column for the generated data
314  int i = 0;
315  size_t data_volumn = 0;
316  for (auto cd : cds) {
317  col_hashs[i] = random_fill(cd, insert_data.data[i], num_rows, data_volumn);
318  i++;
319  }
320 
321  // now load the data into table
322  auto ms = measure<>::execution([&]() { td->fragmenter->insertData(insert_data); });
323  std::cout << "Loaded " << num_rows << " rows " << data_volumn << " bytes in " << ms
324  << " ms. at " << (double)data_volumn / (ms / 1000.0) / 1e6 << " MB/sec."
325  << std::endl;
326 
327  return col_hashs;
328 }
int get_precision() const
Definition: sqltypes.h:326
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:325
const int8_t const int64_t * num_rows
class for a per-database catalog. also includes metadata for the current database and the current use...
Definition: Catalog.h:81
Definition: sqltypes.h:51
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:138
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
size_t random_fill(const ColumnDescriptor *cd, DataBlockPtr p, size_t num_elems, size_t &data_volumn)
size_t random_fill_int16(int8_t *buf, size_t num_elems)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:323
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:331
size_t random_fill_int32(int8_t *buf, size_t num_elems)
size_t random_fill_float(int8_t *buf, size_t num_elems)
int tableId
identifies the database into which the data is being inserted
Definition: Fragmenter.h:61
size_t numRows
a vector of column ids for the row(s) being inserted
Definition: Fragmenter.h:63
std::vector< size_t > populate_table_random(const std::string &table_name, const size_t num_rows, const Catalog &cat)
size_t random_fill_int32array(std::vector< std::vector< int32_t >> &stringVec, size_t num_elems, int max_len, size_t &data_volumn)
specifies the content in-memory of a row in the column metadata table
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:176
void hash_combine(std::size_t &seed, T const &v)
Definition: sqltypes.h:54
Definition: sqltypes.h:55
size_t random_fill_int64(int8_t *buf, size_t num_elems, int64_t min, int64_t max)
std::vector< DataBlockPtr > data
the number of rows being inserted
Definition: Fragmenter.h:64
size_t random_fill_int8array(std::vector< std::vector< int8_t >> &stringVec, size_t num_elems, int max_len, size_t &data_volumn)
std::list< const ColumnDescriptor * > getAllColumnMetadataForTable(const int tableId, const bool fetchSystemColumns, const bool fetchVirtualColumns, const bool fetchPhysicalColumns) const
Returns a list of pointers to constant ColumnDescriptor structs for all the columns from a particular...
Definition: Catalog.cpp:1579
size_t random_fill_double(int8_t *buf, size_t num_elems)
Definition: sqltypes.h:43
size_t random_fill_dates(int8_t *buf, size_t num_elems)
size_t random_fill_int16array(std::vector< std::vector< int16_t >> &stringVec, size_t num_elems, int max_len, size_t &data_volumn)
#define CHECK(condition)
Definition: Logger.h:187
The data to be inserted using the fragment manager.
Definition: Fragmenter.h:59
static TimeT::rep execution(F func, Args &&... args)
Definition: sample.cpp:29
int64_t get_epoch_days_from_seconds(const int64_t seconds)
Definition: sqltypes.h:47
SQLTypeInfo columnType
specifies the content in-memory of a row in the table metadata table
size_t random_fill_string(std::vector< std::string > &stringVec, size_t num_elems, int max_len, size_t &data_volumn)
int8_t * numbersPtr
Definition: sqltypes.h:137
Fragmenter_Namespace::AbstractFragmenter * fragmenter
std::vector< int > columnIds
identifies the table into which the data is being inserted
Definition: Fragmenter.h:62
virtual void insertData(InsertData &insertDataStruct)=0
Given data wrapped in an InsertData struct, inserts it into the correct partitions with locks and che...
#define MAX_TEXT_LEN