OmniSciDB  6686921089
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DdlUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2020 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DdlUtils.h"
18 
19 #include <unordered_set>
20 
21 #include <boost/algorithm/string.hpp>
22 #include <boost/filesystem.hpp>
23 #include <boost/program_options.hpp>
24 
25 #include "rapidjson/document.h"
26 
28 #include "Geospatial/Types.h"
30 #include "Shared/file_path_util.h"
31 #include "Shared/misc.h"
32 #include "Shared/sqltypes.h"
33 
35 
36 namespace ddl_utils {
37 SqlType::SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
38  : type(type)
39  , param1(param1)
40  , param2(param2)
41  , is_array(is_array)
42  , array_size(array_size) {}
43 
45  return type;
46 }
47 
48 int SqlType::get_param1() const {
49  return param1;
50 }
51 
52 void SqlType::set_param1(int param) {
53  param1 = param;
54 }
55 
56 int SqlType::get_param2() const {
57  return param2;
58 }
59 
60 bool SqlType::get_is_array() const {
61  return is_array;
62 }
63 
65  is_array = a;
66 }
67 
69  return array_size;
70 }
71 
73  array_size = s;
74 }
75 
76 std::string SqlType::to_string() const {
77  std::string str;
78  switch (type) {
79  case kBOOLEAN:
80  str = "BOOLEAN";
81  break;
82  case kCHAR:
83  str = "CHAR(" + boost::lexical_cast<std::string>(param1) + ")";
84  break;
85  case kVARCHAR:
86  str = "VARCHAR(" + boost::lexical_cast<std::string>(param1) + ")";
87  break;
88  case kTEXT:
89  str = "TEXT";
90  break;
91  case kNUMERIC:
92  str = "NUMERIC(" + boost::lexical_cast<std::string>(param1);
93  if (param2 > 0) {
94  str += ", " + boost::lexical_cast<std::string>(param2);
95  }
96  str += ")";
97  break;
98  case kDECIMAL:
99  str = "DECIMAL(" + boost::lexical_cast<std::string>(param1);
100  if (param2 > 0) {
101  str += ", " + boost::lexical_cast<std::string>(param2);
102  }
103  str += ")";
104  break;
105  case kBIGINT:
106  str = "BIGINT";
107  break;
108  case kINT:
109  str = "INT";
110  break;
111  case kTINYINT:
112  str = "TINYINT";
113  break;
114  case kSMALLINT:
115  str = "SMALLINT";
116  break;
117  case kFLOAT:
118  str = "FLOAT";
119  break;
120  case kDOUBLE:
121  str = "DOUBLE";
122  break;
123  case kTIME:
124  str = "TIME";
125  if (param1 < 6) {
126  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
127  }
128  break;
129  case kTIMESTAMP:
130  str = "TIMESTAMP";
131  if (param1 <= 9) {
132  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
133  }
134  break;
135  case kDATE:
136  str = "DATE";
137  break;
138  default:
139  assert(false);
140  break;
141  }
142  if (is_array) {
143  str += "[";
144  if (array_size > 0) {
145  str += boost::lexical_cast<std::string>(array_size);
146  }
147  str += "]";
148  }
149  return str;
150 }
151 
153  switch (type) {
154  case kCHAR:
155  case kVARCHAR:
156  if (param1 <= 0) {
157  throw std::runtime_error("CHAR and VARCHAR must have a positive dimension.");
158  }
159  break;
160  case kDECIMAL:
161  case kNUMERIC:
162  if (param1 <= 0) {
163  throw std::runtime_error("DECIMAL and NUMERIC must have a positive precision.");
164  } else if (param1 > 19) {
165  throw std::runtime_error(
166  "DECIMAL and NUMERIC precision cannot be larger than 19.");
167  } else if (param1 <= param2) {
168  throw std::runtime_error(
169  "DECIMAL and NUMERIC must have precision larger than scale.");
170  }
171  break;
172  case kTIMESTAMP:
173  if (param1 == -1) {
174  param1 = 0; // set default to 0
175  } else if (param1 != 0 && param1 != 3 && param1 != 6 &&
176  param1 != 9) { // support ms, us, ns
177  throw std::runtime_error(
178  "Only TIMESTAMP(n) where n = (0,3,6,9) are supported now.");
179  }
180  break;
181  case kTIME:
182  if (param1 == -1) {
183  param1 = 0; // default precision is 0
184  }
185  if (param1 > 0) { // @TODO(wei) support sub-second precision later.
186  throw std::runtime_error("Only TIME(0) is supported now.");
187  }
188  break;
189  case kPOINT:
190  case kLINESTRING:
191  case kPOLYGON:
192  case kMULTIPOLYGON:
193  // Storing SRID in param1
194  break;
195  default:
196  param1 = 0;
197  break;
198  }
199 }
200 
201 Encoding::Encoding(std::string* encoding_name, int encoding_param)
202  : encoding_name(encoding_name), encoding_param(encoding_param) {}
203 
204 const std::string* Encoding::get_encoding_name() const {
205  return encoding_name.get();
206 }
207 
209  return encoding_param;
210 }
211 
213  // Change default TEXT column behaviour to be DICT encoded
214  if (cd.columnType.is_string() || cd.columnType.is_string_array()) {
215  // default to 32-bits
217  cd.columnType.set_comp_param(32);
218  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 4) {
220  cd.columnType.set_comp_param(16);
221  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 9) {
223  cd.columnType.set_comp_param(32);
224  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() > 18) {
225  throw std::runtime_error(cd.columnName + ": Precision too high, max 18.");
226  } else if (cd.columnType.is_geometry() && cd.columnType.get_output_srid() == 4326) {
227  // default to GEOINT 32-bits
229  cd.columnType.set_comp_param(32);
231  // Days encoding for DATE
234  } else {
237  }
238 }
239 
241  int encoding_size,
242  const SqlType* column_type) {
243  auto type = cd.columnType.get_type();
244  // fixed-bits encoding
245  if (type == kARRAY) {
246  type = cd.columnType.get_subtype();
247  switch (type) {
248  case kTINYINT:
249  case kSMALLINT:
250  case kINT:
251  case kBIGINT:
252  case kDATE:
253  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
254  column_type->to_string() + " type array.");
255  break;
256  default:
257  break;
258  }
259  }
260 
261  if (!IS_INTEGER(type) && !is_datetime(type) &&
262  !(type == kDECIMAL || type == kNUMERIC)) {
263  throw std::runtime_error(
264  cd.columnName +
265  ": Fixed encoding is only supported for integer or time columns.");
266  }
267 
268  switch (type) {
269  case kSMALLINT:
270  if (encoding_size != 8) {
271  throw std::runtime_error(
272  cd.columnName +
273  ": Compression parameter for Fixed encoding on SMALLINT must be 8.");
274  }
275  break;
276  case kINT:
277  if (encoding_size != 8 && encoding_size != 16) {
278  throw std::runtime_error(
279  cd.columnName +
280  ": Compression parameter for Fixed encoding on INTEGER must be 8 or 16.");
281  }
282  break;
283  case kBIGINT:
284  if (encoding_size != 8 && encoding_size != 16 && encoding_size != 32) {
285  throw std::runtime_error(cd.columnName +
286  ": Compression parameter for Fixed encoding on "
287  "BIGINT must be 8 or 16 or 32.");
288  }
289  break;
290  case kTIMESTAMP:
291  case kTIME:
292  if (encoding_size != 32) {
293  throw std::runtime_error(cd.columnName +
294  ": Compression parameter for Fixed encoding on "
295  "TIME or TIMESTAMP must be 32.");
296  } else if (cd.columnType.is_high_precision_timestamp()) {
297  throw std::runtime_error("Fixed encoding is not supported for TIMESTAMP(3|6|9).");
298  }
299  break;
300  case kDECIMAL:
301  case kNUMERIC:
302  if (encoding_size != 32 && encoding_size != 16) {
303  throw std::runtime_error(cd.columnName +
304  ": Compression parameter for Fixed encoding on "
305  "DECIMAL must be 16 or 32.");
306  }
307 
308  if (encoding_size == 32 && cd.columnType.get_precision() > 9) {
309  throw std::runtime_error(cd.columnName +
310  ": Precision too high for Fixed(32) encoding, max 9.");
311  }
312 
313  if (encoding_size == 16 && cd.columnType.get_precision() > 4) {
314  throw std::runtime_error(cd.columnName +
315  ": Precision too high for Fixed(16) encoding, max 4.");
316  }
317  break;
318  case kDATE:
319  if (encoding_size != 32 && encoding_size != 16) {
320  throw std::runtime_error(cd.columnName +
321  ": Compression parameter for Fixed encoding on "
322  "DATE must be 16 or 32.");
323  }
324  break;
325  default:
326  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
327  column_type->to_string());
328  }
329  if (type == kDATE) {
331  cd.columnType.set_comp_param(16);
332  } else {
334  cd.columnType.set_comp_param(encoding_size);
335  }
336 }
337 
339  if (!cd.columnType.is_string() && !cd.columnType.is_string_array()) {
340  throw std::runtime_error(
341  cd.columnName +
342  ": Dictionary encoding is only supported on string or string array columns.");
343  }
344  int comp_param;
345  if (encoding_size == 0) {
346  comp_param = 32; // default to 32-bits
347  } else {
348  comp_param = encoding_size;
349  }
350  if (cd.columnType.is_string_array() && comp_param != 32) {
351  throw std::runtime_error(cd.columnName +
352  ": Compression parameter for string arrays must be 32");
353  }
354  if (comp_param != 8 && comp_param != 16 && comp_param != 32) {
355  throw std::runtime_error(
356  cd.columnName +
357  ": Compression parameter for Dictionary encoding must be 8 or 16 or 32.");
358  }
359  // dictionary encoding
361  cd.columnType.set_comp_param(comp_param);
362 }
363 
365  if (!cd.columnType.is_string() && !cd.columnType.is_string_array() &&
366  !cd.columnType.is_geometry()) {
367  throw std::runtime_error(
368  cd.columnName +
369  ": None encoding is only supported on string, string array, or geo columns.");
370  }
373 }
374 
376  // sparse column encoding with mostly NULL values
377  if (cd.columnType.get_notnull()) {
378  throw std::runtime_error(cd.columnName +
379  ": Cannot do sparse column encoding on a NOT NULL column.");
380  }
381  if (encoding_size == 0 || encoding_size % 8 != 0 || encoding_size > 48) {
382  throw std::runtime_error(
383  cd.columnName +
384  "Must specify number of bits as 8, 16, 24, 32 or 48 as the parameter to "
385  "sparse-column encoding.");
386  }
388  cd.columnType.set_comp_param(encoding_size);
389  // throw std::runtime_error("SPARSE encoding not supported yet.");
390 }
391 
393  if (!cd.columnType.is_geometry() || cd.columnType.get_output_srid() != 4326) {
394  throw std::runtime_error(
395  cd.columnName + ": COMPRESSED encoding is only supported on WGS84 geo columns.");
396  }
397  int comp_param;
398  if (encoding_size == 0) {
399  comp_param = 32; // default to 32-bits
400  } else {
401  comp_param = encoding_size;
402  }
403  if (comp_param != 32) {
404  throw std::runtime_error(cd.columnName +
405  ": only 32-bit COMPRESSED geo encoding is supported");
406  }
407  // encoding longitude/latitude as integers
409  cd.columnType.set_comp_param(comp_param);
410 }
411 
412 void validate_and_set_date_encoding(ColumnDescriptor& cd, int encoding_size) {
413  // days encoding for dates
414  if (cd.columnType.get_type() == kARRAY && cd.columnType.get_subtype() == kDATE) {
415  throw std::runtime_error(cd.columnName +
416  ": Cannot apply days encoding to date array.");
417  }
418  if (cd.columnType.get_type() != kDATE) {
419  throw std::runtime_error(cd.columnName +
420  ": Days encoding is only supported for DATE columns.");
421  }
422  if (encoding_size != 32 && encoding_size != 16) {
423  throw std::runtime_error(cd.columnName +
424  ": Compression parameter for Days encoding on "
425  "DATE must be 16 or 32.");
426  }
428  cd.columnType.set_comp_param((encoding_size == 16) ? 16 : 0);
429 }
430 
432  const Encoding* encoding,
433  const SqlType* column_type) {
434  if (encoding == nullptr) {
436  } else {
437  const std::string& comp = *encoding->get_encoding_name();
438  if (boost::iequals(comp, "fixed")) {
439  validate_and_set_fixed_encoding(cd, encoding->get_encoding_param(), column_type);
440  } else if (boost::iequals(comp, "rl")) {
441  // run length encoding
444  // throw std::runtime_error("RL(Run Length) encoding not supported yet.");
445  } else if (boost::iequals(comp, "diff")) {
446  // differential encoding
449  // throw std::runtime_error("DIFF(differential) encoding not supported yet.");
450  } else if (boost::iequals(comp, "dict")) {
452  } else if (boost::iequals(comp, "NONE")) {
454  } else if (boost::iequals(comp, "sparse")) {
456  } else if (boost::iequals(comp, "compressed")) {
458  } else if (boost::iequals(comp, "days")) {
460  } else {
461  throw std::runtime_error(cd.columnName + ": Invalid column compression scheme " +
462  comp);
463  }
464  }
465 }
466 
468  column_type->check_type();
469 
470  if (column_type->get_is_array()) {
472  cd.columnType.set_subtype(column_type->get_type());
473  } else {
474  cd.columnType.set_type(column_type->get_type());
475  }
476  if (IS_GEO(column_type->get_type())) {
477  cd.columnType.set_subtype(static_cast<SQLTypes>(column_type->get_param1()));
478  cd.columnType.set_input_srid(column_type->get_param2());
479  cd.columnType.set_output_srid(column_type->get_param2());
480  } else {
481  cd.columnType.set_dimension(column_type->get_param1());
482  cd.columnType.set_scale(column_type->get_param2());
483  }
484 }
485 
486 void validate_and_set_array_size(ColumnDescriptor& cd, const SqlType* column_type) {
487  if (cd.columnType.is_string_array() &&
489  throw std::runtime_error(
490  cd.columnName +
491  ": Array of strings must be dictionary encoded. Specify ENCODING DICT");
492  }
493 
494  if (column_type->get_is_array()) {
495  int s = -1;
496  auto array_size = column_type->get_array_size();
497  if (array_size > 0) {
498  auto sti = cd.columnType.get_elem_type();
499  s = array_size * sti.get_size();
500  if (s <= 0) {
501  throw std::runtime_error(cd.columnName + ": Unexpected fixed length array size");
502  }
503  }
504  cd.columnType.set_size(s);
505 
506  } else {
508  }
509 }
510 
511 namespace {
512 
513 void validate_literal(const std::string& val,
514  SQLTypeInfo column_type,
515  const std::string& column_name) {
516  if (to_upper(val) == "NULL") {
517  return;
518  }
519  switch (column_type.get_type()) {
520  case kBOOLEAN:
521  case kTINYINT:
522  case kSMALLINT:
523  case kINT:
524  case kBIGINT:
525  case kFLOAT:
526  case kDOUBLE:
527  case kTIME:
528  case kTIMESTAMP:
529  StringToDatum(val, column_type);
530  break;
531  case kDATE: {
532  auto d = StringToDatum(val, column_type);
533  DateDaysOverflowValidator validator(column_type);
534  validator.validate(d.bigintval);
535  break;
536  }
537  case kDECIMAL:
538  case kNUMERIC: {
539  SQLTypeInfo ti(kNUMERIC, 0, 0, false);
540  auto d = StringToDatum(val, ti);
541  auto converted_val = convert_decimal_value_to_scale(d.bigintval, ti, column_type);
542  DecimalOverflowValidator validator(column_type);
543  validator.validate(converted_val);
544  break;
545  }
546  case kTEXT:
547  case kVARCHAR:
548  case kCHAR:
549  if (val.length() > StringDictionary::MAX_STRLEN) {
550  throw std::runtime_error("String too long for column " + column_name + " was " +
551  std::to_string(val.length()) + " max is " +
553  }
554  break;
555  case kARRAY: {
556  if (val.front() != '{' || val.back() != '}') {
557  throw std::runtime_error(column_name +
558  ": arrays should start and end with curly braces");
559  }
560  std::vector<std::string> elements = split(val.substr(1, val.length() - 2), ", ");
561  if (column_type.get_size() > 0) {
562  auto sti = column_type.get_elem_type();
563  size_t expected_size = column_type.get_size() / sti.get_size();
564  size_t actual_size = elements.size();
565  if (actual_size != expected_size) {
566  throw std::runtime_error("Fixed length array column " + column_name +
567  " expects " + std::to_string(expected_size) +
568  " values, received " + std::to_string(actual_size));
569  }
570  }
571  SQLTypeInfo element_ti = column_type.get_elem_type();
572  for (const auto& element : elements) {
573  if (to_upper(element) != "NULL") {
574  validate_literal(element, element_ti, column_name);
575  }
576  }
577  break;
578  }
579  case kPOINT:
580  case kLINESTRING:
581  case kPOLYGON:
582  case kMULTIPOLYGON:
583  if (val.empty()) {
584  return;
585  }
586  try {
588  if (!geo) {
589  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
590  column_name);
591  }
592  if (!geo->transform(column_type)) {
593  throw std::runtime_error("Cannot transform SRID for literal '" + val +
594  "' for column " + column_name);
595  } else {
596  auto sql_type = column_type.get_type();
597  auto geo_type = geo->getType();
598  if ((geo_type == Geospatial::GeoBase::GeoType::kPOINT && sql_type != kPOINT) ||
600  sql_type != kLINESTRING) ||
602  sql_type != kPOLYGON) ||
604  sql_type != kMULTIPOLYGON)) {
605  throw std::runtime_error("Geo literal '" + val +
606  "' doesn't match the type "
607  "of column column " +
608  column_name);
609  }
610  }
611  } catch (Geospatial::GeoTypesError& e) {
612  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
613  column_name + ": " + e.what());
614  }
615  break;
616  default:
617  CHECK(false) << "validate_literal() does not support type "
618  << column_type.get_type();
619  }
620 }
621 
622 } // namespace
623 
625  const std::string* default_value,
626  bool not_null) {
627  bool is_null_literal =
628  default_value && ((to_upper(*default_value) == "NULL") ||
629  (cd.columnType.is_geometry() && default_value->empty()));
630  if (not_null && (is_null_literal)) {
631  throw std::runtime_error(cd.columnName +
632  ": cannot set default value to NULL for "
633  "NOT NULL column");
634  }
635  if (!default_value || is_null_literal) {
636  cd.default_value = std::nullopt;
637  return;
638  }
639  const auto& column_type = cd.columnType;
640  const auto& val = *default_value;
641  validate_literal(val, column_type, cd.columnName);
642  cd.default_value = std::make_optional(*default_value);
643 }
644 
// Fills `cd` for a user-defined column from the parsed DDL pieces.
//
// The steps run in a fixed order: name, type, NOT NULL flag, encoding, array
// size, system/virtual flags, default value. Each validate_and_set_* helper
// may throw, in which case `cd` is left with only the fields set so far.
//
//   column_name   - name stored in cd.columnName.
//   cd            - descriptor being populated.
//   column_type   - parsed type; passed as a mutable pointer to the type helper.
//   not_null      - whether the column is declared NOT NULL.
//   encoding      - optional encoding clause, forwarded to the encoding helper.
//   default_value - optional default literal, forwarded to the default-value helper.
645 void set_column_descriptor(const std::string& column_name,
646  ColumnDescriptor& cd,
647  SqlType* column_type,
648  const bool not_null,
649  const Encoding* encoding,
650  const std::string* default_value) {
651  cd.columnName = column_name;
652  validate_and_set_type(cd, column_type);
653  cd.columnType.set_notnull(not_null);
    // Encoding and array-size validation are given the descriptor only after
    // its type and nullability have been filled in above.
654  validate_and_set_encoding(cd, encoding, column_type);
655  validate_and_set_array_size(cd, column_type);
    // Columns created through this path are never system or virtual columns.
656  cd.isSystemCol = false;
657  cd.isVirtualCol = false;
    // Done last, after the descriptor's type, encoding and size are final.
658  validate_and_set_default_value(cd, default_value, not_null);
659 }
660 
661 void set_default_table_attributes(const std::string& table_name,
662  TableDescriptor& td,
663  const int32_t column_count) {
664  td.tableName = table_name;
665  td.nColumns = column_count;
666  td.isView = false;
667  td.fragmenter = nullptr;
673 }
674 
675 void validate_non_duplicate_column(const std::string& column_name,
676  std::unordered_set<std::string>& upper_column_names) {
677  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
678  const auto insert_it = upper_column_names.insert(upper_column_name);
679  if (!insert_it.second) {
680  throw std::runtime_error("Column '" + column_name + "' defined more than once");
681  }
682 }
683 
684 void validate_non_reserved_keyword(const std::string& column_name) {
685  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
686  if (reserved_keywords.find(upper_column_name) != reserved_keywords.end()) {
687  throw std::runtime_error("Cannot create column with reserved keyword '" +
688  column_name + "'");
689  }
690 }
691 
693  const TableType expected_table_type,
694  const std::string& command) {
695  if (td->isView) {
696  if (expected_table_type != TableType::VIEW) {
697  throw std::runtime_error(td->tableName + " is a view. Use " + command + " VIEW.");
698  }
699  } else if (td->storageType == StorageType::FOREIGN_TABLE) {
700  if (expected_table_type != TableType::FOREIGN_TABLE) {
701  throw std::runtime_error(td->tableName + " is a foreign table. Use " + command +
702  " FOREIGN TABLE.");
703  }
704  } else if (expected_table_type != TableType::TABLE) {
705  throw std::runtime_error(td->tableName + " is a table. Use " + command + " TABLE.");
706  }
707 }
708 
709 std::string table_type_enum_to_string(const TableType table_type) {
710  if (table_type == ddl_utils::TableType::TABLE) {
711  return "Table";
712  }
713  if (table_type == ddl_utils::TableType::FOREIGN_TABLE) {
714  return "ForeignTable";
715  }
716  if (table_type == ddl_utils::TableType::VIEW) {
717  return "View";
718  }
719  throw std::runtime_error{"Unexpected table type"};
720 }
721 
// Builds the error text used when the configuration value for `config_key`
// is not a JSON list of path strings.
std::string get_malformed_config_error_message(const std::string& config_key) {
  std::string message{"Configuration value for \""};
  message += config_key;
  message +=
      "\" is malformed. Value should be a list of paths with format: [ "
      "\"root-path-1\", \"root-path-2\", ... ]";
  return message;
}
727 
728 void validate_expanded_file_path(const std::string& file_path,
729  const std::vector<std::string>& whitelisted_root_paths) {
730  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
731  for (const auto& root_path : whitelisted_root_paths) {
732  if (boost::istarts_with(canonical_file_path.string(), root_path)) {
733  return;
734  }
735  }
736  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
737  throw std::runtime_error{"File or directory path \"" + file_path +
738  "\" is not whitelisted."};
739  }
740  throw std::runtime_error{"File or directory path \"" + file_path +
741  "\" (resolved to \"" + canonical_file_path.string() +
742  "\") is not whitelisted."};
743 }
744 
745 std::vector<std::string> get_expanded_file_paths(
746  const std::string& file_path,
747  const DataTransferType data_transfer_type) {
748  std::vector<std::string> file_paths;
749  if (data_transfer_type == DataTransferType::IMPORT) {
751  file_path, std::nullopt, std::nullopt, std::nullopt);
752  } else {
753  std::string path;
754  if (!boost::filesystem::exists(file_path)) {
755  // For exports, it is possible to provide a path to a new (nonexistent) file. In
756  // this case, validate using the parent path.
757  path = boost::filesystem::path(file_path).parent_path().string();
758  if (!boost::filesystem::exists(path)) {
759  throw std::runtime_error{"File or directory \"" + file_path +
760  "\" does not exist."};
761  }
762  } else {
763  path = file_path;
764  }
765  file_paths = {path};
766  }
767  return file_paths;
768 }
769 
770 void validate_allowed_file_path(const std::string& file_path,
771  const DataTransferType data_transfer_type,
772  const bool allow_wildcards) {
773  // Reject any punctuation characters except for a few safe ones.
774  // Some punctuation characters present a security risk when passed
775  // to subprocesses. Don't change this without a security review.
776  static const std::string safe_punctuation{"./_+-=:~"};
777  for (const auto& ch : file_path) {
778  if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos &&
779  !(allow_wildcards && ch == '*')) {
780  throw std::runtime_error(std::string("Punctuation \"") + ch +
781  "\" is not allowed in file path: " + file_path);
782  }
783  }
784 
785  // Enforce our whitelist and blacklist for file paths.
786  const auto& expanded_file_paths =
787  get_expanded_file_paths(file_path, data_transfer_type);
788  for (const auto& path : expanded_file_paths) {
790  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
791  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
792  throw std::runtime_error{"Access to file or directory path \"" + file_path +
793  "\" is not allowed."};
794  }
795  throw std::runtime_error{"Access to file or directory path \"" + file_path +
796  "\" (resolved to \"" + canonical_file_path.string() +
797  "\") is not allowed."};
798  }
799  }
800  FilePathWhitelist::validateWhitelistedFilePath(expanded_file_paths, data_transfer_type);
801 }
802 
803 void set_whitelisted_paths(const std::string& config_key,
804  const std::string& config_value,
805  std::vector<std::string>& whitelisted_paths) {
806  rapidjson::Document whitelisted_root_paths;
807  whitelisted_root_paths.Parse(config_value);
808  if (!whitelisted_root_paths.IsArray()) {
809  throw std::runtime_error{get_malformed_config_error_message(config_key)};
810  }
811  for (const auto& root_path : whitelisted_root_paths.GetArray()) {
812  if (!root_path.IsString()) {
813  throw std::runtime_error{get_malformed_config_error_message(config_key)};
814  }
815  if (!boost::filesystem::exists(root_path.GetString())) {
816  throw std::runtime_error{"Whitelisted root path \"" +
817  std::string{root_path.GetString()} + "\" does not exist."};
818  }
819  whitelisted_paths.emplace_back(
820  boost::filesystem::canonical(root_path.GetString()).string());
821  }
822  LOG(INFO) << "Parsed " << config_key << ": "
823  << shared::printContainer(whitelisted_paths);
824 }
825 
826 void FilePathWhitelist::initialize(const std::string& data_dir,
827  const std::string& allowed_import_paths,
828  const std::string& allowed_export_paths) {
829  CHECK(!data_dir.empty());
830  CHECK(boost::filesystem::is_directory(data_dir));
831 
832  auto data_dir_path = boost::filesystem::canonical(data_dir);
834  whitelisted_import_paths_.emplace_back((data_dir_path / "mapd_import").string());
835 
837  whitelisted_export_paths_.emplace_back((data_dir_path / "mapd_export").string());
838 
839  if (!allowed_import_paths.empty()) {
841  "allowed-import-paths", allowed_import_paths, whitelisted_import_paths_);
842  }
843  if (!allowed_export_paths.empty()) {
845  "allowed-export-paths", allowed_export_paths, whitelisted_export_paths_);
846  }
847 }
848 
850  const std::vector<std::string>& expanded_file_paths,
851  const DataTransferType data_transfer_type) {
852  for (const auto& path : expanded_file_paths) {
853  if (data_transfer_type == DataTransferType::IMPORT) {
855  } else if (data_transfer_type == DataTransferType::EXPORT) {
857  } else {
858  UNREACHABLE();
859  }
860  }
861 }
862 
866 }
867 
868 std::vector<std::string> FilePathWhitelist::whitelisted_import_paths_{};
869 std::vector<std::string> FilePathWhitelist::whitelisted_export_paths_{};
870 
871 void FilePathBlacklist::addToBlacklist(const std::string& path) {
872  CHECK(!path.empty());
873  blacklisted_paths_.emplace_back(path);
874 }
875 
876 bool FilePathBlacklist::isBlacklistedPath(const std::string& path) {
877  const auto canonical_path = boost::filesystem::canonical(path).string();
878  for (const auto& blacklisted_path : blacklisted_paths_) {
879  std::string full_path;
880  try {
881  full_path = boost::filesystem::canonical(blacklisted_path).string();
882  } catch (...) {
889  full_path = boost::filesystem::absolute(blacklisted_path).string();
890  }
891  if (boost::istarts_with(canonical_path, full_path)) {
892  return true;
893  }
894  }
895  return false;
896 }
897 
899  blacklisted_paths_.clear();
900 }
901 
902 std::vector<std::string> FilePathBlacklist::blacklisted_paths_{};
903 } // namespace ddl_utils
static std::set< std::string > reserved_keywords
DataTransferType
Definition: DdlUtils.h:80
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:330
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const std::optional< std::string > &filter_regex, const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
void set_compression(EncodingType c)
Definition: sqltypes.h:429
void set_size(int s)
Definition: sqltypes.h:427
static std::unique_ptr< GeoBase > createGeoType(const std::string &wkt_or_wkb_hex)
Definition: Types.cpp:919
void validate_and_set_sparse_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:375
std::vector< std::string > get_expanded_file_paths(const std::string &file_path, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:745
static std::vector< std::string > whitelisted_export_paths_
Definition: DdlUtils.h:94
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
void validate_literal(const std::string &val, SQLTypeInfo column_type, const std::string &column_name)
Definition: DdlUtils.cpp:513
shared utility for globbing files, paths can be specified as either a single file, directory or wildcards
Definition: sqltypes.h:49
SQLTypes
Definition: sqltypes.h:38
std::string tableName
SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
Definition: DdlUtils.cpp:37
void validate_and_set_array_size(ColumnDescriptor &cd, const SqlType *column_type)
Definition: DdlUtils.cpp:486
virtual void check_type()
Definition: DdlUtils.cpp:152
static void initialize(const std::string &data_dir, const std::string &allowed_import_paths, const std::string &allowed_export_paths)
Definition: DdlUtils.cpp:826
void validate_and_set_dictionary_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:338
#define LOG(tag)
Definition: Logger.h:203
std::string storageType
#define DEFAULT_MAX_CHUNK_SIZE
#define UNREACHABLE()
Definition: Logger.h:253
HOST DEVICE void set_subtype(SQLTypes st)
Definition: sqltypes.h:420
virtual int get_encoding_param() const
Definition: DdlUtils.cpp:208
Constants for Builtin SQL Types supported by OmniSci.
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:329
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
void validate_non_duplicate_column(const std::string &column_name, std::unordered_set< std::string > &upper_column_names)
Definition: DdlUtils.cpp:675
constexpr double a
Definition: Utm.h:38
void set_column_descriptor(const std::string &column_name, ColumnDescriptor &cd, SqlType *column_type, const bool not_null, const Encoding *encoding, const std::string *default_value)
Definition: DdlUtils.cpp:645
void validate_and_set_none_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:364
void set_input_srid(int d)
Definition: sqltypes.h:423
void validate_and_set_encoding(ColumnDescriptor &cd, const Encoding *encoding, const SqlType *column_type)
Definition: DdlUtils.cpp:431
bool g_use_date_in_days_default_encoding
Definition: DdlUtils.cpp:34
#define DEFAULT_MAX_ROWS
static std::vector< std::string > whitelisted_import_paths_
Definition: DdlUtils.h:93
static void validateWhitelistedFilePath(const std::vector< std::string > &expanded_file_paths, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:849
void set_fixed_size()
Definition: sqltypes.h:428
void set_default_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:212
void set_scale(int s)
Definition: sqltypes.h:424
SQLTypes type
Definition: DdlUtils.h:54
void validate(T value)
Definition: Encoder.h:122
virtual SQLTypes get_type() const
Definition: DdlUtils.cpp:44
void validate_expanded_file_path(const std::string &file_path, const std::vector< std::string > &whitelisted_root_paths)
Definition: DdlUtils.cpp:728
virtual std::string to_string() const
Definition: DdlUtils.cpp:76
void validate_non_reserved_keyword(const std::string &column_name)
Definition: DdlUtils.cpp:684
Datum StringToDatum(std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:275
specifies the content in-memory of a row in the column metadata table
void set_default_table_attributes(const std::string &table_name, TableDescriptor &td, const int32_t column_count)
Definition: DdlUtils.cpp:661
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
int get_precision() const
Definition: sqltypes.h:332
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:770
void set_output_srid(int s)
Definition: sqltypes.h:425
std::string to_upper(const std::string &str)
#define DEFAULT_PAGE_SIZE
void set_comp_param(int p)
Definition: sqltypes.h:430
void validate_and_set_compressed_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:392
std::optional< std::string > default_value
Definition: sqltypes.h:52
Definition: sqltypes.h:53
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:337
virtual const std::string * get_encoding_name() const
Definition: DdlUtils.cpp:204
int64_t convert_decimal_value_to_scale(const int64_t decimal_value, const SQLTypeInfo &type_info, const SQLTypeInfo &new_type_info)
Definition: Datum.cpp:473
std::string get_malformed_config_error_message(const std::string &config_key)
Definition: DdlUtils.cpp:722
void set_dimension(int d)
Definition: sqltypes.h:421
#define DEFAULT_FRAGMENT_ROWS
void validate_and_set_fixed_encoding(ColumnDescriptor &cd, int encoding_size, const SqlType *column_type)
Definition: DdlUtils.cpp:240
std::string table_type_enum_to_string(const TableType table_type)
Definition: DdlUtils.cpp:709
Fragmenter_Namespace::FragmenterType fragType
Encoding(std::string *encoding_name, int encoding_param)
Definition: DdlUtils.cpp:201
#define IS_INTEGER(T)
Definition: sqltypes.h:245
void set_whitelisted_paths(const std::string &config_key, const std::string &config_value, std::vector< std::string > &whitelisted_paths)
Definition: DdlUtils.cpp:803
Definition: sqltypes.h:41
virtual void set_param1(int param)
Definition: DdlUtils.cpp:52
static bool isBlacklistedPath(const std::string &path)
Definition: DdlUtils.cpp:876
void validate_table_type(const TableDescriptor *td, const TableType expected_table_type, const std::string &command)
Definition: DdlUtils.cpp:692
void set_notnull(bool n)
Definition: sqltypes.h:426
#define CHECK(condition)
Definition: Logger.h:209
bool is_geometry() const
Definition: sqltypes.h:521
void validate_and_set_default_value(ColumnDescriptor &cd, const std::string *default_value, bool not_null)
Definition: DdlUtils.cpp:624
bool is_high_precision_timestamp() const
Definition: sqltypes.h:870
void validate_and_set_date_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:412
static constexpr size_t MAX_STRLEN
static void addToBlacklist(const std::string &path)
Definition: DdlUtils.cpp:871
Definition: sqltypes.h:45
std::unique_ptr< std::string > encoding_name
Definition: DdlUtils.h:76
SQLTypeInfo columnType
virtual void set_is_array(bool a)
Definition: DdlUtils.cpp:64
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:105
bool is_string() const
Definition: sqltypes.h:509
virtual int get_param1() const
Definition: DdlUtils.cpp:48
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:336
static constexpr char const * FOREIGN_TABLE
bool is_string_array() const
Definition: sqltypes.h:510
void validate(T value) const
Definition: Encoder.h:54
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:850
bool is_decimal() const
Definition: sqltypes.h:512
virtual int get_param2() const
Definition: DdlUtils.cpp:56
std::string columnName
#define IS_GEO(T)
Definition: sqltypes.h:251
virtual bool get_is_array() const
Definition: DdlUtils.cpp:60
HOST DEVICE int get_output_srid() const
Definition: sqltypes.h:335
virtual void set_array_size(int s)
Definition: DdlUtils.cpp:72
constexpr auto is_datetime(SQLTypes type)
Definition: sqltypes.h:263
static std::vector< std::string > blacklisted_paths_
Definition: DdlUtils.h:104
virtual int get_array_size() const
Definition: DdlUtils.cpp:68
void validate_and_set_type(ColumnDescriptor &cd, SqlType *column_type)
Definition: DdlUtils.cpp:467
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:419