OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DdlUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DdlUtils.h"
18 
19 #include <unordered_set>
20 
21 #include <boost/algorithm/string.hpp>
22 #include <boost/filesystem.hpp>
23 #include <boost/program_options.hpp>
24 
25 #include "rapidjson/document.h"
26 
28 #include "Geospatial/Types.h"
30 #include "Shared/SysDefinitions.h"
31 #include "Shared/file_path_util.h"
32 #include "Shared/misc.h"
33 #include "Shared/sqltypes.h"
34 
36 
37 namespace ddl_utils {
38 SqlType::SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
39  : type(type)
40  , param1(param1)
41  , param2(param2)
42  , is_array(is_array)
43  , array_size(array_size) {}
44 
46  return type;
47 }
48 
49 int SqlType::get_param1() const {
50  return param1;
51 }
52 
53 void SqlType::set_param1(int param) {
54  param1 = param;
55 }
56 
57 int SqlType::get_param2() const {
58  return param2;
59 }
60 
61 bool SqlType::get_is_array() const {
62  return is_array;
63 }
64 
66  is_array = a;
67 }
68 
70  return array_size;
71 }
72 
74  array_size = s;
75 }
76 
77 std::string SqlType::to_string() const {
78  std::string str;
79  switch (type) {
80  case kBOOLEAN:
81  str = "BOOLEAN";
82  break;
83  case kCHAR:
84  str = "CHAR(" + boost::lexical_cast<std::string>(param1) + ")";
85  break;
86  case kVARCHAR:
87  str = "VARCHAR(" + boost::lexical_cast<std::string>(param1) + ")";
88  break;
89  case kTEXT:
90  str = "TEXT";
91  break;
92  case kNUMERIC:
93  str = "NUMERIC(" + boost::lexical_cast<std::string>(param1);
94  if (param2 > 0) {
95  str += ", " + boost::lexical_cast<std::string>(param2);
96  }
97  str += ")";
98  break;
99  case kDECIMAL:
100  str = "DECIMAL(" + boost::lexical_cast<std::string>(param1);
101  if (param2 > 0) {
102  str += ", " + boost::lexical_cast<std::string>(param2);
103  }
104  str += ")";
105  break;
106  case kBIGINT:
107  str = "BIGINT";
108  break;
109  case kINT:
110  str = "INT";
111  break;
112  case kTINYINT:
113  str = "TINYINT";
114  break;
115  case kSMALLINT:
116  str = "SMALLINT";
117  break;
118  case kFLOAT:
119  str = "FLOAT";
120  break;
121  case kDOUBLE:
122  str = "DOUBLE";
123  break;
124  case kTIME:
125  str = "TIME";
126  if (param1 < 6) {
127  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
128  }
129  break;
130  case kTIMESTAMP:
131  str = "TIMESTAMP";
132  if (param1 <= 9) {
133  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
134  }
135  break;
136  case kDATE:
137  str = "DATE";
138  break;
139  default:
140  assert(false);
141  break;
142  }
143  if (is_array) {
144  str += "[";
145  if (array_size > 0) {
146  str += boost::lexical_cast<std::string>(array_size);
147  }
148  str += "]";
149  }
150  return str;
151 }
152 
154  switch (type) {
155  case kCHAR:
156  case kVARCHAR:
157  if (param1 <= 0) {
158  throw std::runtime_error("CHAR and VARCHAR must have a positive dimension.");
159  }
160  break;
161  case kDECIMAL:
162  case kNUMERIC:
163  if (param1 <= 0) {
164  throw std::runtime_error("DECIMAL and NUMERIC must have a positive precision.");
166  throw std::runtime_error("DECIMAL and NUMERIC precision cannot be larger than " +
168  ".");
169  } else if (param1 <= param2) {
170  throw std::runtime_error(
171  "DECIMAL and NUMERIC must have precision larger than scale.");
172  }
173  break;
174  case kTIMESTAMP:
175  if (param1 == -1) {
176  param1 = 0; // set default to 0
177  } else if (param1 != 0 && param1 != 3 && param1 != 6 &&
178  param1 != 9) { // support ms, us, ns
179  throw std::runtime_error(
180  "Only TIMESTAMP(n) where n = (0,3,6,9) are supported now.");
181  }
182  break;
183  case kTIME:
184  if (param1 == -1) {
185  param1 = 0; // default precision is 0
186  }
187  if (param1 > 0) { // @TODO(wei) support sub-second precision later.
188  throw std::runtime_error("Only TIME(0) is supported now.");
189  }
190  break;
191  case kPOINT:
192  case kMULTIPOINT:
193  case kLINESTRING:
194  case kMULTILINESTRING:
195  case kPOLYGON:
196  case kMULTIPOLYGON:
197  // Storing SRID in param1
198  break;
199  default:
200  param1 = 0;
201  break;
202  }
203 }
204 
205 Encoding::Encoding(std::string* encoding_name, int encoding_param)
206  : encoding_name(encoding_name), encoding_param(encoding_param) {}
207 
208 const std::string* Encoding::get_encoding_name() const {
209  return encoding_name.get();
210 }
211 
213  return encoding_param;
214 }
215 
217  // Change default TEXT column behaviour to be DICT encoded
218  if (cd.columnType.is_string() || cd.columnType.is_string_array()) {
219  // default to 32-bits
221  cd.columnType.set_comp_param(32);
222  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 4) {
224  cd.columnType.set_comp_param(16);
225  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 9) {
227  cd.columnType.set_comp_param(32);
228  } else if (cd.columnType.is_decimal() &&
230  throw std::runtime_error(cd.columnName + ": Precision too high, max " +
232  } else if (cd.columnType.is_geometry() && cd.columnType.get_output_srid() == 4326) {
233  // default to GEOINT 32-bits
235  cd.columnType.set_comp_param(32);
237  // Days encoding for DATE
240  } else {
243  }
244 }
245 
247  int encoding_size,
248  const SqlType* column_type) {
249  auto type = cd.columnType.get_type();
250  // fixed-bits encoding
251  if (type == kARRAY) {
252  type = cd.columnType.get_subtype();
253  switch (type) {
254  case kTINYINT:
255  case kSMALLINT:
256  case kINT:
257  case kBIGINT:
258  case kDATE:
259  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
260  column_type->to_string() + " type array.");
261  break;
262  default:
263  break;
264  }
265  }
266 
267  if (!IS_INTEGER(type) && !is_datetime(type) &&
268  !(type == kDECIMAL || type == kNUMERIC)) {
269  throw std::runtime_error(
270  cd.columnName +
271  ": Fixed encoding is only supported for integer or time columns.");
272  }
273 
274  switch (type) {
275  case kSMALLINT:
276  if (encoding_size != 8) {
277  throw std::runtime_error(
278  cd.columnName +
279  ": Compression parameter for Fixed encoding on SMALLINT must be 8.");
280  }
281  break;
282  case kINT:
283  if (encoding_size != 8 && encoding_size != 16) {
284  throw std::runtime_error(
285  cd.columnName +
286  ": Compression parameter for Fixed encoding on INTEGER must be 8 or 16.");
287  }
288  break;
289  case kBIGINT:
290  if (encoding_size != 8 && encoding_size != 16 && encoding_size != 32) {
291  throw std::runtime_error(cd.columnName +
292  ": Compression parameter for Fixed encoding on "
293  "BIGINT must be 8 or 16 or 32.");
294  }
295  break;
296  case kTIMESTAMP:
297  case kTIME:
298  if (encoding_size != 32) {
299  throw std::runtime_error(cd.columnName +
300  ": Compression parameter for Fixed encoding on "
301  "TIME or TIMESTAMP must be 32.");
302  } else if (cd.columnType.is_high_precision_timestamp()) {
303  throw std::runtime_error("Fixed encoding is not supported for TIMESTAMP(3|6|9).");
304  }
305  break;
306  case kDECIMAL:
307  case kNUMERIC:
308  if (encoding_size != 32 && encoding_size != 16) {
309  throw std::runtime_error(cd.columnName +
310  ": Compression parameter for Fixed encoding on "
311  "DECIMAL must be 16 or 32.");
312  }
313 
314  if (encoding_size == 32 && cd.columnType.get_precision() > 9) {
315  throw std::runtime_error(cd.columnName +
316  ": Precision too high for Fixed(32) encoding, max 9.");
317  }
318 
319  if (encoding_size == 16 && cd.columnType.get_precision() > 4) {
320  throw std::runtime_error(cd.columnName +
321  ": Precision too high for Fixed(16) encoding, max 4.");
322  }
323  break;
324  case kDATE:
325  if (encoding_size != 32 && encoding_size != 16) {
326  throw std::runtime_error(cd.columnName +
327  ": Compression parameter for Fixed encoding on "
328  "DATE must be 16 or 32.");
329  }
330  break;
331  default:
332  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
333  column_type->to_string());
334  }
335  if (type == kDATE) {
337  cd.columnType.set_comp_param(16);
338  } else {
340  cd.columnType.set_comp_param(encoding_size);
341  }
342 }
343 
345  if (!cd.columnType.is_string() && !cd.columnType.is_string_array()) {
346  throw std::runtime_error(
347  cd.columnName +
348  ": Dictionary encoding is only supported on string or string array columns.");
349  }
350  int comp_param;
351  if (encoding_size == 0) {
352  comp_param = 32; // default to 32-bits
353  } else {
354  comp_param = encoding_size;
355  }
356  if (cd.columnType.is_string_array() && comp_param != 32) {
357  throw std::runtime_error(cd.columnName +
358  ": Compression parameter for string arrays must be 32");
359  }
360  if (comp_param != 8 && comp_param != 16 && comp_param != 32) {
361  throw std::runtime_error(
362  cd.columnName +
363  ": Compression parameter for Dictionary encoding must be 8 or 16 or 32.");
364  }
365  // dictionary encoding
367  cd.columnType.set_comp_param(comp_param);
368 }
369 
371  if (!cd.columnType.is_string() && !cd.columnType.is_string_array() &&
372  !cd.columnType.is_geometry()) {
373  throw std::runtime_error(
374  cd.columnName +
375  ": None encoding is only supported on string, string array, or geo columns.");
376  }
379 }
380 
382  // sparse column encoding with mostly NULL values
383  if (cd.columnType.get_notnull()) {
384  throw std::runtime_error(cd.columnName +
385  ": Cannot do sparse column encoding on a NOT NULL column.");
386  }
387  if (encoding_size == 0 || encoding_size % 8 != 0 || encoding_size > 48) {
388  throw std::runtime_error(
389  cd.columnName +
390  "Must specify number of bits as 8, 16, 24, 32 or 48 as the parameter to "
391  "sparse-column encoding.");
392  }
394  cd.columnType.set_comp_param(encoding_size);
395  // throw std::runtime_error("SPARSE encoding not supported yet.");
396 }
397 
399  if (!cd.columnType.is_geometry() || cd.columnType.get_output_srid() != 4326) {
400  throw std::runtime_error(
401  cd.columnName + ": COMPRESSED encoding is only supported on WGS84 geo columns.");
402  }
403  int comp_param;
404  if (encoding_size == 0) {
405  comp_param = 32; // default to 32-bits
406  } else {
407  comp_param = encoding_size;
408  }
409  if (comp_param != 32) {
410  throw std::runtime_error(cd.columnName +
411  ": only 32-bit COMPRESSED geo encoding is supported");
412  }
413  // encoding longitude/latitude as integers
415  cd.columnType.set_comp_param(comp_param);
416 }
417 
418 void validate_and_set_date_encoding(ColumnDescriptor& cd, int encoding_size) {
419  // days encoding for dates
420  if (cd.columnType.get_type() == kARRAY && cd.columnType.get_subtype() == kDATE) {
421  throw std::runtime_error(cd.columnName +
422  ": Cannot apply days encoding to date array.");
423  }
424  if (cd.columnType.get_type() != kDATE) {
425  throw std::runtime_error(cd.columnName +
426  ": Days encoding is only supported for DATE columns.");
427  }
428  if (encoding_size != 32 && encoding_size != 16) {
429  throw std::runtime_error(cd.columnName +
430  ": Compression parameter for Days encoding on "
431  "DATE must be 16 or 32.");
432  }
434  cd.columnType.set_comp_param((encoding_size == 16) ? 16 : 0);
435 }
436 
438  const Encoding* encoding,
439  const SqlType* column_type) {
440  if (encoding == nullptr) {
442  } else {
443  const std::string& comp = *encoding->get_encoding_name();
444  if (boost::iequals(comp, "fixed")) {
445  validate_and_set_fixed_encoding(cd, encoding->get_encoding_param(), column_type);
446  } else if (boost::iequals(comp, "rl")) {
447  // run length encoding
450  // throw std::runtime_error("RL(Run Length) encoding not supported yet.");
451  } else if (boost::iequals(comp, "diff")) {
452  // differential encoding
455  // throw std::runtime_error("DIFF(differential) encoding not supported yet.");
456  } else if (boost::iequals(comp, "dict")) {
458  } else if (boost::iequals(comp, "NONE")) {
460  } else if (boost::iequals(comp, "sparse")) {
462  } else if (boost::iequals(comp, "compressed")) {
464  } else if (boost::iequals(comp, "days")) {
466  } else {
467  throw std::runtime_error(cd.columnName + ": Invalid column compression scheme " +
468  comp);
469  }
470  }
471 }
472 
474  column_type->check_type();
475 
476  if (column_type->get_type() == kGEOMETRY) {
477  throw std::runtime_error("Unsupported type \"GEOMETRY\" specified.");
478  }
479 
480  if (column_type->get_is_array()) {
482  cd.columnType.set_subtype(column_type->get_type());
483  } else {
484  cd.columnType.set_type(column_type->get_type());
485  }
486  if (IS_GEO(column_type->get_type())) {
487  cd.columnType.set_subtype(static_cast<SQLTypes>(column_type->get_param1()));
488  cd.columnType.set_input_srid(column_type->get_param2());
489  cd.columnType.set_output_srid(column_type->get_param2());
490  } else {
491  cd.columnType.set_dimension(column_type->get_param1());
492  cd.columnType.set_scale(column_type->get_param2());
493  }
494 }
495 
496 void validate_and_set_array_size(ColumnDescriptor& cd, const SqlType* column_type) {
497  if (cd.columnType.is_string_array() &&
499  throw std::runtime_error(
500  cd.columnName +
501  ": Array of strings must be dictionary encoded. Specify ENCODING DICT");
502  }
503 
504  if (column_type->get_is_array()) {
505  int s = -1;
506  auto array_size = column_type->get_array_size();
507  if (array_size > 0) {
508  auto sti = cd.columnType.get_elem_type();
509  s = array_size * sti.get_size();
510  if (s <= 0) {
511  throw std::runtime_error(cd.columnName + ": Unexpected fixed length array size");
512  }
513  }
514  cd.columnType.set_size(s);
515 
516  } else {
518  }
519 }
520 
521 namespace {
522 
523 void validate_literal(const std::string& val,
524  SQLTypeInfo column_type,
525  const std::string& column_name) {
526  if (to_upper(val) == "NULL") {
527  return;
528  }
529  switch (column_type.get_type()) {
530  case kBOOLEAN:
531  case kTINYINT:
532  case kSMALLINT:
533  case kINT:
534  case kBIGINT:
535  case kFLOAT:
536  case kDOUBLE:
537  case kTIME:
538  case kTIMESTAMP:
539  StringToDatum(val, column_type);
540  break;
541  case kDATE: {
542  auto d = StringToDatum(val, column_type);
543  DateDaysOverflowValidator validator(column_type);
544  validator.validate(d.bigintval);
545  break;
546  }
547  case kDECIMAL:
548  case kNUMERIC: {
549  SQLTypeInfo ti(kNUMERIC, 0, 0, false);
550  auto d = StringToDatum(val, ti);
551  auto converted_val = convert_decimal_value_to_scale(d.bigintval, ti, column_type);
552  DecimalOverflowValidator validator(column_type);
553  validator.validate(converted_val);
554  break;
555  }
556  case kTEXT:
557  case kVARCHAR:
558  case kCHAR:
559  if (val.length() > StringDictionary::MAX_STRLEN) {
560  throw std::runtime_error("String too long for column " + column_name + " was " +
561  std::to_string(val.length()) + " max is " +
563  }
564  break;
565  case kARRAY: {
566  if (val.front() != '{' || val.back() != '}') {
567  throw std::runtime_error(column_name +
568  ": arrays should start and end with curly braces");
569  }
570  std::vector<std::string> elements = split(val.substr(1, val.length() - 2), ", ");
571  if (column_type.get_size() > 0) {
572  auto sti = column_type.get_elem_type();
573  size_t expected_size = column_type.get_size() / sti.get_size();
574  size_t actual_size = elements.size();
575  if (actual_size != expected_size) {
576  throw std::runtime_error("Fixed length array column " + column_name +
577  " expects " + std::to_string(expected_size) +
578  " values, received " + std::to_string(actual_size));
579  }
580  }
581  SQLTypeInfo element_ti = column_type.get_elem_type();
582  for (const auto& element : elements) {
583  if (to_upper(element) != "NULL") {
584  validate_literal(element, element_ti, column_name);
585  }
586  }
587  break;
588  }
589  case kPOINT:
590  case kMULTIPOINT:
591  case kLINESTRING:
592  case kMULTILINESTRING:
593  case kPOLYGON:
594  case kMULTIPOLYGON:
595  if (val.empty()) {
596  return;
597  }
598  try {
600  if (!geo) {
601  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
602  column_name);
603  }
604  if (!geo->transform(column_type)) {
605  throw std::runtime_error("Cannot transform SRID for literal '" + val +
606  "' for column " + column_name);
607  } else {
608  auto sql_type = column_type.get_type();
609  auto geo_type = geo->getType();
610  if ((geo_type == Geospatial::GeoBase::GeoType::kPOINT && sql_type != kPOINT) ||
612  sql_type != kMULTIPOINT) ||
614  sql_type != kLINESTRING) ||
616  sql_type != kMULTILINESTRING) ||
618  sql_type != kPOLYGON) ||
620  sql_type != kMULTIPOLYGON)) {
621  throw std::runtime_error("Geo literal '" + val +
622  "' doesn't match the type "
623  "of column column " +
624  column_name);
625  }
626  }
627  } catch (Geospatial::GeoTypesError& e) {
628  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
629  column_name + ": " + e.what());
630  }
631  break;
632  default:
633  CHECK(false) << "validate_literal() does not support type "
634  << column_type.get_type();
635  }
636 }
637 
638 } // namespace
639 
641  const std::string* default_value,
642  bool not_null) {
643  bool is_null_literal =
644  default_value && ((to_upper(*default_value) == "NULL") ||
645  (cd.columnType.is_geometry() && default_value->empty()));
646  if (not_null && (is_null_literal)) {
647  throw std::runtime_error(cd.columnName +
648  ": cannot set default value to NULL for "
649  "NOT NULL column");
650  }
651  if (!default_value || is_null_literal) {
652  cd.default_value = std::nullopt;
653  return;
654  }
655  const auto& column_type = cd.columnType;
656  const auto& val = *default_value;
657  validate_literal(val, column_type, cd.columnName);
658  cd.default_value = std::make_optional(*default_value);
659 }
660 
661 void set_column_descriptor(const std::string& column_name,
662  ColumnDescriptor& cd,
663  SqlType* column_type,
664  const bool not_null,
665  const Encoding* encoding,
666  const std::string* default_value) {
667  cd.columnName = column_name;
668  validate_and_set_type(cd, column_type);
669  cd.columnType.set_notnull(not_null);
670  validate_and_set_encoding(cd, encoding, column_type);
671  validate_and_set_array_size(cd, column_type);
672  cd.isSystemCol = false;
673  cd.isVirtualCol = false;
674  validate_and_set_default_value(cd, default_value, not_null);
675 }
676 
677 void set_default_table_attributes(const std::string& table_name,
678  TableDescriptor& td,
679  const int32_t column_count) {
680  td.tableName = table_name;
681  td.nColumns = column_count;
682  td.isView = false;
683  td.fragmenter = nullptr;
689 }
690 
691 void validate_non_duplicate_column(const std::string& column_name,
692  std::unordered_set<std::string>& upper_column_names) {
693  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
694  const auto insert_it = upper_column_names.insert(upper_column_name);
695  if (!insert_it.second) {
696  throw std::runtime_error("Column '" + column_name + "' defined more than once");
697  }
698 }
699 
700 void validate_non_reserved_keyword(const std::string& column_name) {
701  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
702  if (reserved_keywords.find(upper_column_name) != reserved_keywords.end()) {
703  throw std::runtime_error("Cannot create column with reserved keyword '" +
704  column_name + "'");
705  }
706 }
707 
709  const TableType expected_table_type,
710  const std::string& command) {
711  if (td->isView) {
712  if (expected_table_type != TableType::VIEW) {
713  throw std::runtime_error(td->tableName + " is a view. Use " + command + " VIEW.");
714  }
715  } else if (td->storageType == StorageType::FOREIGN_TABLE) {
716  if (expected_table_type != TableType::FOREIGN_TABLE) {
717  throw std::runtime_error(td->tableName + " is a foreign table. Use " + command +
718  " FOREIGN TABLE.");
719  }
720  } else if (expected_table_type != TableType::TABLE) {
721  throw std::runtime_error(td->tableName + " is a table. Use " + command + " TABLE.");
722  }
723 }
724 
725 std::string table_type_enum_to_string(const TableType table_type) {
726  if (table_type == ddl_utils::TableType::TABLE) {
727  return "Table";
728  }
729  if (table_type == ddl_utils::TableType::FOREIGN_TABLE) {
730  return "ForeignTable";
731  }
732  if (table_type == ddl_utils::TableType::VIEW) {
733  return "View";
734  }
735  throw std::runtime_error{"Unexpected table type"};
736 }
737 
// Builds the error text reported when the configuration value stored under
// `config_key` is not a JSON-style list of path strings.
std::string get_malformed_config_error_message(const std::string& config_key) {
  std::string message{"Configuration value for \""};
  message += config_key;
  message += "\" is malformed. Value should be a list of paths with format: [ ";
  message += "\"root-path-1\", \"root-path-2\", ... ]";
  return message;
}
743 
744 void validate_expanded_file_path(const std::string& file_path,
745  const std::vector<std::string>& whitelisted_root_paths) {
746  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
747  for (const auto& root_path : whitelisted_root_paths) {
748  if (boost::istarts_with(canonical_file_path.string(), root_path)) {
749  return;
750  }
751  }
752  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
753  throw std::runtime_error{"File or directory path \"" + file_path +
754  "\" is not whitelisted."};
755  }
756  throw std::runtime_error{"File or directory path \"" + file_path +
757  "\" (resolved to \"" + canonical_file_path.string() +
758  "\") is not whitelisted."};
759 }
760 
761 std::vector<std::string> get_expanded_file_paths(
762  const std::string& file_path,
763  const DataTransferType data_transfer_type) {
764  std::vector<std::string> file_paths;
765  if (data_transfer_type == DataTransferType::IMPORT) {
766  file_paths = shared::local_glob_filter_sort_files(file_path, {});
767  } else {
768  std::string path;
769  if (!boost::filesystem::exists(file_path)) {
770  // For exports, it is possible to provide a path to a new (nonexistent) file. In
771  // this case, validate using the parent path.
772  path = boost::filesystem::path(file_path).parent_path().string();
773  if (!boost::filesystem::exists(path)) {
774  throw std::runtime_error{"File or directory \"" + file_path +
775  "\" does not exist."};
776  }
777  } else {
778  path = file_path;
779  }
780  file_paths = {path};
781  }
782  return file_paths;
783 }
784 
785 void validate_allowed_file_path(const std::string& file_path,
786  const DataTransferType data_transfer_type,
787  const bool allow_wildcards) {
788  // Reject any punctuation characters except for a few safe ones.
789  // Some punctuation characters present a security risk when passed
790  // to subprocesses. Don't change this without a security review.
791  static const std::string safe_punctuation{"./_+-=:~"};
792  for (const auto& ch : file_path) {
793  if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos &&
794  !(allow_wildcards && ch == '*')) {
795  throw std::runtime_error(std::string("Punctuation \"") + ch +
796  "\" is not allowed in file path: " + file_path);
797  }
798  }
799 
800  // Enforce our whitelist and blacklist for file paths.
801  const auto& expanded_file_paths =
802  get_expanded_file_paths(file_path, data_transfer_type);
803  for (const auto& path : expanded_file_paths) {
805  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
806  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
807  throw std::runtime_error{"Access to file or directory path \"" + file_path +
808  "\" is not allowed."};
809  }
810  throw std::runtime_error{"Access to file or directory path \"" + file_path +
811  "\" (resolved to \"" + canonical_file_path.string() +
812  "\") is not allowed."};
813  }
814  }
815  FilePathWhitelist::validateWhitelistedFilePath(expanded_file_paths, data_transfer_type);
816 }
817 
818 void set_whitelisted_paths(const std::string& config_key,
819  const std::string& config_value,
820  std::vector<std::string>& whitelisted_paths) {
821  rapidjson::Document whitelisted_root_paths;
822  whitelisted_root_paths.Parse(config_value);
823  if (!whitelisted_root_paths.IsArray()) {
824  throw std::runtime_error{get_malformed_config_error_message(config_key)};
825  }
826  for (const auto& root_path : whitelisted_root_paths.GetArray()) {
827  if (!root_path.IsString()) {
828  throw std::runtime_error{get_malformed_config_error_message(config_key)};
829  }
830  if (!boost::filesystem::exists(root_path.GetString())) {
831  throw std::runtime_error{"Whitelisted root path \"" +
832  std::string{root_path.GetString()} + "\" does not exist."};
833  }
834  whitelisted_paths.emplace_back(
835  boost::filesystem::canonical(root_path.GetString()).string());
836  }
837  LOG(INFO) << "Parsed " << config_key << ": "
838  << shared::printContainer(whitelisted_paths);
839 }
840 
841 void FilePathWhitelist::initialize(const std::string& data_dir,
842  const std::string& allowed_import_paths,
843  const std::string& allowed_export_paths) {
844  CHECK(!data_dir.empty());
845  CHECK(boost::filesystem::is_directory(data_dir));
846 
847  auto data_dir_path = boost::filesystem::canonical(data_dir);
849  whitelisted_import_paths_.emplace_back(
850  (data_dir_path / shared::kDefaultImportDirName).string());
851 
853  whitelisted_export_paths_.emplace_back(
854  (data_dir_path / shared::kDefaultExportDirName).string());
855 
856  if (!allowed_import_paths.empty()) {
858  "allowed-import-paths", allowed_import_paths, whitelisted_import_paths_);
859  }
860  if (!allowed_export_paths.empty()) {
862  "allowed-export-paths", allowed_export_paths, whitelisted_export_paths_);
863  }
864 }
865 
867  const std::vector<std::string>& expanded_file_paths,
868  const DataTransferType data_transfer_type) {
869  for (const auto& path : expanded_file_paths) {
870  if (data_transfer_type == DataTransferType::IMPORT) {
872  } else if (data_transfer_type == DataTransferType::EXPORT) {
874  } else {
875  UNREACHABLE();
876  }
877  }
878 }
879 
883 }
884 
885 std::vector<std::string> FilePathWhitelist::whitelisted_import_paths_{};
886 std::vector<std::string> FilePathWhitelist::whitelisted_export_paths_{};
887 
888 void FilePathBlacklist::addToBlacklist(const std::string& path) {
889  CHECK(!path.empty());
890  blacklisted_paths_.emplace_back(path);
891 }
892 
893 bool FilePathBlacklist::isBlacklistedPath(const std::string& path) {
894  const auto canonical_path = boost::filesystem::canonical(path).string();
895  for (const auto& blacklisted_path : blacklisted_paths_) {
896  std::string full_path;
897  try {
898  full_path = boost::filesystem::canonical(blacklisted_path).string();
899  } catch (...) {
906  full_path = boost::filesystem::absolute(blacklisted_path).string();
907  }
908  if (boost::istarts_with(canonical_path, full_path)) {
909  return true;
910  }
911  }
912  return false;
913 }
914 
916  blacklisted_paths_.clear();
917 }
918 
919 std::vector<std::string> FilePathBlacklist::blacklisted_paths_{};
920 } // namespace ddl_utils
static std::set< std::string > reserved_keywords
DataTransferType
Definition: DdlUtils.h:80
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:382
void set_compression(EncodingType c)
Definition: sqltypes.h:504
void set_size(int s)
Definition: sqltypes.h:501
static std::unique_ptr< GeoBase > createGeoType(const std::string &wkt_or_wkb_hex)
Definition: Types.cpp:1061
void validate_and_set_sparse_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:381
std::vector< std::string > get_expanded_file_paths(const std::string &file_path, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:761
static std::vector< std::string > whitelisted_export_paths_
Definition: DdlUtils.h:94
HOST DEVICE int get_size() const
Definition: sqltypes.h:393
void validate_literal(const std::string &val, SQLTypeInfo column_type, const std::string &column_name)
Definition: DdlUtils.cpp:523
shared utility for globbing files, paths can be specified as either a single file, directory or wildcards
static constexpr int32_t kMaxNumericPrecision
Definition: sqltypes.h:48
Definition: sqltypes.h:66
SQLTypes
Definition: sqltypes.h:55
std::string tableName
SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
Definition: DdlUtils.cpp:38
void validate_and_set_array_size(ColumnDescriptor &cd, const SqlType *column_type)
Definition: DdlUtils.cpp:496
virtual void check_type()
Definition: DdlUtils.cpp:153
static void initialize(const std::string &data_dir, const std::string &allowed_import_paths, const std::string &allowed_export_paths)
Definition: DdlUtils.cpp:841
void validate_and_set_dictionary_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:344
#define LOG(tag)
Definition: Logger.h:285
std::string storageType
#define DEFAULT_MAX_CHUNK_SIZE
#define UNREACHABLE()
Definition: Logger.h:337
HOST DEVICE void set_subtype(SQLTypes st)
Definition: sqltypes.h:494
virtual int get_encoding_param() const
Definition: DdlUtils.cpp:212
Constants for Builtin SQL Types supported by HEAVY.AI.
const std::string kDefaultExportDirName
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
void validate_non_duplicate_column(const std::string &column_name, std::unordered_set< std::string > &upper_column_names)
Definition: DdlUtils.cpp:691
constexpr double a
Definition: Utm.h:32
void set_column_descriptor(const std::string &column_name, ColumnDescriptor &cd, SqlType *column_type, const bool not_null, const Encoding *encoding, const std::string *default_value)
Definition: DdlUtils.cpp:661
void validate_and_set_none_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:370
void set_input_srid(int d)
Definition: sqltypes.h:497
void validate_and_set_encoding(ColumnDescriptor &cd, const Encoding *encoding, const SqlType *column_type)
Definition: DdlUtils.cpp:437
bool g_use_date_in_days_default_encoding
Definition: DdlUtils.cpp:35
#define DEFAULT_MAX_ROWS
const std::string kDefaultImportDirName
static std::vector< std::string > whitelisted_import_paths_
Definition: DdlUtils.h:93
static void validateWhitelistedFilePath(const std::vector< std::string > &expanded_file_paths, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:866
void set_fixed_size()
Definition: sqltypes.h:502
void set_default_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:216
void set_scale(int s)
Definition: sqltypes.h:498
SQLTypes type
Definition: DdlUtils.h:54
void validate(T value)
Definition: Encoder.h:122
virtual SQLTypes get_type() const
Definition: DdlUtils.cpp:45
void validate_expanded_file_path(const std::string &file_path, const std::vector< std::string > &whitelisted_root_paths)
Definition: DdlUtils.cpp:744
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:337
virtual std::string to_string() const
Definition: DdlUtils.cpp:77
void validate_non_reserved_keyword(const std::string &column_name)
Definition: DdlUtils.cpp:700
specifies the content in-memory of a row in the column metadata table
void set_default_table_attributes(const std::string &table_name, TableDescriptor &td, const int32_t column_count)
Definition: DdlUtils.cpp:677
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
int get_precision() const
Definition: sqltypes.h:384
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:785
void set_output_srid(int s)
Definition: sqltypes.h:499
std::string to_upper(const std::string &str)
#define DEFAULT_PAGE_SIZE
void set_comp_param(int p)
Definition: sqltypes.h:505
void validate_and_set_compressed_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:398
std::optional< std::string > default_value
Definition: sqltypes.h:69
Definition: sqltypes.h:70
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:389
virtual const std::string * get_encoding_name() const
Definition: DdlUtils.cpp:208
int64_t convert_decimal_value_to_scale(const int64_t decimal_value, const SQLTypeInfo &type_info, const SQLTypeInfo &new_type_info)
Definition: Datum.cpp:622
std::string get_malformed_config_error_message(const std::string &config_key)
Definition: DdlUtils.cpp:738
void set_dimension(int d)
Definition: sqltypes.h:495
#define DEFAULT_FRAGMENT_ROWS
void validate_and_set_fixed_encoding(ColumnDescriptor &cd, int encoding_size, const SqlType *column_type)
Definition: DdlUtils.cpp:246
std::string table_type_enum_to_string(const TableType table_type)
Definition: DdlUtils.cpp:725
Fragmenter_Namespace::FragmenterType fragType
Encoding(std::string *encoding_name, int encoding_param)
Definition: DdlUtils.cpp:205
#define IS_INTEGER(T)
Definition: sqltypes.h:294
void set_whitelisted_paths(const std::string &config_key, const std::string &config_value, std::vector< std::string > &whitelisted_paths)
Definition: DdlUtils.cpp:818
Definition: sqltypes.h:58
virtual void set_param1(int param)
Definition: DdlUtils.cpp:53
static bool isBlacklistedPath(const std::string &path)
Definition: DdlUtils.cpp:893
void validate_table_type(const TableDescriptor *td, const TableType expected_table_type, const std::string &command)
Definition: DdlUtils.cpp:708
void set_notnull(bool n)
Definition: sqltypes.h:500
#define CHECK(condition)
Definition: Logger.h:291
bool is_geometry() const
Definition: sqltypes.h:592
void validate_and_set_default_value(ColumnDescriptor &cd, const std::string *default_value, bool not_null)
Definition: DdlUtils.cpp:640
bool is_high_precision_timestamp() const
Definition: sqltypes.h:1004
void validate_and_set_date_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:418
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
static constexpr size_t MAX_STRLEN
static void addToBlacklist(const std::string &path)
Definition: DdlUtils.cpp:888
Definition: sqltypes.h:62
std::unique_ptr< std::string > encoding_name
Definition: DdlUtils.h:76
SQLTypeInfo columnType
virtual void set_is_array(bool a)
Definition: DdlUtils.cpp:65
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:107
bool is_string() const
Definition: sqltypes.h:580
virtual int get_param1() const
Definition: DdlUtils.cpp:49
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:388
static constexpr char const * FOREIGN_TABLE
bool is_string_array() const
Definition: sqltypes.h:581
void validate(T value) const
Definition: Encoder.h:54
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:963
bool is_decimal() const
Definition: sqltypes.h:583
virtual int get_param2() const
Definition: DdlUtils.cpp:57
std::string columnName
#define IS_GEO(T)
Definition: sqltypes.h:300
virtual bool get_is_array() const
Definition: DdlUtils.cpp:61
HOST DEVICE int get_output_srid() const
Definition: sqltypes.h:387
virtual void set_array_size(int s)
Definition: DdlUtils.cpp:73
constexpr auto is_datetime(SQLTypes type)
Definition: sqltypes.h:315
static std::vector< std::string > blacklisted_paths_
Definition: DdlUtils.h:104
virtual int get_array_size() const
Definition: DdlUtils.cpp:69
void validate_and_set_type(ColumnDescriptor &cd, SqlType *column_type)
Definition: DdlUtils.cpp:473
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:493