19 #include <unordered_set>
21 #include <boost/algorithm/string.hpp>
22 #include <boost/filesystem.hpp>
23 #include <boost/program_options.hpp>
25 #include "rapidjson/document.h"
43 , array_size(array_size) {}
84 str =
"CHAR(" + boost::lexical_cast<std::string>(
param1) +
")";
87 str =
"VARCHAR(" + boost::lexical_cast<std::string>(
param1) +
")";
93 str =
"NUMERIC(" + boost::lexical_cast<std::string>(
param1);
95 str +=
", " + boost::lexical_cast<std::string>(
param2);
100 str =
"DECIMAL(" + boost::lexical_cast<std::string>(
param1);
102 str +=
", " + boost::lexical_cast<std::string>(
param2);
127 str +=
"(" + boost::lexical_cast<std::string>(
param1) +
")";
133 str +=
"(" + boost::lexical_cast<std::string>(
param1) +
")";
146 str += boost::lexical_cast<std::string>(
array_size);
158 throw std::runtime_error(
"CHAR and VARCHAR must have a positive dimension.");
164 throw std::runtime_error(
"DECIMAL and NUMERIC must have a positive precision.");
166 throw std::runtime_error(
"DECIMAL and NUMERIC precision cannot be larger than " +
170 throw std::runtime_error(
171 "DECIMAL and NUMERIC must have precision larger than scale.");
179 throw std::runtime_error(
180 "Only TIMESTAMP(n) where n = (0,3,6,9) are supported now.");
188 throw std::runtime_error(
"Only TIME(0) is supported now.");
206 : encoding_name(encoding_name), encoding_param(encoding_param) {}
230 throw std::runtime_error(cd.
columnName +
": Precision too high, max " +
259 throw std::runtime_error(cd.
columnName +
": Cannot apply FIXED encoding to " +
260 column_type->
to_string() +
" type array.");
269 throw std::runtime_error(
271 ": Fixed encoding is only supported for integer or time columns.");
276 if (encoding_size != 8) {
277 throw std::runtime_error(
279 ": Compression parameter for Fixed encoding on SMALLINT must be 8.");
283 if (encoding_size != 8 && encoding_size != 16) {
284 throw std::runtime_error(
286 ": Compression parameter for Fixed encoding on INTEGER must be 8 or 16.");
290 if (encoding_size != 8 && encoding_size != 16 && encoding_size != 32) {
292 ": Compression parameter for Fixed encoding on "
293 "BIGINT must be 8 or 16 or 32.");
298 if (encoding_size != 32) {
300 ": Compression parameter for Fixed encoding on "
301 "TIME or TIMESTAMP must be 32.");
303 throw std::runtime_error(
"Fixed encoding is not supported for TIMESTAMP(3|6|9).");
308 if (encoding_size != 32 && encoding_size != 16) {
310 ": Compression parameter for Fixed encoding on "
311 "DECIMAL must be 16 or 32.");
316 ": Precision too high for Fixed(32) encoding, max 9.");
321 ": Precision too high for Fixed(16) encoding, max 4.");
325 if (encoding_size != 32 && encoding_size != 16) {
327 ": Compression parameter for Fixed encoding on "
328 "DATE must be 16 or 32.");
332 throw std::runtime_error(cd.
columnName +
": Cannot apply FIXED encoding to " +
346 throw std::runtime_error(
348 ": Dictionary encoding is only supported on string or string array columns.");
351 if (encoding_size == 0) {
354 comp_param = encoding_size;
358 ": Compression parameter for string arrays must be 32");
360 if (comp_param != 8 && comp_param != 16 && comp_param != 32) {
361 throw std::runtime_error(
363 ": Compression parameter for Dictionary encoding must be 8 or 16 or 32.");
373 throw std::runtime_error(
375 ": None encoding is only supported on string, string array, or geo columns.");
385 ": Cannot do sparse column encoding on a NOT NULL column.");
387 if (encoding_size == 0 || encoding_size % 8 != 0 || encoding_size > 48) {
388 throw std::runtime_error(
390 "Must specify number of bits as 8, 16, 24, 32 or 48 as the parameter to "
391 "sparse-column encoding.");
400 throw std::runtime_error(
401 cd.
columnName +
": COMPRESSED encoding is only supported on WGS84 geo columns.");
404 if (encoding_size == 0) {
407 comp_param = encoding_size;
409 if (comp_param != 32) {
411 ": only 32-bit COMPRESSED geo encoding is supported");
422 ": Cannot apply days encoding to date array.");
426 ": Days encoding is only supported for DATE columns.");
428 if (encoding_size != 32 && encoding_size != 16) {
430 ": Compression parameter for Days encoding on "
431 "DATE must be 16 or 32.");
440 if (encoding ==
nullptr) {
444 if (boost::iequals(comp,
"fixed")) {
446 }
else if (boost::iequals(comp,
"rl")) {
451 }
else if (boost::iequals(comp,
"diff")) {
456 }
else if (boost::iequals(comp,
"dict")) {
458 }
else if (boost::iequals(comp,
"NONE")) {
460 }
else if (boost::iequals(comp,
"sparse")) {
462 }
else if (boost::iequals(comp,
"compressed")) {
464 }
else if (boost::iequals(comp,
"days")) {
467 throw std::runtime_error(cd.
columnName +
": Invalid column compression scheme " +
477 throw std::runtime_error(
"Unsupported type \"GEOMETRY\" specified.");
499 throw std::runtime_error(
501 ": Array of strings must be dictionary encoded. Specify ENCODING DICT");
507 if (array_size > 0) {
511 throw std::runtime_error(cd.
columnName +
": Unexpected fixed length array size");
525 const std::string& column_name) {
560 throw std::runtime_error(
"String too long for column " + column_name +
" was " +
566 if (val.front() !=
'{' || val.back() !=
'}') {
567 throw std::runtime_error(column_name +
568 ": arrays should start and end with curly braces");
570 std::vector<std::string> elements =
split(val.substr(1, val.length() - 2),
", ");
573 size_t expected_size = column_type.
get_size() / sti.get_size();
574 size_t actual_size = elements.size();
575 if (actual_size != expected_size) {
576 throw std::runtime_error(
"Fixed length array column " + column_name +
582 for (
const auto& element : elements) {
601 throw std::runtime_error(
"Unexpected geo literal '" + val +
"' for column " +
604 if (!geo->transform(column_type)) {
605 throw std::runtime_error(
"Cannot transform SRID for literal '" + val +
606 "' for column " + column_name);
608 auto sql_type = column_type.
get_type();
609 auto geo_type = geo->getType();
621 throw std::runtime_error(
"Geo literal '" + val +
622 "' doesn't match the type "
623 "of column column " +
628 throw std::runtime_error(
"Unexpected geo literal '" + val +
"' for column " +
629 column_name +
": " + e.what());
633 CHECK(
false) <<
"validate_literal() does not support type "
641 const std::string* default_value,
643 bool is_null_literal =
644 default_value && ((
to_upper(*default_value) ==
"NULL") ||
646 if (not_null && (is_null_literal)) {
648 ": cannot set default value to NULL for "
651 if (!default_value || is_null_literal) {
656 const auto& val = *default_value;
666 const std::string* default_value) {
679 const int32_t column_count) {
692 std::unordered_set<std::string>& upper_column_names) {
693 const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
694 const auto insert_it = upper_column_names.insert(upper_column_name);
695 if (!insert_it.second) {
696 throw std::runtime_error(
"Column '" + column_name +
"' defined more than once");
701 const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
703 throw std::runtime_error(
"Cannot create column with reserved keyword '" +
710 const std::string& command) {
713 throw std::runtime_error(td->
tableName +
" is a view. Use " + command +
" VIEW.");
717 throw std::runtime_error(td->
tableName +
" is a foreign table. Use " + command +
721 throw std::runtime_error(td->
tableName +
" is a table. Use " + command +
" TABLE.");
730 return "ForeignTable";
735 throw std::runtime_error{
"Unexpected table type"};
739 return "Configuration value for \"" + config_key +
740 "\" is malformed. Value should be a list of paths with format: [ "
741 "\"root-path-1\", \"root-path-2\", ... ]";
745 const std::vector<std::string>& whitelisted_root_paths) {
746 const auto& canonical_file_path = boost::filesystem::canonical(file_path);
747 for (
const auto& root_path : whitelisted_root_paths) {
748 if (boost::istarts_with(canonical_file_path.string(), root_path)) {
752 if (canonical_file_path == boost::filesystem::absolute(file_path)) {
753 throw std::runtime_error{
"File or directory path \"" + file_path +
754 "\" is not whitelisted."};
756 throw std::runtime_error{
"File or directory path \"" + file_path +
757 "\" (resolved to \"" + canonical_file_path.string() +
758 "\") is not whitelisted."};
762 const std::string& file_path,
764 std::vector<std::string> file_paths;
769 if (!boost::filesystem::exists(file_path)) {
772 path = boost::filesystem::path(file_path).parent_path().string();
773 if (!boost::filesystem::exists(path)) {
774 throw std::runtime_error{
"File or directory \"" + file_path +
775 "\" does not exist."};
787 const bool allow_wildcards) {
791 static const std::string safe_punctuation{
"./_+-=:~"};
792 for (
const auto& ch : file_path) {
793 if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos &&
794 !(allow_wildcards && ch ==
'*')) {
795 throw std::runtime_error(std::string(
"Punctuation \"") + ch +
796 "\" is not allowed in file path: " + file_path);
801 const auto& expanded_file_paths =
803 for (
const auto& path : expanded_file_paths) {
805 const auto& canonical_file_path = boost::filesystem::canonical(file_path);
806 if (canonical_file_path == boost::filesystem::absolute(file_path)) {
807 throw std::runtime_error{
"Access to file or directory path \"" + file_path +
808 "\" is not allowed."};
810 throw std::runtime_error{
"Access to file or directory path \"" + file_path +
811 "\" (resolved to \"" + canonical_file_path.string() +
812 "\") is not allowed."};
819 const std::string& config_value,
820 std::vector<std::string>& whitelisted_paths) {
821 rapidjson::Document whitelisted_root_paths;
822 whitelisted_root_paths.Parse(config_value);
823 if (!whitelisted_root_paths.IsArray()) {
826 for (
const auto& root_path : whitelisted_root_paths.GetArray()) {
827 if (!root_path.IsString()) {
830 if (!boost::filesystem::exists(root_path.GetString())) {
831 throw std::runtime_error{
"Whitelisted root path \"" +
832 std::string{root_path.GetString()} +
"\" does not exist."};
834 whitelisted_paths.emplace_back(
835 boost::filesystem::canonical(root_path.GetString()).
string());
837 LOG(
INFO) <<
"Parsed " << config_key <<
": "
842 const std::string& allowed_import_paths,
843 const std::string& allowed_export_paths) {
844 CHECK(!data_dir.empty());
845 CHECK(boost::filesystem::is_directory(data_dir));
847 auto data_dir_path = boost::filesystem::canonical(data_dir);
856 if (!allowed_import_paths.empty()) {
860 if (!allowed_export_paths.empty()) {
867 const std::vector<std::string>& expanded_file_paths,
869 for (
const auto& path : expanded_file_paths) {
889 CHECK(!path.empty());
894 const auto canonical_path = boost::filesystem::canonical(path).string();
896 std::string full_path;
898 full_path = boost::filesystem::canonical(blacklisted_path).string();
906 full_path = boost::filesystem::absolute(blacklisted_path).string();
908 if (boost::istarts_with(canonical_path, full_path)) {
static std::set< std::string > reserved_keywords
HOST DEVICE SQLTypes get_subtype() const
void set_compression(EncodingType c)
static std::unique_ptr< GeoBase > createGeoType(const std::string &wkt_or_wkb_hex)
void validate_and_set_sparse_encoding(ColumnDescriptor &cd, int encoding_size)
std::vector< std::string > get_expanded_file_paths(const std::string &file_path, const DataTransferType data_transfer_type)
static std::vector< std::string > whitelisted_export_paths_
HOST DEVICE int get_size() const
void validate_literal(const std::string &val, SQLTypeInfo column_type, const std::string &column_name)
Shared utility for globbing files; paths can be specified as a single file, a directory, or a wildcard pattern.
static constexpr int32_t kMaxNumericPrecision
SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
void validate_and_set_array_size(ColumnDescriptor &cd, const SqlType *column_type)
virtual void check_type()
static void initialize(const std::string &data_dir, const std::string &allowed_import_paths, const std::string &allowed_export_paths)
void validate_and_set_dictionary_encoding(ColumnDescriptor &cd, int encoding_size)
#define DEFAULT_MAX_CHUNK_SIZE
HOST DEVICE void set_subtype(SQLTypes st)
virtual int get_encoding_param() const
Constants for Builtin SQL Types supported by HEAVY.AI.
const std::string kDefaultExportDirName
HOST DEVICE SQLTypes get_type() const
void validate_non_duplicate_column(const std::string &column_name, std::unordered_set< std::string > &upper_column_names)
void set_column_descriptor(const std::string &column_name, ColumnDescriptor &cd, SqlType *column_type, const bool not_null, const Encoding *encoding, const std::string *default_value)
void validate_and_set_none_encoding(ColumnDescriptor &cd)
void set_input_srid(int d)
void validate_and_set_encoding(ColumnDescriptor &cd, const Encoding *encoding, const SqlType *column_type)
bool g_use_date_in_days_default_encoding
const std::string kDefaultImportDirName
static std::vector< std::string > whitelisted_import_paths_
static void validateWhitelistedFilePath(const std::vector< std::string > &expanded_file_paths, const DataTransferType data_transfer_type)
void set_default_encoding(ColumnDescriptor &cd)
virtual SQLTypes get_type() const
void validate_expanded_file_path(const std::string &file_path, const std::vector< std::string > &whitelisted_root_paths)
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
virtual std::string to_string() const
void validate_non_reserved_keyword(const std::string &column_name)
Specifies the in-memory content of a row in the column metadata table.
void set_default_table_attributes(const std::string &table_name, TableDescriptor &td, const int32_t column_count)
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
int get_precision() const
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
void set_output_srid(int s)
#define DEFAULT_PAGE_SIZE
void set_comp_param(int p)
void validate_and_set_compressed_encoding(ColumnDescriptor &cd, int encoding_size)
std::optional< std::string > default_value
HOST DEVICE EncodingType get_compression() const
virtual const std::string * get_encoding_name() const
int64_t convert_decimal_value_to_scale(const int64_t decimal_value, const SQLTypeInfo &type_info, const SQLTypeInfo &new_type_info)
std::string get_malformed_config_error_message(const std::string &config_key)
void set_dimension(int d)
#define DEFAULT_FRAGMENT_ROWS
void validate_and_set_fixed_encoding(ColumnDescriptor &cd, int encoding_size, const SqlType *column_type)
std::string table_type_enum_to_string(const TableType table_type)
Fragmenter_Namespace::FragmenterType fragType
Encoding(std::string *encoding_name, int encoding_param)
void set_whitelisted_paths(const std::string &config_key, const std::string &config_value, std::vector< std::string > &whitelisted_paths)
virtual void set_param1(int param)
static bool isBlacklistedPath(const std::string &path)
void validate_table_type(const TableDescriptor *td, const TableType expected_table_type, const std::string &command)
void validate_and_set_default_value(ColumnDescriptor &cd, const std::string *default_value, bool not_null)
bool is_high_precision_timestamp() const
void validate_and_set_date_encoding(ColumnDescriptor &cd, int encoding_size)
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
static constexpr size_t MAX_STRLEN
static void addToBlacklist(const std::string &path)
std::unique_ptr< std::string > encoding_name
virtual void set_is_array(bool a)
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
virtual int get_param1() const
HOST DEVICE bool get_notnull() const
static constexpr char const * FOREIGN_TABLE
bool is_string_array() const
void validate(T value) const
SQLTypeInfo get_elem_type() const
virtual int get_param2() const
virtual bool get_is_array() const
HOST DEVICE int get_output_srid() const
virtual void set_array_size(int s)
constexpr auto is_datetime(SQLTypes type)
static std::vector< std::string > blacklisted_paths_
virtual int get_array_size() const
void validate_and_set_type(ColumnDescriptor &cd, SqlType *column_type)
HOST DEVICE void set_type(SQLTypes t)