OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
import_export::Detector Class Reference

#include <Importer.h>

+ Inheritance diagram for import_export::Detector:
+ Collaboration diagram for import_export::Detector:

Public Member Functions

 Detector (const boost::filesystem::path &fp, CopyParams &cp)
 
std::vector< std::string > get_headers ()
 
std::vector< std::vector< std::string > > get_sample_rows (size_t n)
 
std::vector< SQLTypeInfo > getBestColumnTypes () const
 
- Public Member Functions inherited from import_export::DataStreamSink
 DataStreamSink ()
 
 DataStreamSink (const CopyParams &copy_params, const std::string file_path)
 
virtual ~DataStreamSink ()
 
const CopyParams & get_copy_params () const
 
void import_compressed (std::vector< std::string > &file_paths, const Catalog_Namespace::SessionInfo *session_info)
 

Static Public Member Functions

static SQLTypes detect_sqltype (const std::string &str)
 

Public Attributes

std::vector< std::vector< std::string > > raw_rows
 
bool has_headers = false
 

Static Public Attributes

static constexpr size_t kDefaultSampleRowsCount {100}
 

Private Member Functions

void init ()
 
void read_file ()
 
void detect_row_delimiter ()
 
void split_raw_data ()
 
std::vector< SQLTypes > detect_column_types (const std::vector< std::string > &row)
 
void find_best_sqltypes ()
 
std::vector< SQLTypes > find_best_sqltypes (const std::vector< std::vector< std::string >> &raw_rows, const CopyParams &copy_params)
 
std::vector< SQLTypes > find_best_sqltypes (const std::vector< std::vector< std::string >>::const_iterator &row_begin, const std::vector< std::vector< std::string >>::const_iterator &row_end, const CopyParams &copy_params)
 
std::vector< EncodingType > find_best_encodings (const std::vector< std::vector< std::string >>::const_iterator &row_begin, const std::vector< std::vector< std::string >>::const_iterator &row_end, const std::vector< SQLTypes > &best_types)
 
bool detect_headers (const std::vector< SQLTypes > &first_types, const std::vector< SQLTypes > &rest_types)
 
void find_best_sqltypes_and_headers ()
 
ImportStatus importDelimited (const std::string &file_path, const bool decompressed, const Catalog_Namespace::SessionInfo *session_info) override
 

Static Private Member Functions

static bool more_restrictive_sqltype (const SQLTypes a, const SQLTypes b)
 

Private Attributes

std::string raw_data
 
boost::filesystem::path file_path
 
std::chrono::duration< double > timeout {1}
 
std::string line1
 
std::vector< SQLTypes > best_sqltypes
 
std::vector< EncodingType > best_encodings
 

Additional Inherited Members

- Protected Member Functions inherited from import_export::DataStreamSink
ImportStatus archivePlumber (const Catalog_Namespace::SessionInfo *session_info)
 
- Protected Attributes inherited from import_export::DataStreamSink
CopyParams copy_params
 
const std::string file_path
 
FILE * p_file = nullptr
 
ImportStatus import_status_
 
heavyai::shared_mutex import_mutex_
 
size_t total_file_size {0}
 
std::vector< size_t > file_offsets
 
std::mutex file_offsets_mutex
 

Detailed Description

Definition at line 721 of file Importer.h.

Constructor & Destructor Documentation

import_export::Detector::Detector ( const boost::filesystem::path &  fp,
CopyParams &  cp
)

Definition at line 3748 of file Importer.cpp.

References g_enable_fsi, init(), import_export::kParquetFile, read_file(), and import_export::CopyParams::source_type.

3749  : DataStreamSink(cp, fp.string()), file_path(fp) {
3750 #ifdef ENABLE_IMPORT_PARQUET
3751  if (cp.source_type == import_export::SourceType::kParquetFile && g_enable_fsi &&
3752  !g_enable_legacy_parquet_import) {
3753  data_preview_ = get_parquet_data_preview(fp.string(), cp);
3754  } else
3755 #endif
3756  {
3757  read_file();
3758  init();
3759  }
3760 }
boost::filesystem::path file_path
Definition: Importer.h:768
bool g_enable_fsi
Definition: Catalog.cpp:96

+ Here is the call graph for this function:

Member Function Documentation

std::vector< SQLTypes > import_export::Detector::detect_column_types ( const std::vector< std::string > &  row)
private

Definition at line 3383 of file Importer.cpp.

References detect_sqltype().

Referenced by find_best_sqltypes_and_headers().

3383  {
3384  std::vector<SQLTypes> types(row.size());
3385  for (size_t i = 0; i < row.size(); i++) {
3386  types[i] = detect_sqltype(row[i]);
3387  }
3388  return types;
3389 }
static SQLTypes detect_sqltype(const std::string &str)
Definition: Importer.cpp:3293

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool import_export::Detector::detect_headers ( const std::vector< SQLTypes > &  first_types,
const std::vector< SQLTypes > &  rest_types 
)
private

Definition at line 3527 of file Importer.cpp.

References has_headers, and kTEXT.

Referenced by find_best_sqltypes_and_headers().

3528  {
3529  if (head_types.size() != tail_types.size()) {
3530  return false;
3531  }
3532  bool has_headers = false;
3533  for (size_t col_idx = 0; col_idx < tail_types.size(); col_idx++) {
3534  if (head_types[col_idx] != kTEXT) {
3535  return false;
3536  }
3537  has_headers = has_headers || tail_types[col_idx] != kTEXT;
3538  }
3539  return has_headers;
3540 }
Definition: sqltypes.h:69

+ Here is the caller graph for this function:

void import_export::Detector::detect_row_delimiter ( )
private

Definition at line 3233 of file Importer.cpp.

References import_export::DataStreamSink::copy_params, import_export::CopyParams::delimiter, and file_path.

Referenced by init().

3233  {
3234  if (copy_params.delimiter == '\0') {
3235  copy_params.delimiter = ',';
3236  if (boost::filesystem::extension(file_path) == ".tsv") {
3237  copy_params.delimiter = '\t';
3238  }
3239  }
3240 }
boost::filesystem::path file_path
Definition: Importer.h:768

+ Here is the caller graph for this function:

SQLTypes import_export::Detector::detect_sqltype ( const std::string &  str)
static

Definition at line 3293 of file Importer.cpp.

References dateTimeParseOptional< kDATE >(), dateTimeParseOptional< kTIME >(), dateTimeParseOptional< kTIMESTAMP >(), kBIGINT, kDATE, kDOUBLE, kFLOAT, kINT, kLINESTRING, kMULTILINESTRING, kMULTIPOINT, kMULTIPOLYGON, kPOINT, kPOLYGON, kSMALLINT, kTEXT, kTIME, kTIMESTAMP, import_export::PROMOTE_POLYGON_TO_MULTIPOLYGON, shared::transform(), and run_benchmark_import::type.

Referenced by detect_column_types(), and find_best_sqltypes().

3293  {
3294  SQLTypes type = kTEXT;
3295  if (try_cast<double>(str)) {
3296  type = kDOUBLE;
3297  /*if (try_cast<bool>(str)) {
3298  type = kBOOLEAN;
3299  }*/
3300  if (try_cast<int16_t>(str)) {
3301  type = kSMALLINT;
3302  } else if (try_cast<int32_t>(str)) {
3303  type = kINT;
3304  } else if (try_cast<int64_t>(str)) {
3305  type = kBIGINT;
3306  } else if (try_cast<float>(str)) {
3307  type = kFLOAT;
3308  }
3309  }
3310 
3311  // check for geo types
3312  if (type == kTEXT) {
3313  // convert to upper case
3314  std::string str_upper_case = str;
3316  str_upper_case.begin(), str_upper_case.end(), str_upper_case.begin(), ::toupper);
3317 
3318  // then test for leading words
3319  if (str_upper_case.find("POINT") == 0) {
3320  type = kPOINT;
3321  } else if (str_upper_case.find("MULTIPOINT") == 0) {
3322  type = kMULTIPOINT;
3323  } else if (str_upper_case.find("LINESTRING") == 0) {
3324  type = kLINESTRING;
3325  } else if (str_upper_case.find("MULTILINESTRING") == 0) {
3326  type = kMULTILINESTRING;
3327  } else if (str_upper_case.find("POLYGON") == 0) {
3329  type = kMULTIPOLYGON;
3330  } else {
3331  type = kPOLYGON;
3332  }
3333  } else if (str_upper_case.find("MULTIPOLYGON") == 0) {
3334  type = kMULTIPOLYGON;
3335  } else if (str_upper_case.find_first_not_of("0123456789ABCDEF") ==
3336  std::string::npos &&
3337  (str_upper_case.size() % 2) == 0) {
3338  // simple hex blob (two characters per byte, not uu-encode or base64)
3339  if (str_upper_case.size() >= 10) {
3340  // match WKB blobs for supported geometry types
3341  // the first byte specifies if the data is big-endian or little-endian
3342  // the next four bytes are the geometry type (1 = POINT etc.)
3343  // @TODO support eWKB, which has extra bits set in the geometry type
3344  auto first_five_bytes = str_upper_case.substr(0, 10);
3345  if (first_five_bytes == "0000000001" || first_five_bytes == "0101000000") {
3346  type = kPOINT;
3347  } else if (first_five_bytes == "0000000004" || first_five_bytes == "0104000000") {
3348  type = kMULTIPOINT;
3349  } else if (first_five_bytes == "0000000002" || first_five_bytes == "0102000000") {
3350  type = kLINESTRING;
3351  } else if (first_five_bytes == "0000000005" || first_five_bytes == "0105000000") {
3352  type = kMULTILINESTRING;
3353  } else if (first_five_bytes == "0000000003" || first_five_bytes == "0103000000") {
3354  type = kPOLYGON;
3355  } else if (first_five_bytes == "0000000006" || first_five_bytes == "0106000000") {
3356  type = kMULTIPOLYGON;
3357  } else {
3358  // unsupported WKB type
3359  return type;
3360  }
3361  } else {
3362  // too short to be WKB
3363  return type;
3364  }
3365  }
3366  }
3367 
3368  // check for time types
3369  if (type == kTEXT) {
3370  // This won't match unix timestamp, since floats and ints were checked above.
3371  if (dateTimeParseOptional<kTIME>(str, 0)) {
3372  type = kTIME;
3373  } else if (dateTimeParseOptional<kTIMESTAMP>(str, 0)) {
3374  type = kTIMESTAMP;
3375  } else if (dateTimeParseOptional<kDATE>(str, 0)) {
3376  type = kDATE;
3377  }
3378  }
3379 
3380  return type;
3381 }
Definition: sqltypes.h:66
SQLTypes
Definition: sqltypes.h:55
std::optional< int64_t > dateTimeParseOptional< kTIME >(std::string_view str, unsigned const dim)
std::optional< int64_t > dateTimeParseOptional< kDATE >(std::string_view str, unsigned const dim)
static constexpr bool PROMOTE_POLYGON_TO_MULTIPOLYGON
Definition: Importer.cpp:163
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:320
std::optional< int64_t > dateTimeParseOptional< kTIMESTAMP >(std::string_view str, unsigned const dim)
Definition: sqltypes.h:69
Definition: sqltypes.h:70
Definition: sqltypes.h:62

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector< EncodingType > import_export::Detector::find_best_encodings ( const std::vector< std::vector< std::string >>::const_iterator &  row_begin,
const std::vector< std::vector< std::string >>::const_iterator &  row_end,
const std::vector< SQLTypes > &  best_types 
)
private

Definition at line 3491 of file Importer.cpp.

References file_path, IS_STRING, kENCODING_DICT, kENCODING_NONE, and raw_rows.

Referenced by find_best_sqltypes_and_headers().

3494  {
3495  if (raw_rows.size() < 1) {
3496  throw std::runtime_error("No rows found in: " +
3497  boost::filesystem::basename(file_path));
3498  }
3499  size_t num_cols = best_types.size();
3500  std::vector<EncodingType> best_encodes(num_cols, kENCODING_NONE);
3501  std::vector<size_t> num_rows_per_col(num_cols, 1);
3502  std::vector<std::unordered_set<std::string>> count_set(num_cols);
3503  for (auto row = row_begin; row != row_end; row++) {
3504  for (size_t col_idx = 0; col_idx < row->size() && col_idx < num_cols; col_idx++) {
3505  if (IS_STRING(best_types[col_idx])) {
3506  count_set[col_idx].insert(row->at(col_idx));
3507  num_rows_per_col[col_idx]++;
3508  }
3509  }
3510  }
3511  for (size_t col_idx = 0; col_idx < num_cols; col_idx++) {
3512  if (IS_STRING(best_types[col_idx])) {
3513  float uniqueRatio =
3514  static_cast<float>(count_set[col_idx].size()) / num_rows_per_col[col_idx];
3515  if (uniqueRatio < 0.75) {
3516  best_encodes[col_idx] = kENCODING_DICT;
3517  }
3518  }
3519  }
3520  return best_encodes;
3521 }
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731
boost::filesystem::path file_path
Definition: Importer.h:768
#define IS_STRING(T)
Definition: sqltypes.h:299

+ Here is the caller graph for this function:

void import_export::Detector::find_best_sqltypes ( )
private

Definition at line 3438 of file Importer.cpp.

References best_sqltypes, import_export::DataStreamSink::copy_params, and raw_rows.

Referenced by find_best_sqltypes(), and find_best_sqltypes_and_headers().

3438  {
3440 }
std::vector< SQLTypes > best_sqltypes
Definition: Importer.h:774
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731

+ Here is the caller graph for this function:

std::vector< SQLTypes > import_export::Detector::find_best_sqltypes ( const std::vector< std::vector< std::string >> &  raw_rows,
const CopyParams &  copy_params
)
private

Definition at line 3442 of file Importer.cpp.

References import_export::DataStreamSink::copy_params, find_best_sqltypes(), and raw_rows.

3444  {
3445  return find_best_sqltypes(raw_rows.begin(), raw_rows.end(), copy_params);
3446 }
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731

+ Here is the call graph for this function:

std::vector< SQLTypes > import_export::Detector::find_best_sqltypes ( const std::vector< std::vector< std::string >>::const_iterator &  row_begin,
const std::vector< std::vector< std::string >>::const_iterator &  row_end,
const CopyParams &  copy_params
)
private

Definition at line 3448 of file Importer.cpp.

References detect_sqltype(), run_benchmark_import::end_time, file_path, kCHAR, kTEXT, more_restrictive_sqltype(), import_export::CopyParams::null_str, raw_rows, and timeout.

3451  {
3452  if (raw_rows.size() < 1) {
3453  throw std::runtime_error("No rows found in: " +
3454  boost::filesystem::basename(file_path));
3455  }
3456  auto end_time = std::chrono::steady_clock::now() + timeout;
3457  size_t num_cols = raw_rows.front().size();
3458  std::vector<SQLTypes> best_types(num_cols, kCHAR);
3459  std::vector<size_t> non_null_col_counts(num_cols, 0);
3460  for (auto row = row_begin; row != row_end; row++) {
3461  while (best_types.size() < row->size() || non_null_col_counts.size() < row->size()) {
3462  best_types.push_back(kCHAR);
3463  non_null_col_counts.push_back(0);
3464  }
3465  for (size_t col_idx = 0; col_idx < row->size(); col_idx++) {
3466  // do not count nulls
3467  if (row->at(col_idx) == "" || !row->at(col_idx).compare(copy_params.null_str)) {
3468  continue;
3469  }
3470  SQLTypes t = detect_sqltype(row->at(col_idx));
3471  non_null_col_counts[col_idx]++;
3472  if (!more_restrictive_sqltype(best_types[col_idx], t)) {
3473  best_types[col_idx] = t;
3474  }
3475  }
3476  if (std::chrono::steady_clock::now() > end_time) {
3477  break;
3478  }
3479  }
3480  for (size_t col_idx = 0; col_idx < num_cols; col_idx++) {
3481  // if we don't have any non-null values for this column make it text to be
3482  // safe b/c that is least restrictive type
3483  if (non_null_col_counts[col_idx] == 0) {
3484  best_types[col_idx] = kTEXT;
3485  }
3486  }
3487 
3488  return best_types;
3489 }
SQLTypes
Definition: sqltypes.h:55
static SQLTypes detect_sqltype(const std::string &str)
Definition: Importer.cpp:3293
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731
boost::filesystem::path file_path
Definition: Importer.h:768
Definition: sqltypes.h:69
static bool more_restrictive_sqltype(const SQLTypes a, const SQLTypes b)
Definition: Importer.cpp:3391
std::chrono::duration< double > timeout
Definition: Importer.h:769
Definition: sqltypes.h:58

+ Here is the call graph for this function:

void import_export::Detector::find_best_sqltypes_and_headers ( )
private

Definition at line 3415 of file Importer.cpp.

References best_encodings, best_sqltypes, import_export::DataStreamSink::copy_params, detect_column_types(), detect_headers(), find_best_encodings(), find_best_sqltypes(), import_export::CopyParams::has_header, has_headers, import_export::kAutoDetect, import_export::kHasHeader, import_export::kNoHeader, and raw_rows.

Referenced by init().

3415  {
3417  best_encodings =
3418  find_best_encodings(raw_rows.begin() + 1, raw_rows.end(), best_sqltypes);
3419  std::vector<SQLTypes> head_types = detect_column_types(raw_rows.at(0));
3420  switch (copy_params.has_header) {
3422  has_headers = detect_headers(head_types, best_sqltypes);
3423  if (has_headers) {
3425  } else {
3427  }
3428  break;
3430  has_headers = false;
3431  break;
3433  has_headers = true;
3434  break;
3435  }
3436 }
std::vector< SQLTypes > best_sqltypes
Definition: Importer.h:774
ImportHeaderRow has_header
Definition: CopyParams.h:46
std::vector< EncodingType > find_best_encodings(const std::vector< std::vector< std::string >>::const_iterator &row_begin, const std::vector< std::vector< std::string >>::const_iterator &row_end, const std::vector< SQLTypes > &best_types)
Definition: Importer.cpp:3491
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731
std::vector< EncodingType > best_encodings
Definition: Importer.h:775
bool detect_headers(const std::vector< SQLTypes > &first_types, const std::vector< SQLTypes > &rest_types)
Definition: Importer.cpp:3527
std::vector< SQLTypes > detect_column_types(const std::vector< std::string > &row)
Definition: Importer.cpp:3383

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector< std::string > import_export::Detector::get_headers ( )

Definition at line 3557 of file Importer.cpp.

References best_sqltypes, has_headers, raw_rows, and to_string().

Referenced by DBHandler::detect_column_types().

3557  {
3558 #if defined(ENABLE_IMPORT_PARQUET)
3559  if (data_preview_.has_value()) {
3560  return data_preview_.value().column_names;
3561  } else
3562 #endif
3563  {
3564  std::vector<std::string> headers(best_sqltypes.size());
3565  for (size_t i = 0; i < best_sqltypes.size(); i++) {
3566  if (has_headers && i < raw_rows[0].size()) {
3567  headers[i] = raw_rows[0][i];
3568  } else {
3569  headers[i] = "column_" + std::to_string(i + 1);
3570  }
3571  }
3572  return headers;
3573  }
3574 }
std::vector< SQLTypes > best_sqltypes
Definition: Importer.h:774
std::string to_string(char const *&&v)
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector< std::vector< std::string > > import_export::Detector::get_sample_rows ( size_t  n)

Definition at line 3542 of file Importer.cpp.

References has_headers, anonymous_namespace{Utm.h}::n, and raw_rows.

Referenced by DBHandler::detect_column_types().

3542  {
3543 #if defined(ENABLE_IMPORT_PARQUET)
3544  if (data_preview_.has_value()) {
3545  return data_preview_.value().sample_rows;
3546  } else
3547 #endif
3548  {
3549  n = std::min(n, raw_rows.size());
3550  size_t offset = (has_headers && raw_rows.size() > 1) ? 1 : 0;
3551  std::vector<std::vector<std::string>> sample_rows(raw_rows.begin() + offset,
3552  raw_rows.begin() + n);
3553  return sample_rows;
3554  }
3555 }
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731
constexpr double n
Definition: Utm.h:38

+ Here is the caller graph for this function:

std::vector< SQLTypeInfo > import_export::Detector::getBestColumnTypes ( ) const

Definition at line 3576 of file Importer.cpp.

References best_encodings, best_sqltypes, and CHECK_EQ.

Referenced by DBHandler::detect_column_types().

3576  {
3577 #if defined(ENABLE_IMPORT_PARQUET)
3578  if (data_preview_.has_value()) {
3579  return data_preview_.value().column_types;
3580  } else
3581 #endif
3582  {
3583  std::vector<SQLTypeInfo> types;
3584  CHECK_EQ(best_sqltypes.size(), best_encodings.size());
3585  for (size_t i = 0; i < best_sqltypes.size(); i++) {
3586  types.emplace_back(best_sqltypes[i], false, best_encodings[i]);
3587  }
3588  return types;
3589  }
3590 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
std::vector< SQLTypes > best_sqltypes
Definition: Importer.h:774
std::vector< EncodingType > best_encodings
Definition: Importer.h:775

+ Here is the caller graph for this function:

ImportStatus import_export::Detector::importDelimited ( const std::string &  file_path,
const bool  decompressed,
const Catalog_Namespace::SessionInfo *  session_info
)
override private virtual

Implements import_export::DataStreamSink.

Definition at line 3163 of file Importer.cpp.

References import_export::DataStreamSink::copy_params, run_benchmark_import::end_time, heavyai::fopen(), import_export::DataStreamSink::import_mutex_, import_export::DataStreamSink::import_status_, import_export::kImportRowLimit, parse_ast::line, line1, import_export::CopyParams::line_delim, import_export::ImportStatus::load_failed, anonymous_namespace{Utm.h}::n, import_export::DataStreamSink::p_file, raw_data, import_export::ImportStatus::rows_completed, and timeout.

3166  {
3167  // we do not check interrupt status for this detection
3168  if (!p_file) {
3169  p_file = fopen(file_path.c_str(), "rb");
3170  }
3171  if (!p_file) {
3172  throw std::runtime_error("failed to open file '" + file_path +
3173  "': " + strerror(errno));
3174  }
3175 
3176  // somehow clang does not support ext/stdio_filebuf.h, so
3177  // need to diy readline with customized copy_params.line_delim...
3178  std::string line;
3179  line.reserve(1 * 1024 * 1024);
3180  auto end_time = std::chrono::steady_clock::now() +
3181  timeout * (boost::istarts_with(file_path, "s3://") ? 3 : 1);
3182  try {
3183  while (!feof(p_file)) {
3184  int c;
3185  size_t n = 0;
3186  while (EOF != (c = fgetc(p_file)) && copy_params.line_delim != c) {
3187  if (n++ >= line.capacity()) {
3188  break;
3189  }
3190  line += c;
3191  }
3192  if (0 == n) {
3193  break;
3194  }
3195  // remember the first line, which is possibly a header line, to
3196  // ignore identical header line(s) in 2nd+ files of a archive;
3197  // otherwise, 2nd+ header may be mistaken as an all-string row
3198  // and so be final column types.
3199  if (line1.empty()) {
3200  line1 = line;
3201  } else if (line == line1) {
3202  line.clear();
3203  continue;
3204  }
3205 
3206  raw_data += line;
3208  line.clear();
3210  if (std::chrono::steady_clock::now() > end_time) {
3212  // stop import when row limit reached
3213  break;
3214  }
3215  }
3216  }
3217  } catch (std::exception& e) {
3218  }
3219 
3221  import_status_.load_failed = true;
3222 
3223  fclose(p_file);
3224  p_file = nullptr;
3225  return import_status_;
3226 }
std::lock_guard< T > lock_guard
heavyai::unique_lock< heavyai::shared_mutex > write_lock
::FILE * fopen(const char *filename, const char *mode)
Definition: heavyai_fs.cpp:74
boost::filesystem::path file_path
Definition: Importer.h:768
tuple line
Definition: parse_ast.py:10
static const size_t kImportRowLimit
Definition: Importer.cpp:169
std::chrono::duration< double > timeout
Definition: Importer.h:769
std::string raw_data
Definition: Importer.h:767
constexpr double n
Definition: Utm.h:38
heavyai::shared_mutex import_mutex_
Definition: Importer.h:715

+ Here is the call graph for this function:

void import_export::Detector::init ( )
private

Definition at line 3157 of file Importer.cpp.

References detect_row_delimiter(), find_best_sqltypes_and_headers(), and split_raw_data().

Referenced by Detector().

3157  {
3159  split_raw_data();
3161 }
void find_best_sqltypes_and_headers()
Definition: Importer.cpp:3415

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool import_export::Detector::more_restrictive_sqltype ( const SQLTypes  a,
const SQLTypes  b 
)
static private

Definition at line 3391 of file Importer.cpp.

References anonymous_namespace{Utm.h}::a, kBIGINT, kBOOLEAN, kCHAR, kDATE, kDOUBLE, kFLOAT, kINT, kLINESTRING, kMULTILINESTRING, kMULTIPOINT, kMULTIPOLYGON, kPOINT, kPOLYGON, kSMALLINT, kTEXT, kTIME, and kTIMESTAMP.

Referenced by find_best_sqltypes().

3391  {
3392  static std::array<int, kSQLTYPE_LAST> typeorder;
3393  typeorder[kCHAR] = 0;
3394  typeorder[kBOOLEAN] = 2;
3395  typeorder[kSMALLINT] = 3;
3396  typeorder[kINT] = 4;
3397  typeorder[kBIGINT] = 5;
3398  typeorder[kFLOAT] = 6;
3399  typeorder[kDOUBLE] = 7;
3400  typeorder[kTIMESTAMP] = 8;
3401  typeorder[kTIME] = 9;
3402  typeorder[kDATE] = 10;
3403  typeorder[kPOINT] = 11;
3404  typeorder[kMULTIPOINT] = 11;
3405  typeorder[kLINESTRING] = 11;
3406  typeorder[kMULTILINESTRING] = 11;
3407  typeorder[kPOLYGON] = 11;
3408  typeorder[kMULTIPOLYGON] = 11;
3409  typeorder[kTEXT] = 12;
3410 
3411  // note: b < a instead of a < b because the map is ordered most to least restrictive
3412  return typeorder[b] < typeorder[a];
3413 }
Definition: sqltypes.h:66
constexpr double a
Definition: Utm.h:32
Definition: sqltypes.h:69
Definition: sqltypes.h:70
Definition: sqltypes.h:58
Definition: sqltypes.h:62

+ Here is the caller graph for this function:

void import_export::Detector::read_file ( )
private

Definition at line 3228 of file Importer.cpp.

References import_export::DataStreamSink::archivePlumber().

Referenced by Detector().

3228  {
3229  // this becomes analogous to Importer::import()
3230  (void)DataStreamSink::archivePlumber(nullptr);
3231 }
ImportStatus archivePlumber(const Catalog_Namespace::SessionInfo *session_info)
Definition: Importer.cpp:3638

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void import_export::Detector::split_raw_data ( )
private

Definition at line 3242 of file Importer.cpp.

References import_export::DataStreamSink::copy_params, import_export::delimited_parser::get_row(), raw_data, raw_rows, and import_export::CopyParams::threads.

Referenced by init().

3242  {
3243  const char* buf = raw_data.c_str();
3244  const char* buf_end = buf + raw_data.size();
3245  bool try_single_thread = false;
3246  for (const char* p = buf; p < buf_end; p++) {
3247  std::vector<std::string> row;
3248  std::vector<std::unique_ptr<char[]>> tmp_buffers;
3250  buf_end,
3251  buf_end,
3252  copy_params,
3253  nullptr,
3254  row,
3255  tmp_buffers,
3256  try_single_thread,
3257  true);
3258  raw_rows.push_back(row);
3259  if (try_single_thread) {
3260  break;
3261  }
3262  }
3263  if (try_single_thread) {
3264  copy_params.threads = 1;
3265  raw_rows.clear();
3266  for (const char* p = buf; p < buf_end; p++) {
3267  std::vector<std::string> row;
3268  std::vector<std::unique_ptr<char[]>> tmp_buffers;
3270  buf_end,
3271  buf_end,
3272  copy_params,
3273  nullptr,
3274  row,
3275  tmp_buffers,
3276  try_single_thread,
3277  true);
3278  raw_rows.push_back(row);
3279  }
3280  }
3281 }
const char * get_row(const char *buf, const char *buf_end, const char *entire_buf_end, const import_export::CopyParams &copy_params, const bool *is_array, std::vector< T > &row, std::vector< std::unique_ptr< char[]>> &tmp_buffers, bool &try_single_thread, bool filter_empty_lines)
Parses the first row in the given buffer and inserts fields into given vector.
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:731
std::string raw_data
Definition: Importer.h:767

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

Member Data Documentation

std::vector<EncodingType> import_export::Detector::best_encodings
private

Definition at line 775 of file Importer.h.

Referenced by find_best_sqltypes_and_headers(), and getBestColumnTypes().

std::vector<SQLTypes> import_export::Detector::best_sqltypes
private
boost::filesystem::path import_export::Detector::file_path
private

Definition at line 768 of file Importer.h.

Referenced by detect_row_delimiter(), find_best_encodings(), and find_best_sqltypes().

bool import_export::Detector::has_headers = false
constexpr size_t import_export::Detector::kDefaultSampleRowsCount {100}
static

Definition at line 737 of file Importer.h.

Referenced by DBHandler::detect_column_types().

std::string import_export::Detector::line1
private

Definition at line 770 of file Importer.h.

Referenced by importDelimited().

std::string import_export::Detector::raw_data
private

Definition at line 767 of file Importer.h.

Referenced by importDelimited(), and split_raw_data().

std::vector<std::vector<std::string> > import_export::Detector::raw_rows
std::chrono::duration<double> import_export::Detector::timeout {1}
private

Definition at line 769 of file Importer.h.

Referenced by find_best_sqltypes(), and importDelimited().


The documentation for this class was generated from the following files: