OmniSciDB  343343d194
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Importer.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * @file Importer.h
19  * @author Wei Hong < wei@mapd.com>
20  * @brief Importer class for table import from file
21  */
22 #ifndef _IMPORTER_H_
23 #define _IMPORTER_H_
24 
25 #include "Shared/Logger.h"
26 #include "Shared/fixautotools.h"
27 
28 #include <gdal.h>
29 #include <ogrsf_frmts.h>
30 
31 #include <atomic>
32 #include <boost/filesystem.hpp>
33 #include <boost/noncopyable.hpp>
34 #include <boost/tokenizer.hpp>
35 #include <condition_variable>
36 #include <cstdio>
37 #include <cstdlib>
38 #include <iostream>
39 #include <list>
40 #include <map>
41 #include <memory>
42 #include <mutex>
43 #include <set>
44 #include <string>
45 #include <utility>
46 
47 #include "../Catalog/Catalog.h"
48 #include "../Catalog/TableDescriptor.h"
49 #include "../Chunk/Chunk.h"
50 #include "../Fragmenter/Fragmenter.h"
51 #include "../Shared/ThreadController.h"
52 #include "../Shared/checked_alloc.h"
53 
55 
56 // Some builds of boost::geometry require iostream, but don't explicitly include it.
57 // Placing in own section to ensure it's included after iostream.
58 #include <boost/geometry/index/rtree.hpp>
59 
60 class TDatum;
61 class TColumn;
62 
// Import file buffer size: 1 << 23 (8,388,608; presumably bytes - confirm at use sites).
// Trade-off: a larger value needs more memory, a smaller value causes more
// thread forks.
static constexpr size_t kImportFileBufferSize = size_t{1} << 23;
65 
66 namespace arrow {
67 
68 class Array;
69 
70 } // namespace arrow
71 
72 namespace Importer_NS {
73 
74 class Importer;
75 
76 using ArraySliceRange = std::pair<size_t, size_t>;
77 
79  std::mutex mutex;
80  std::set<int64_t> rows;
81  std::atomic<int> nerrors;
82  std::string file_name;
83  int row_group;
85 };
86 
// Kinds of input an import can read from.
// PARQUET is only available when ENABLE_IMPORT_PARQUET is defined.
enum class FileType {
  DELIMITED,
  POLYGON,
#ifdef ENABLE_IMPORT_PARQUET
  PARQUET,
#endif
};
95 
97 
98 struct CopyParams {
99  char delimiter;
100  std::string null_str;
102  bool quoted; // does the input have any quoted fields, default to false
103  char quote;
104  char escape;
108  char array_end;
109  int threads;
110  size_t
111  max_reject; // maximum number of records that can be rejected before copy is failed
113  bool plain_text = false;
114  // s3/parquet related params
115  std::string s3_access_key; // per-query credentials to override the
116  std::string s3_secret_key; // settings in ~/.aws/credentials or environment
117  std::string s3_region;
118  std::string s3_endpoint;
119  // kafka related params
120  size_t retry_count;
121  size_t retry_wait;
122  size_t batch_size;
123  size_t buffer_size;
124  // geospatial params
125  bool lonlat;
131  std::string geo_layer_name;
132 
134  : delimiter(',')
135  , null_str("\\N")
137  , quoted(true)
138  , quote('"')
139  , escape('"')
140  , line_delim('\n')
141  , array_delim(',')
142  , array_begin('{')
143  , array_end('}')
144  , threads(0)
145  , max_reject(100000)
147  , retry_count(100)
148  , retry_wait(5)
149  , batch_size(1000)
151  , lonlat(true)
155  , geo_coords_srid(4326)
157 
158  CopyParams(char d, const std::string& n, char l, size_t b, size_t retries, size_t wait)
159  : delimiter(d)
160  , null_str(n)
162  , quoted(true)
163  , quote('"')
164  , escape('"')
165  , line_delim(l)
166  , array_delim(',')
167  , array_begin('{')
168  , array_end('}')
169  , threads(0)
170  , max_reject(100000)
172  , retry_count(retries)
173  , retry_wait(wait)
174  , batch_size(b)
176  , lonlat(true)
180  , geo_coords_srid(4326)
182 };
183 
184 class TypedImportBuffer : boost::noncopyable {
185  public:
186  TypedImportBuffer(const ColumnDescriptor* col_desc, StringDictionary* string_dict)
187  : column_desc_(col_desc), string_dict_(string_dict) {
188  switch (col_desc->columnType.get_type()) {
189  case kBOOLEAN:
190  bool_buffer_ = new std::vector<int8_t>();
191  break;
192  case kTINYINT:
193  tinyint_buffer_ = new std::vector<int8_t>();
194  break;
195  case kSMALLINT:
196  smallint_buffer_ = new std::vector<int16_t>();
197  break;
198  case kINT:
199  int_buffer_ = new std::vector<int32_t>();
200  break;
201  case kBIGINT:
202  case kNUMERIC:
203  case kDECIMAL:
204  bigint_buffer_ = new std::vector<int64_t>();
205  break;
206  case kFLOAT:
207  float_buffer_ = new std::vector<float>();
208  break;
209  case kDOUBLE:
210  double_buffer_ = new std::vector<double>();
211  break;
212  case kTEXT:
213  case kVARCHAR:
214  case kCHAR:
215  string_buffer_ = new std::vector<std::string>();
216  if (col_desc->columnType.get_compression() == kENCODING_DICT) {
217  switch (col_desc->columnType.get_size()) {
218  case 1:
219  string_dict_i8_buffer_ = new std::vector<uint8_t>();
220  break;
221  case 2:
222  string_dict_i16_buffer_ = new std::vector<uint16_t>();
223  break;
224  case 4:
225  string_dict_i32_buffer_ = new std::vector<int32_t>();
226  break;
227  default:
228  CHECK(false);
229  }
230  }
231  break;
232  case kDATE:
233  case kTIME:
234  case kTIMESTAMP:
235  bigint_buffer_ = new std::vector<int64_t>();
236  break;
237  case kARRAY:
238  if (IS_STRING(col_desc->columnType.get_subtype())) {
240  string_array_buffer_ = new std::vector<std::vector<std::string>>();
241  string_array_dict_buffer_ = new std::vector<ArrayDatum>();
242  } else {
243  array_buffer_ = new std::vector<ArrayDatum>();
244  }
245  break;
246  case kPOINT:
247  case kLINESTRING:
248  case kPOLYGON:
249  case kMULTIPOLYGON:
250  geo_string_buffer_ = new std::vector<std::string>();
251  break;
252  default:
253  CHECK(false);
254  }
255  }
256 
258  switch (column_desc_->columnType.get_type()) {
259  case kBOOLEAN:
260  delete bool_buffer_;
261  break;
262  case kTINYINT:
263  delete tinyint_buffer_;
264  break;
265  case kSMALLINT:
266  delete smallint_buffer_;
267  break;
268  case kINT:
269  delete int_buffer_;
270  break;
271  case kBIGINT:
272  case kNUMERIC:
273  case kDECIMAL:
274  delete bigint_buffer_;
275  break;
276  case kFLOAT:
277  delete float_buffer_;
278  break;
279  case kDOUBLE:
280  delete double_buffer_;
281  break;
282  case kTEXT:
283  case kVARCHAR:
284  case kCHAR:
285  delete string_buffer_;
287  switch (column_desc_->columnType.get_size()) {
288  case 1:
289  delete string_dict_i8_buffer_;
290  break;
291  case 2:
293  break;
294  case 4:
296  break;
297  }
298  }
299  break;
300  case kDATE:
301  case kTIME:
302  case kTIMESTAMP:
303  delete bigint_buffer_;
304  break;
305  case kARRAY:
307  delete string_array_buffer_;
309  } else {
310  delete array_buffer_;
311  }
312  break;
313  case kPOINT:
314  case kLINESTRING:
315  case kPOLYGON:
316  case kMULTIPOLYGON:
317  delete geo_string_buffer_;
318  break;
319  default:
320  CHECK(false);
321  }
322  }
323 
324  void addBoolean(const int8_t v) { bool_buffer_->push_back(v); }
325 
326  void addTinyint(const int8_t v) { tinyint_buffer_->push_back(v); }
327 
328  void addSmallint(const int16_t v) { smallint_buffer_->push_back(v); }
329 
330  void addInt(const int32_t v) { int_buffer_->push_back(v); }
331 
332  void addBigint(const int64_t v) { bigint_buffer_->push_back(v); }
333 
334  void addFloat(const float v) { float_buffer_->push_back(v); }
335 
336  void addDouble(const double v) { double_buffer_->push_back(v); }
337 
338  void addString(const std::string& v) { string_buffer_->push_back(v); }
339 
340  void addGeoString(const std::string& v) { geo_string_buffer_->push_back(v); }
341 
342  void addArray(const ArrayDatum& v) { array_buffer_->push_back(v); }
343 
344  std::vector<std::string>& addStringArray() {
345  string_array_buffer_->push_back(std::vector<std::string>());
346  return string_array_buffer_->back();
347  }
348 
349  void addStringArray(const std::vector<std::string>& arr) {
350  string_array_buffer_->push_back(arr);
351  }
352 
353  void addDictEncodedString(const std::vector<std::string>& string_vec) {
355  for (const auto& str : string_vec) {
356  if (str.size() > StringDictionary::MAX_STRLEN) {
357  throw std::runtime_error("String too long for dictionary encoding.");
358  }
359  }
360  switch (column_desc_->columnType.get_size()) {
361  case 1:
362  string_dict_i8_buffer_->resize(string_vec.size());
363  string_dict_->getOrAddBulk(string_vec, string_dict_i8_buffer_->data());
364  break;
365  case 2:
366  string_dict_i16_buffer_->resize(string_vec.size());
367  string_dict_->getOrAddBulk(string_vec, string_dict_i16_buffer_->data());
368  break;
369  case 4:
370  string_dict_i32_buffer_->resize(string_vec.size());
371  string_dict_->getOrAddBulk(string_vec, string_dict_i32_buffer_->data());
372  break;
373  default:
374  CHECK(false);
375  }
376  }
377 
379  const std::vector<std::vector<std::string>>& string_array_vec) {
381 
382  // first check data is ok
383  for (auto& p : string_array_vec) {
384  for (const auto& str : p) {
385  if (str.size() > StringDictionary::MAX_STRLEN) {
386  throw std::runtime_error("String too long for dictionary encoding.");
387  }
388  }
389  }
390 
391  std::vector<std::vector<int32_t>> ids_array(0);
392  string_dict_->getOrAddBulkArray(string_array_vec, ids_array);
393 
394  for (auto& p : ids_array) {
395  size_t len = p.size() * sizeof(int32_t);
396  auto a = static_cast<int32_t*>(checked_malloc(len));
397  memcpy(a, &p[0], len);
398  // TODO: distinguish between empty and NULL
399  string_array_dict_buffer_->push_back(
400  ArrayDatum(len, reinterpret_cast<int8_t*>(a), len == 0));
401  }
402  }
403 
404  const SQLTypeInfo& getTypeInfo() const { return column_desc_->columnType; }
405 
406  const ColumnDescriptor* getColumnDesc() const { return column_desc_; }
407 
409 
410  int8_t* getAsBytes() const {
411  switch (column_desc_->columnType.get_type()) {
412  case kBOOLEAN:
413  return reinterpret_cast<int8_t*>(&((*bool_buffer_)[0]));
414  case kTINYINT:
415  return reinterpret_cast<int8_t*>(&((*tinyint_buffer_)[0]));
416  case kSMALLINT:
417  return reinterpret_cast<int8_t*>(&((*smallint_buffer_)[0]));
418  case kINT:
419  return reinterpret_cast<int8_t*>(&((*int_buffer_)[0]));
420  case kBIGINT:
421  case kNUMERIC:
422  case kDECIMAL:
423  return reinterpret_cast<int8_t*>(&((*bigint_buffer_)[0]));
424  case kFLOAT:
425  return reinterpret_cast<int8_t*>(&((*float_buffer_)[0]));
426  case kDOUBLE:
427  return reinterpret_cast<int8_t*>(&((*double_buffer_)[0]));
428  case kDATE:
429  case kTIME:
430  case kTIMESTAMP:
431  return reinterpret_cast<int8_t*>(&((*bigint_buffer_)[0]));
432  default:
433  abort();
434  }
435  }
436 
437  size_t getElementSize() const {
438  switch (column_desc_->columnType.get_type()) {
439  case kBOOLEAN:
440  return sizeof((*bool_buffer_)[0]);
441  case kTINYINT:
442  return sizeof((*tinyint_buffer_)[0]);
443  case kSMALLINT:
444  return sizeof((*smallint_buffer_)[0]);
445  case kINT:
446  return sizeof((*int_buffer_)[0]);
447  case kBIGINT:
448  case kNUMERIC:
449  case kDECIMAL:
450  return sizeof((*bigint_buffer_)[0]);
451  case kFLOAT:
452  return sizeof((*float_buffer_)[0]);
453  case kDOUBLE:
454  return sizeof((*double_buffer_)[0]);
455  case kDATE:
456  case kTIME:
457  case kTIMESTAMP:
458  return sizeof((*bigint_buffer_)[0]);
459  default:
460  abort();
461  }
462  }
463 
464  std::vector<std::string>* getStringBuffer() const { return string_buffer_; }
465 
466  std::vector<std::string>* getGeoStringBuffer() const { return geo_string_buffer_; }
467 
468  std::vector<ArrayDatum>* getArrayBuffer() const { return array_buffer_; }
469 
470  std::vector<std::vector<std::string>>* getStringArrayBuffer() const {
471  return string_array_buffer_;
472  }
473 
474  std::vector<ArrayDatum>* getStringArrayDictBuffer() const {
476  }
477 
478  int8_t* getStringDictBuffer() const {
479  switch (column_desc_->columnType.get_size()) {
480  case 1:
481  return reinterpret_cast<int8_t*>(&((*string_dict_i8_buffer_)[0]));
482  case 2:
483  return reinterpret_cast<int8_t*>(&((*string_dict_i16_buffer_)[0]));
484  case 4:
485  return reinterpret_cast<int8_t*>(&((*string_dict_i32_buffer_)[0]));
486  default:
487  abort();
488  }
489  }
490 
492  if (string_dict_ == nullptr) {
493  return true;
494  }
495  return string_dict_->checkpoint();
496  }
497 
498  void clear() {
499  switch (column_desc_->columnType.get_type()) {
500  case kBOOLEAN: {
501  bool_buffer_->clear();
502  break;
503  }
504  case kTINYINT: {
505  tinyint_buffer_->clear();
506  break;
507  }
508  case kSMALLINT: {
509  smallint_buffer_->clear();
510  break;
511  }
512  case kINT: {
513  int_buffer_->clear();
514  break;
515  }
516  case kBIGINT:
517  case kNUMERIC:
518  case kDECIMAL: {
519  bigint_buffer_->clear();
520  break;
521  }
522  case kFLOAT: {
523  float_buffer_->clear();
524  break;
525  }
526  case kDOUBLE: {
527  double_buffer_->clear();
528  break;
529  }
530  case kTEXT:
531  case kVARCHAR:
532  case kCHAR: {
533  string_buffer_->clear();
535  switch (column_desc_->columnType.get_size()) {
536  case 1:
537  string_dict_i8_buffer_->clear();
538  break;
539  case 2:
540  string_dict_i16_buffer_->clear();
541  break;
542  case 4:
543  string_dict_i32_buffer_->clear();
544  break;
545  default:
546  CHECK(false);
547  }
548  }
549  break;
550  }
551  case kDATE:
552  case kTIME:
553  case kTIMESTAMP:
554  bigint_buffer_->clear();
555  break;
556  case kARRAY: {
558  string_array_buffer_->clear();
559  string_array_dict_buffer_->clear();
560  } else {
561  array_buffer_->clear();
562  }
563  break;
564  }
565  case kPOINT:
566  case kLINESTRING:
567  case kPOLYGON:
568  case kMULTIPOLYGON:
569  geo_string_buffer_->clear();
570  break;
571  default:
572  CHECK(false);
573  }
574  }
575 
576  size_t add_values(const ColumnDescriptor* cd, const TColumn& data);
577 
578  size_t add_arrow_values(const ColumnDescriptor* cd,
579  const arrow::Array& data,
580  const bool exact_type_match,
581  const ArraySliceRange& slice_range,
582  BadRowsTracker* bad_rows_tracker);
583 
584  void add_value(const ColumnDescriptor* cd,
585  const std::string& val,
586  const bool is_null,
587  const CopyParams& copy_params,
588  const int64_t replicate_count = 0);
589  void add_value(const ColumnDescriptor* cd,
590  const TDatum& val,
591  const bool is_null,
592  const int64_t replicate_count = 0);
593  void pop_value();
594 
595  int64_t get_replicate_count() const { return replicate_count_; }
596  void set_replicate_count(const int64_t replicate_count) {
597  replicate_count_ = replicate_count;
598  }
599  template <typename DATA_TYPE>
601  const arrow::Array& array,
602  std::vector<DATA_TYPE>& buffer,
603  const ArraySliceRange& slice_range,
604  BadRowsTracker* const bad_rows_tracker);
605  template <typename DATA_TYPE>
606  auto del_values(std::vector<DATA_TYPE>& buffer, BadRowsTracker* const bad_rows_tracker);
607  auto del_values(const SQLTypes type, BadRowsTracker* const bad_rows_tracker);
608  std::vector<std::unique_ptr<TypedImportBuffer>>* import_buffers;
609  size_t col_idx;
610 
611  private:
612  union {
613  std::vector<int8_t>* bool_buffer_;
614  std::vector<int8_t>* tinyint_buffer_;
615  std::vector<int16_t>* smallint_buffer_;
616  std::vector<int32_t>* int_buffer_;
617  std::vector<int64_t>* bigint_buffer_;
618  std::vector<float>* float_buffer_;
619  std::vector<double>* double_buffer_;
620  std::vector<std::string>* string_buffer_;
621  std::vector<std::string>* geo_string_buffer_;
622  std::vector<ArrayDatum>* array_buffer_;
623  std::vector<std::vector<std::string>>* string_array_buffer_;
624  };
625  union {
626  std::vector<uint8_t>* string_dict_i8_buffer_;
627  std::vector<uint16_t>* string_dict_i16_buffer_;
628  std::vector<int32_t>* string_dict_i32_buffer_;
629  std::vector<ArrayDatum>* string_array_dict_buffer_;
630  };
633  size_t replicate_count_ = 0;
634 };
635 
636 class Loader {
637  public:
639  : catalog_(c)
640  , table_desc_(t)
641  , column_descs_(c.getAllColumnMetadataForTable(t->tableId, false, false, true)) {
642  init();
643  }
644 
645  virtual ~Loader() {}
646 
648  const TableDescriptor* getTableDesc() const { return table_desc_; }
649  const std::list<const ColumnDescriptor*>& get_column_descs() const {
650  return column_descs_;
651  }
652 
654  if ((cd->columnType.get_type() != kARRAY ||
655  !IS_STRING(cd->columnType.get_subtype())) &&
656  (!cd->columnType.is_string() ||
658  return nullptr;
659  }
660  return dict_map_.at(cd->columnId);
661  }
662 
663  virtual bool load(const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
664  const size_t row_count);
665  virtual bool loadNoCheckpoint(
666  const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
667  const size_t row_count);
668  virtual void checkpoint();
669  virtual int32_t getTableEpoch();
670  virtual void setTableEpoch(const int32_t new_epoch);
671 
672  void setReplicating(const bool replicating) { replicating_ = replicating; }
673  bool getReplicating() const { return replicating_; }
674 
675  protected:
676  void init();
677 
678  virtual bool loadImpl(
679  const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
680  size_t row_count,
681  bool checkpoint);
682 
683  using OneShardBuffers = std::vector<std::unique_ptr<TypedImportBuffer>>;
684  void distributeToShards(std::vector<OneShardBuffers>& all_shard_import_buffers,
685  std::vector<size_t>& all_shard_row_counts,
686  const OneShardBuffers& import_buffers,
687  const size_t row_count,
688  const size_t shard_count);
689 
692  std::list<const ColumnDescriptor*> column_descs_;
694  std::map<int, StringDictionary*> dict_map_;
695 
696  private:
697  std::vector<DataBlockPtr> get_data_block_pointers(
698  const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers);
699  bool loadToShard(const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
700  size_t row_count,
701  const TableDescriptor* shard_table,
702  bool checkpoint);
703 
704  bool replicating_ = false;
705  std::mutex loader_mutex_;
706 };
707 
708 struct ImportStatus {
709  std::chrono::steady_clock::time_point start;
710  std::chrono::steady_clock::time_point end;
714  std::chrono::duration<size_t, std::milli> elapsed;
716  int thread_id; // to recall thread_id after thread exit
718  : start(std::chrono::steady_clock::now())
719  , rows_completed(0)
720  , rows_estimated(0)
721  , rows_rejected(0)
722  , elapsed(0)
723  , load_truncated(0)
724  , thread_id(0) {}
725 
729 
730  return *this;
731  }
732 };
733 
735  public:
737  DataStreamSink(const CopyParams& copy_params, const std::string file_path)
738  : copy_params(copy_params), file_path(file_path) {}
739  virtual ~DataStreamSink() {}
740  virtual ImportStatus importDelimited(const std::string& file_path,
741  const bool decompressed) = 0;
742 #ifdef ENABLE_IMPORT_PARQUET
743  virtual void import_parquet(std::vector<std::string>& file_paths);
744  virtual void import_local_parquet(const std::string& file_path) = 0;
745 #endif
746  const CopyParams& get_copy_params() const { return copy_params; }
747  void import_compressed(std::vector<std::string>& file_paths);
748 
749  protected:
751 
753  const std::string file_path;
754  FILE* p_file = nullptr;
756  bool load_failed = false;
757  size_t total_file_size{0};
758  std::vector<size_t> file_offsets;
759  std::mutex file_offsets_mutex;
760 };
761 
762 class Detector : public DataStreamSink {
763  public:
764  Detector(const boost::filesystem::path& fp, CopyParams& cp)
765  : DataStreamSink(cp, fp.string()), file_path(fp) {
766  read_file();
767  init();
768  };
769 #ifdef ENABLE_IMPORT_PARQUET
770  void import_local_parquet(const std::string& file_path) override;
771 #endif
772  static SQLTypes detect_sqltype(const std::string& str);
773  std::vector<std::string> get_headers();
774  std::vector<std::vector<std::string>> raw_rows;
775  std::vector<std::vector<std::string>> get_sample_rows(size_t n);
776  std::vector<SQLTypes> best_sqltypes;
777  std::vector<EncodingType> best_encodings;
778  bool has_headers = false;
779 
780  private:
781  void init();
782  void read_file();
783  void detect_row_delimiter();
784  void split_raw_data();
785  std::vector<SQLTypes> detect_column_types(const std::vector<std::string>& row);
786  static bool more_restrictive_sqltype(const SQLTypes a, const SQLTypes b);
787  void find_best_sqltypes();
788  std::vector<SQLTypes> find_best_sqltypes(
789  const std::vector<std::vector<std::string>>& raw_rows,
790  const CopyParams& copy_params);
791  std::vector<SQLTypes> find_best_sqltypes(
792  const std::vector<std::vector<std::string>>::const_iterator& row_begin,
793  const std::vector<std::vector<std::string>>::const_iterator& row_end,
794  const CopyParams& copy_params);
795 
796  std::vector<EncodingType> find_best_encodings(
797  const std::vector<std::vector<std::string>>::const_iterator& row_begin,
798  const std::vector<std::vector<std::string>>::const_iterator& row_end,
799  const std::vector<SQLTypes>& best_types);
800 
801  bool detect_headers(const std::vector<SQLTypes>& first_types,
802  const std::vector<SQLTypes>& rest_types);
804  ImportStatus importDelimited(const std::string& file_path,
805  const bool decompressed) override;
806  std::string raw_data;
807  boost::filesystem::path file_path;
808  std::chrono::duration<double> timeout{1};
809  std::string line1;
810 };
811 
813  public:
814  static bool parseStringArray(const std::string& s,
815  const CopyParams& copy_params,
816  std::vector<std::string>& string_vec) {
817  if (s == copy_params.null_str || s == "NULL" || s.size() < 1 || s.empty()) {
818  // TODO: should not convert NULL, empty arrays to {"NULL"},
819  // need to support NULL, empty properly
820  string_vec.emplace_back("NULL");
821  return true;
822  }
823  if (s[0] != copy_params.array_begin || s[s.size() - 1] != copy_params.array_end) {
824  throw std::runtime_error("Malformed Array :" + s);
825  }
826  size_t last = 1;
827  for (size_t i = s.find(copy_params.array_delim, 1); i != std::string::npos;
828  i = s.find(copy_params.array_delim, last)) {
829  if (i > last) { // if not empty string - disallow empty strings for now
830  if (s.substr(last, i - last).length() > StringDictionary::MAX_STRLEN) {
831  throw std::runtime_error("Array String too long : " +
832  std::to_string(s.substr(last, i - last).length()) +
833  " max is " +
835  }
836 
837  string_vec.push_back(s.substr(last, i - last));
838  }
839  last = i + 1;
840  }
841  if (s.size() - 1 > last) { // if not empty string - disallow empty strings for now
842  if (s.substr(last, s.size() - 1 - last).length() > StringDictionary::MAX_STRLEN) {
843  throw std::runtime_error(
844  "Array String too long : " +
845  std::to_string(s.substr(last, s.size() - 1 - last).length()) + " max is " +
847  }
848 
849  string_vec.push_back(s.substr(last, s.size() - 1 - last));
850  }
851  return false;
852  }
853  static ArrayDatum composeNullArray(const SQLTypeInfo& ti);
854 };
855 
857  public:
858  RenderGroupAnalyzer() : _rtree(std::make_unique<RTree>()), _numRenderGroups(0) {}
859  void seedFromExistingTableContents(const std::unique_ptr<Loader>& loader,
860  const std::string& geoColumnBaseName);
861  int insertBoundsAndReturnRenderGroup(const std::vector<double>& bounds);
862 
863  private:
864  using Point = boost::geometry::model::point<double, 2, boost::geometry::cs::cartesian>;
865  using BoundingBox = boost::geometry::model::box<Point>;
866  using Node = std::pair<BoundingBox, int>;
867  using RTree =
868  boost::geometry::index::rtree<Node, boost::geometry::index::quadratic<16>>;
869  std::unique_ptr<RTree> _rtree;
870  std::mutex _rtreeMutex;
872 };
873 
874 class Importer : public DataStreamSink {
875  public:
877  const TableDescriptor* t,
878  const std::string& f,
879  const CopyParams& p);
880  Importer(Loader* providedLoader, const std::string& f, const CopyParams& p);
881  ~Importer() override;
882  ImportStatus import();
883  ImportStatus importDelimited(const std::string& file_path,
884  const bool decompressed) override;
885  ImportStatus importGDAL(std::map<std::string, std::string> colname_to_src);
886  static bool hasGDALLibKML();
887  const CopyParams& get_copy_params() const { return copy_params; }
888  const std::list<const ColumnDescriptor*>& get_column_descs() const {
889  return loader->get_column_descs();
890  }
891  void load(const std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
892  size_t row_count);
893  std::vector<std::vector<std::unique_ptr<TypedImportBuffer>>>& get_import_buffers_vec() {
894  return import_buffers_vec;
895  }
896  std::vector<std::unique_ptr<TypedImportBuffer>>& get_import_buffers(int i) {
897  return import_buffers_vec[i];
898  }
899  const bool* get_is_array() const { return is_array_a.get(); }
900 #ifdef ENABLE_IMPORT_PARQUET
901  void import_local_parquet(const std::string& file_path) override;
902 #endif
903  static ImportStatus get_import_status(const std::string& id);
904  static void set_import_status(const std::string& id, const ImportStatus is);
905  static const std::list<ColumnDescriptor> gdalToColumnDescriptors(
906  const std::string& fileName,
907  const std::string& geoColumnName,
908  const CopyParams& copy_params);
909  static void readMetadataSampleGDAL(
910  const std::string& fileName,
911  const std::string& geoColumnName,
912  std::map<std::string, std::vector<std::string>>& metadata,
913  int rowLimit,
914  const CopyParams& copy_params);
915  static bool gdalFileExists(const std::string& path, const CopyParams& copy_params);
916  static bool gdalFileOrDirectoryExists(const std::string& path,
917  const CopyParams& copy_params);
918  static std::vector<std::string> gdalGetAllFilesInArchive(
919  const std::string& archive_path,
920  const CopyParams& copy_params);
923  GeoFileLayerInfo(const std::string& name_, GeoFileLayerContents contents_)
924  : name(name_), contents(contents_) {}
925  std::string name;
927  };
928  static std::vector<GeoFileLayerInfo> gdalGetLayersInGeoFile(
929  const std::string& file_name,
930  const CopyParams& copy_params);
931  static bool gdalSupportsNetworkFileAccess();
932  Catalog_Namespace::Catalog& getCatalog() { return loader->getCatalog(); }
933  static void set_geo_physical_import_buffer(
934  const Catalog_Namespace::Catalog& catalog,
935  const ColumnDescriptor* cd,
936  std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
937  size_t& col_idx,
938  std::vector<double>& coords,
939  std::vector<double>& bounds,
940  std::vector<int>& ring_sizes,
941  std::vector<int>& poly_rings,
942  int render_group,
943  const int64_t replicate_count = 0);
945  const Catalog_Namespace::Catalog& catalog,
946  const ColumnDescriptor* cd,
947  std::vector<std::unique_ptr<TypedImportBuffer>>& import_buffers,
948  size_t& col_idx,
949  std::vector<std::vector<double>>& coords_column,
950  std::vector<std::vector<double>>& bounds_column,
951  std::vector<std::vector<int>>& ring_sizes_column,
952  std::vector<std::vector<int>>& poly_rings_column,
953  int render_group,
954  const int64_t replicate_count = 0);
955  void checkpoint(const int32_t start_epoch);
956  auto getLoader() const { return loader.get(); }
957 
958  private:
959  static void initGDAL();
960  static bool gdalStatInternal(const std::string& path,
961  const CopyParams& copy_params,
962  bool also_dir);
963  static OGRDataSource* openGDALDataset(const std::string& fileName,
964  const CopyParams& copy_params);
966  std::string import_id;
967  size_t file_size;
968  size_t max_threads;
969  char* buffer[2];
970  std::vector<std::vector<std::unique_ptr<TypedImportBuffer>>> import_buffers_vec;
971  std::unique_ptr<Loader> loader;
972  std::unique_ptr<bool[]> is_array_a;
973  static std::mutex init_gdal_mutex;
974 };
975 
977  public:
978  ImportDriver(std::shared_ptr<Catalog_Namespace::Catalog> cat,
981 
982  void importGeoTable(const std::string& file_path,
983  const std::string& table_name,
984  const bool compression = true,
985  const bool create_table = true);
986 };
987 
988 } // namespace Importer_NS
989 
990 #endif // _IMPORTER_H_
std::unique_ptr< Loader > loader
Definition: Importer.h:971
static std::vector< GeoFileLayerInfo > gdalGetLayersInGeoFile(const std::string &file_name, const CopyParams &copy_params)
Definition: Importer.cpp:4499
void addSmallint(const int16_t v)
Definition: Importer.h:328
static void setGDALAuthorizationTokens(const CopyParams &copy_params)
Definition: Importer.cpp:4048
void distributeToShards(std::vector< OneShardBuffers > &all_shard_import_buffers, std::vector< size_t > &all_shard_row_counts, const OneShardBuffers &import_buffers, const size_t row_count, const size_t shard_count)
Definition: Importer.cpp:2398
StringDictionary * getStringDict(const ColumnDescriptor *cd) const
Definition: Importer.h:653
static ImportStatus get_import_status(const std::string &id)
Definition: Importer.cpp:207
std::vector< int16_t > * smallint_buffer_
Definition: Importer.h:615
std::vector< std::vector< std::string > > * getStringArrayBuffer() const
Definition: Importer.h:470
std::chrono::duration< double > timeout
Definition: Importer.h:808
std::vector< EncodingType > find_best_encodings(const std::vector< std::vector< std::string >>::const_iterator &row_begin, const std::vector< std::vector< std::string >>::const_iterator &row_end, const std::vector< SQLTypes > &best_types)
Definition: Importer.cpp:3003
size_t getElementSize() const
Definition: Importer.h:437
void d(const SQLTypes expected_type, const std::string &str)
Definition: ImportTest.cpp:289
std::vector< size_t > file_offsets
Definition: Importer.h:758
void import_compressed(std::vector< std::string > &file_paths)
Definition: Importer.cpp:3506
std::vector< std::unique_ptr< TypedImportBuffer >> OneShardBuffers
Definition: Importer.h:683
void addGeoString(const std::string &v)
Definition: Importer.h:340
std::vector< int64_t > * bigint_buffer_
Definition: Importer.h:617
std::vector< EncodingType > best_encodings
Definition: Importer.h:777
class for a per-database catalog. also includes metadata for the current database and the current use...
Definition: Catalog.h:81
std::string null_str
Definition: Importer.h:100
Definition: sqltypes.h:52
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:332
ImportStatus importDelimited(const std::string &file_path, const bool decompressed) override
Definition: Importer.cpp:2700
SQLTypes
Definition: sqltypes.h:41
void addTinyint(const int8_t v)
Definition: Importer.h:326
size_t add_arrow_values(const ColumnDescriptor *cd, const arrow::Array &data, const bool exact_type_match, const ArraySliceRange &slice_range, BadRowsTracker *bad_rows_tracker)
Definition: Importer.cpp:962
EncodingType
Definition: encodetypes.h:22
virtual void checkpoint()
Definition: Importer.cpp:3961
ExecutorDeviceType
std::vector< std::string > * getGeoStringBuffer() const
Definition: Importer.h:466
static SQLTypes detect_sqltype(const std::string &str)
Definition: Importer.cpp:2820
std::vector< float > * float_buffer_
Definition: Importer.h:618
std::mutex loader_mutex_
Definition: Importer.h:705
StringDictionary * getStringDictionary() const
Definition: Importer.h:408
Catalog_Namespace::Catalog & getCatalog()
Definition: Importer.h:647
size_t convert_arrow_val_to_import_buffer(const ColumnDescriptor *cd, const arrow::Array &array, std::vector< DATA_TYPE > &buffer, const ArraySliceRange &slice_range, BadRowsTracker *const bad_rows_tracker)
int insertBoundsAndReturnRenderGroup(const std::vector< double > &bounds)
Definition: Importer.cpp:4926
TypedImportBuffer(const ColumnDescriptor *col_desc, StringDictionary *string_dict)
Definition: Importer.h:186
const CopyParams & get_copy_params() const
Definition: Importer.h:887
static constexpr size_t kImportFileBufferSize
Definition: Importer.h:64
ImportStatus importGDAL(std::map< std::string, std::string > colname_to_src)
Definition: Importer.cpp:4568
Fragmenter_Namespace::InsertData insert_data_
Definition: Importer.h:693
std::chrono::steady_clock::time_point end
Definition: Importer.h:710
~Importer() override
Definition: Importer.cpp:195
void c(const std::string &query_string, const ExecutorDeviceType device_type)
const std::list< const ColumnDescriptor * > & get_column_descs() const
Definition: Importer.h:649
SQLTypes geo_coords_type
Definition: Importer.h:128
std::vector< ArrayDatum > * getStringArrayDictBuffer() const
Definition: Importer.h:474
std::vector< std::vector< std::unique_ptr< TypedImportBuffer > > > & get_import_buffers_vec()
Definition: Importer.h:893
static ArrayDatum composeNullArray(const SQLTypeInfo &ti)
Definition: Importer.cpp:525
HOST DEVICE int get_size() const
Definition: sqltypes.h:334
std::map< int, StringDictionary * > dict_map_
Definition: Importer.h:694
virtual bool load(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, const size_t row_count)
Definition: Importer.cpp:2344
std::unique_ptr< bool[]> is_array_a
Definition: Importer.h:972
Catalog_Namespace::Catalog & catalog_
Definition: Importer.h:690
int8_t * getAsBytes() const
Definition: Importer.h:410
boost::geometry::model::point< double, 2, boost::geometry::cs::cartesian > Point
Definition: Importer.h:864
Detector(const boost::filesystem::path &fp, CopyParams &cp)
Definition: Importer.h:764
void addInt(const int32_t v)
Definition: Importer.h:330
const SQLTypeInfo & getTypeInfo() const
Definition: Importer.h:404
void addDictEncodedString(const std::vector< std::string > &string_vec)
Definition: Importer.h:353
std::vector< uint16_t > * string_dict_i16_buffer_
Definition: Importer.h:627
std::chrono::steady_clock::time_point start
Definition: Importer.h:709
virtual bool loadImpl(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t row_count, bool checkpoint)
Definition: Importer.cpp:2546
std::string to_string(char const *&&v)
std::string line1
Definition: Importer.h:809
EncodingType geo_coords_encoding
Definition: Importer.h:126
static bool gdalFileOrDirectoryExists(const std::string &path, const CopyParams &copy_params)
Definition: Importer.cpp:4402
static const std::list< ColumnDescriptor > gdalToColumnDescriptors(const std::string &fileName, const std::string &geoColumnName, const CopyParams &copy_params)
Definition: Importer.cpp:4295
void set_replicate_count(const int64_t replicate_count)
Definition: Importer.h:596
boost::geometry::index::rtree< Node, boost::geometry::index::quadratic< 16 >> RTree
Definition: Importer.h:868
void addBoolean(const int8_t v)
Definition: Importer.h:324
void setReplicating(const bool replicating)
Definition: Importer.h:672
virtual ImportStatus importDelimited(const std::string &file_path, const bool decompressed)=0
virtual void setTableEpoch(const int32_t new_epoch)
Definition: Importer.cpp:3973
std::chrono::duration< size_t, std::milli > elapsed
Definition: Importer.h:714
std::vector< int8_t > * tinyint_buffer_
Definition: Importer.h:614
Catalog_Namespace::Catalog & getCatalog()
Definition: Importer.h:932
virtual ~Loader()
Definition: Importer.h:645
std::vector< std::vector< std::string > > get_sample_rows(size_t n)
Definition: Importer.cpp:3054
CopyParams(char d, const std::string &n, char l, size_t b, size_t retries, size_t wait)
Definition: Importer.h:158
void addFloat(const float v)
Definition: Importer.h:334
void seedFromExistingTableContents(const std::unique_ptr< Loader > &loader, const std::string &geoColumnBaseName)
Definition: Importer.cpp:4802
std::vector< std::vector< std::unique_ptr< TypedImportBuffer > > > import_buffers_vec
Definition: Importer.h:970
T v(const TargetValue &r)
std::vector< std::string > * string_buffer_
Definition: Importer.h:620
boost::geometry::model::box< Point > BoundingBox
Definition: Importer.h:865
std::string s3_access_key
Definition: Importer.h:115
StringDictionary * string_dict_
Definition: Importer.h:632
static bool hasGDALLibKML()
Definition: Importer.cpp:4043
std::vector< int32_t > * string_dict_i32_buffer_
Definition: Importer.h:628
std::vector< std::vector< std::string > > * string_array_buffer_
Definition: Importer.h:623
static bool gdalSupportsNetworkFileAccess()
Definition: Importer.cpp:4560
void importGeoTable(const std::string &file_path, const std::string &table_name, const bool compression=true, const bool create_table=true)
Definition: Importer.cpp:4977
int64_t get_replicate_count() const
Definition: Importer.h:595
std::vector< int32_t > * int_buffer_
Definition: Importer.h:616
void * checked_malloc(const size_t size)
Definition: checked_alloc.h:40
std::string import_id
Definition: Importer.h:966
const CopyParams & get_copy_params() const
Definition: Importer.h:746
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:324
ImportStatus archivePlumber()
Definition: Importer.cpp:3110
Loader(Catalog_Namespace::Catalog &c, const TableDescriptor *t)
Definition: Importer.h:638
auto getLoader() const
Definition: Importer.h:956
void getOrAddBulk(const std::vector< std::string > &string_vec, T *encoded_vec)
std::pair< BoundingBox, int > Node
Definition: Importer.h:866
std::string raw_data
Definition: Importer.h:806
ImportStatus & operator+=(const ImportStatus &is)
Definition: Importer.h:726
std::atomic< int > nerrors
Definition: Importer.h:81
ImportHeaderRow has_header
Definition: Importer.h:101
specifies the content in-memory of a row in the column metadata table
bool g_enable_smem_group_by true
void addBigint(const int64_t v)
Definition: Importer.h:332
std::vector< std::string > get_headers()
Definition: Importer.cpp:3062
void add_value(const ColumnDescriptor *cd, const std::string &val, const bool is_null, const CopyParams &copy_params, const int64_t replicate_count=0)
Definition: Importer.cpp:618
boost::filesystem::path file_path
Definition: Importer.h:807
std::vector< int8_t > * bool_buffer_
Definition: Importer.h:613
std::list< const ColumnDescriptor * > column_descs_
Definition: Importer.h:692
static void set_geo_physical_import_buffer_columnar(const Catalog_Namespace::Catalog &catalog, const ColumnDescriptor *cd, std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t &col_idx, std::vector< std::vector< double >> &coords_column, std::vector< std::vector< double >> &bounds_column, std::vector< std::vector< int >> &ring_sizes_column, std::vector< std::vector< int >> &poly_rings_column, int render_group, const int64_t replicate_count=0)
Definition: Importer.cpp:1668
static void set_geo_physical_import_buffer(const Catalog_Namespace::Catalog &catalog, const ColumnDescriptor *cd, std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t &col_idx, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, int render_group, const int64_t replicate_count=0)
Definition: Importer.cpp:1584
virtual int32_t getTableEpoch()
Definition: Importer.cpp:3968
static OGRDataSource * openGDALDataset(const std::string &fileName, const CopyParams &copy_params)
Definition: Importer.cpp:4118
static bool parseStringArray(const std::string &s, const CopyParams &copy_params, std::vector< std::string > &string_vec)
Definition: Importer.h:814
std::string s3_endpoint
Definition: Importer.h:118
std::vector< std::unique_ptr< TypedImportBuffer > > * import_buffers
Definition: Importer.h:608
void getOrAddBulkArray(const std::vector< std::vector< std::string >> &string_array_vec, std::vector< std::vector< int32_t >> &ids_array_vec)
std::mutex file_offsets_mutex
Definition: Importer.h:759
std::vector< SQLTypes > detect_column_types(const std::vector< std::string > &row)
Definition: Importer.cpp:2897
bool checkpoint() noexcept
const bool * get_is_array() const
Definition: Importer.h:899
Definition: sqltypes.h:55
Definition: sqltypes.h:56
static bool gdalFileExists(const std::string &path, const CopyParams &copy_params)
Definition: Importer.cpp:4397
static bool more_restrictive_sqltype(const SQLTypes a, const SQLTypes b)
Definition: Importer.cpp:2905
const std::list< const ColumnDescriptor * > & get_column_descs() const
Definition: Importer.h:888
const TableDescriptor * table_desc_
Definition: Importer.h:691
bool is_null(const T &v, const SQLTypeInfo &t)
void addStringArray(const std::vector< std::string > &arr)
Definition: Importer.h:349
std::vector< std::string > * geo_string_buffer_
Definition: Importer.h:621
void addArray(const ArrayDatum &v)
Definition: Importer.h:342
std::string s3_region
Definition: Importer.h:117
void find_best_sqltypes_and_headers()
Definition: Importer.cpp:2927
DataStreamSink(const CopyParams &copy_params, const std::string file_path)
Definition: Importer.h:737
std::vector< std::vector< std::string > > raw_rows
Definition: Importer.h:774
std::vector< ArrayDatum > * string_array_dict_buffer_
Definition: Importer.h:629
std::vector< double > * double_buffer_
Definition: Importer.h:619
bool getReplicating() const
Definition: Importer.h:673
void checkpoint(const int32_t start_epoch)
Definition: Importer.cpp:3081
static bool gdalStatInternal(const std::string &path, const CopyParams &copy_params, bool also_dir)
Definition: Importer.cpp:4364
static void readMetadataSampleGDAL(const std::string &fileName, const std::string &geoColumnName, std::map< std::string, std::vector< std::string >> &metadata, int rowLimit, const CopyParams &copy_params)
Definition: Importer.cpp:4174
std::string geo_layer_name
Definition: Importer.h:131
ImportStatus import_status
Definition: Importer.h:755
Definition: sqltypes.h:44
const ColumnDescriptor * column_desc_
Definition: Importer.h:631
const TableDescriptor * getTableDesc() const
Definition: Importer.h:648
auto del_values(std::vector< DATA_TYPE > &buffer, BadRowsTracker *const bad_rows_tracker)
#define IS_STRING(T)
Definition: sqltypes.h:164
bool detect_headers(const std::vector< SQLTypes > &first_types, const std::vector< SQLTypes > &rest_types)
Definition: Importer.cpp:3039
std::vector< DataBlockPtr > get_data_block_pointers(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers)
Definition: Importer.cpp:2574
void load(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t row_count)
Definition: Importer.cpp:3074
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:325
void addString(const std::string &v)
Definition: Importer.h:338
std::vector< std::string > * getStringBuffer() const
Definition: Importer.h:464
bool is_string() const
Definition: sqltypes.h:472
bool g_enable_debug_timer false
Definition: Execute.cpp:68
void addDictEncodedStringArray(const std::vector< std::vector< std::string >> &string_array_vec)
Definition: Importer.h:378
#define CHECK(condition)
Definition: Logger.h:187
Importer(Catalog_Namespace::Catalog &c, const TableDescriptor *t, const std::string &f, const CopyParams &p)
Definition: Importer.cpp:148
static void initGDAL()
Definition: Importer.cpp:4001
ImportDriver(std::shared_ptr< Catalog_Namespace::Catalog > cat, const Catalog_Namespace::UserMetadata &user, const ExecutorDeviceType dt=ExecutorDeviceType::GPU)
Definition: Importer.cpp:4972
std::vector< ArrayDatum > * getArrayBuffer() const
Definition: Importer.h:468
The data to be inserted using the fragment manager.
Definition: Fragmenter.h:59
std::vector< std::unique_ptr< TypedImportBuffer > > & get_import_buffers(int i)
Definition: Importer.h:896
static constexpr size_t MAX_STRLEN
Definition: sqltypes.h:48
SQLTypeInfo columnType
std::vector< uint8_t > * string_dict_i8_buffer_
Definition: Importer.h:626
std::set< int64_t > rows
Definition: Importer.h:80
static std::mutex init_gdal_mutex
Definition: Importer.h:973
specifies the content in-memory of a row in the table metadata table
std::string s3_secret_key
Definition: Importer.h:116
size_t add_values(const ColumnDescriptor *cd, const TColumn &data)
Definition: Importer.cpp:1062
static void set_import_status(const std::string &id, const ImportStatus is)
Definition: Importer.cpp:212
const ColumnDescriptor * getColumnDesc() const
Definition: Importer.h:406
std::vector< std::string > & addStringArray()
Definition: Importer.h:344
std::pair< size_t, size_t > ArraySliceRange
Definition: Importer.h:76
GeoFileLayerInfo(const std::string &name_, GeoFileLayerContents contents_)
Definition: Importer.h:923
virtual bool loadNoCheckpoint(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, const size_t row_count)
Definition: Importer.cpp:2338
std::unique_ptr< RTree > _rtree
Definition: Importer.h:869
const std::string file_path
Definition: Importer.h:753
int32_t geo_coords_comp_param
Definition: Importer.h:127
std::conditional_t< isCudaCC(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:120
bool loadToShard(const std::vector< std::unique_ptr< TypedImportBuffer >> &import_buffers, size_t row_count, const TableDescriptor *shard_table, bool checkpoint)
Definition: Importer.cpp:2635
std::vector< ArrayDatum > * array_buffer_
Definition: Importer.h:622
ImportStatus importDelimited(const std::string &file_path, const bool decompressed) override
Definition: Importer.cpp:3756
int8_t * getStringDictBuffer() const
Definition: Importer.h:478
std::vector< SQLTypes > best_sqltypes
Definition: Importer.h:776
void addDouble(const double v)
Definition: Importer.h:336
static std::vector< std::string > gdalGetAllFilesInArchive(const std::string &archive_path, const CopyParams &copy_params)
Definition: Importer.cpp:4474