OmniSciDB  2e3a973ef4
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvDataWrapper.cpp}
 
 anonymous_namespace{CsvReader.cpp}
 
 anonymous_namespace{CsvReaderS3.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{LazyParquetImporter.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 csv_file_buffer_parser
 
 json_utils
 

Classes

struct  AllowedParquetMetadataTypeMappings
 
class  ArchiveWrapper
 
class  ArrayMetadataStats
 
class  CachingForeignStorageMgr
 
struct  ColumnType
 
class  CompressedFileReader
 
class  CsvDataWrapper
 
class  CsvReader
 
class  CsvReaderS3
 
struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
struct  FileRegion
 
class  ForeignDataWrapper
 
struct  ForeignServer
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageMgr
 
struct  ForeignTable
 
class  ForeignTableRefreshScheduler
 
class  ForeignTableSchema
 
struct  FragmentType
 
struct  Interval
 
class  LazyParquetChunkLoader
 
class  LazyParquetImporter
 
class  LocalMultiFileReader
 
struct  MetadataScanMultiThreadingParams
 
class  MockForeignDataWrapper
 
class  MultiFileReader
 
class  MultiS3Reader
 
struct  OptionsContainer
 
class  ParquetArrayEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetInPlaceEncoder
 
struct  ParquetLoaderMetadata
 
class  ParquetS3FileSystem
 
class  ParquetScalarEncoder
 
class  ParquetStringEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetVariableLengthArrayEncoder
 
struct  ParseFileRegionResult
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  SingleFileReader
 
struct  TableEvictionTracker
 
class  TypedParquetInPlaceEncoder
 

Typedefs

using FileRegions = std::vector< FileRegion >
 
using read_lock = mapd_shared_lock< mapd_shared_mutex >
 
using write_lock = mapd_unique_lock< mapd_shared_mutex >
 

Functions

int64_t get_interval_duration (const std::string &interval)
 
int64_t get_next_refresh_time (const std::map< std::string, std::string, std::less<>> &foreign_table_options)
 
ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, CsvReader &csv_reader, std::mutex &file_access_mutex, csv_file_buffer_parser::ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
void initialize_import_buffers (const std::list< const ColumnDescriptor *> &columns, std::shared_ptr< Catalog_Namespace::Catalog > catalog, std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, const std::set< int > &column_filter_set={})
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional< csv_file_buffer_parser::ParseBufferRequest > get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, std::mutex &file_region_mutex, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result)
 
size_t get_var_length_data_block_size (DataBlockPtr data_block, SQLTypeInfo sql_type_info)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void update_metadata (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, const csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor *> &column_by_id)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map, std::mutex &file_region_mutex)
 
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, CsvReader &csv_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
void throw_removed_row_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
std::set< ChunkKey > get_keys_set_from_table (const ChunkKey &destination_chunk_key)
 
std::vector< ChunkKey > get_keys_vec_from_table (const ChunkKey &destination_chunk_key)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
int64_t get_time_conversion_denominator (const parquet::LogicalType::TimeUnit::unit time_unit)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
void open_parquet_table (const std::string &file_path, std::unique_ptr< parquet::arrow::FileReader > &reader, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const std::unique_ptr< parquet::arrow::FileReader > &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (std::unique_ptr< parquet::arrow::FileReader > &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Typedef Documentation

◆ FileRegions

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 53 of file CsvDataWrapper.h.

◆ read_lock

◆ write_lock

Function Documentation

◆ add_file_region()

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
std::mutex &  file_region_mutex,
int  fragment_id,
size_t  first_row_index,
const csv_file_buffer_parser::ParseBufferResult &  result 
)

Creates a new file region based on metadata from parsed CSV file buffers and adds the new region to the fragment id to file regions map.

Definition at line 545 of file CsvDataWrapper.cpp.

References foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, foreign_storage::FileRegion::row_count, foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_offsets.

Referenced by scan_metadata().

549  {
550  FileRegion file_region;
551  file_region.first_row_file_offset = result.row_offsets.front();
552  file_region.region_size = result.row_offsets.back() - file_region.first_row_file_offset;
553  file_region.first_row_index = first_row_index;
554  file_region.row_count = result.row_count;
555 
556  {
557  std::lock_guard<std::mutex> lock(file_region_mutex);
558  fragment_id_to_file_regions_map[fragment_id].emplace_back(file_region);
559  }
560 }
+ Here is the caller graph for this function:

◆ add_request_to_pool()

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 654 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by scan_metadata().

655  {
656  std::unique_lock<std::mutex> completed_requests_queue_lock(
657  multi_threading_params.request_pool_mutex);
658  multi_threading_params.request_pool.emplace(std::move(request));
659  completed_requests_queue_lock.unlock();
660  multi_threading_params.request_pool_condition.notify_all();
661 }
+ Here is the caller graph for this function:

◆ dispatch_metadata_scan_request()

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 727 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

729  {
730  std::unique_lock<std::mutex> pending_requests_lock(
731  multi_threading_params.pending_requests_mutex);
732  multi_threading_params.pending_requests.emplace(std::move(request));
733  pending_requests_lock.unlock();
734  multi_threading_params.pending_requests_condition.notify_all();
735 }
+ Here is the caller graph for this function:

◆ dispatch_metadata_scan_requests()

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
CsvReader &  csv_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset 
)

Reads from a CSV file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 755 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), import_export::delimited_parser::find_row_end_pos(), get_request_from_pool(), foreign_storage::CsvReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::CsvReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

762  {
763  auto alloc_size = buffer_size;
764  auto residual_buffer = std::make_unique<char[]>(alloc_size);
765  size_t residual_buffer_size = 0;
766  size_t residual_buffer_alloc_size = alloc_size;
767 
768  while (!csv_reader.isScanFinished()) {
769  auto request = get_request_from_pool(multi_threading_params);
770  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
771 
772  if (residual_buffer_size > 0) {
773  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
774  }
775  size_t size = residual_buffer_size;
776  size += csv_reader.read(request.buffer.get() + residual_buffer_size,
777  alloc_size - residual_buffer_size);
778 
779  if (size == 0) {
780  // In some cases at the end of a file we will read 0 bytes even when
781  // csv_reader.isScanFinished() is false
782  continue;
783  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
784  // In some cases files with newlines at the end will be encoded with a second
785  // newline that can end up being the only thing in the buffer
786  current_file_offset++;
787  continue;
788  }
789  unsigned int num_rows_in_buffer = 0;
790  request.end_pos =
791  import_export::delimited_parser::find_row_end_pos(alloc_size,
792  request.buffer,
793  size,
794  copy_params,
795  first_row_index_in_buffer,
796  num_rows_in_buffer,
797  nullptr,
798  &csv_reader);
799  request.buffer_size = size;
800  request.buffer_alloc_size = alloc_size;
801  request.first_row_index = first_row_index_in_buffer;
802  request.file_offset = current_file_offset;
803  request.buffer_row_count = num_rows_in_buffer;
804 
805  residual_buffer_size = size - request.end_pos;
806  if (residual_buffer_size > 0) {
807  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
808  memcpy(residual_buffer.get(),
809  request.buffer.get() + request.end_pos,
810  residual_buffer_size);
811  }
812 
813  current_file_offset += request.end_pos;
814  first_row_index_in_buffer += num_rows_in_buffer;
815 
816  dispatch_metadata_scan_request(multi_threading_params, request);
817  }
818 
819  std::unique_lock<std::mutex> pending_requests_queue_lock(
820  multi_threading_params.pending_requests_mutex);
821  multi_threading_params.pending_requests_condition.wait(
822  pending_requests_queue_lock, [&multi_threading_params] {
823  return multi_threading_params.pending_requests.empty();
824  });
825  multi_threading_params.continue_processing = false;
826  pending_requests_queue_lock.unlock();
827  multi_threading_params.pending_requests_condition.notify_all();
828 }
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
size_t find_row_end_pos(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FILE *file, foreign_storage::CsvReader *csv_reader)
Finds the closest possible row ending to the end of the given buffer. The buffer is resized as needed...
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ encode_as_array_datum()

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 38 of file ParquetGeospatialEncoder.h.

Referenced by foreign_storage::ParquetGeospatialEncoder::processGeoElement(), and foreign_storage::ParquetGeospatialEncoder::processNullGeoElement().

38  {
39  const size_t num_bytes = data.size() * sizeof(T);
40  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
41  memcpy(buffer.get(), data.data(), num_bytes);
42  return ArrayDatum(num_bytes, buffer, false);
43 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:131
+ Here is the caller graph for this function:

◆ get_buffer_size() [1/2]

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing CSV file(s).

Definition at line 320 of file CsvDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

322  {
323  size_t buffer_size = copy_params.buffer_size;
324  if (size_known && file_size < buffer_size) {
325  buffer_size = file_size + 1; // +1 for end of line character, if missing
326  }
327  return buffer_size;
328 }
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31
+ Here is the caller graph for this function:

◆ get_buffer_size() [2/2]

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 330 of file CsvDataWrapper.cpp.

References CHECK.

330  {
331  size_t buffer_size = 0;
332  for (const auto& file_region : file_regions) {
333  buffer_size = std::max(buffer_size, file_region.region_size);
334  }
335  CHECK(buffer_size);
336  return buffer_size;
337 }
#define CHECK(condition)
Definition: Logger.h:197

◆ get_column_descriptor()

const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 45 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), foreign_storage::ParquetDataWrapper::populateChunkBuffers(), and foreign_storage::anonymous_namespace{LazyParquetImporter.cpp}::validate_equal_schema().

47  {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }
+ Here is the caller graph for this function:

◆ get_interval_duration()

int64_t foreign_storage::get_interval_duration ( const std::string &  interval)
inline

Gets the interval duration in seconds.

Parameters
interval - interval string with format of {interval_count}{interval_type} (e.g. 5H for "every 5 hours")
Returns
interval duration in seconds

Definition at line 16 of file RefreshTimeCalculator.h.

References UNREACHABLE.

Referenced by get_next_refresh_time().

16  {
17  int interval_count = std::stoi(interval.substr(0, interval.length() - 1));
18  auto interval_type = std::tolower(interval[interval.length() - 1]);
19  int64_t duration{0};
20  if (interval_type == 's') {
21  duration = interval_count;
22  } else if (interval_type == 'h') {
23  duration = interval_count * 60 * 60;
24  } else if (interval_type == 'd') {
25  duration = interval_count * 60 * 60 * 24;
26  } else {
27  UNREACHABLE();
28  }
29  return duration;
30 }
#define UNREACHABLE()
Definition: Logger.h:241
+ Here is the caller graph for this function:

◆ get_keys_set_from_table()

std::set< ChunkKey > foreign_storage::get_keys_set_from_table ( const ChunkKey &  destination_chunk_key)

Definition at line 262 of file ForeignStorageMgr.cpp.

References Catalog_Namespace::Catalog::checkedGet(), CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnId, and foreign_storage::ForeignTableSchema::getLogicalColumn().

Referenced by foreign_storage::ForeignStorageMgr::fetchBuffer().

262  {
263  std::set<ChunkKey> chunk_keys;
264  auto db_id = destination_chunk_key[CHUNK_KEY_DB_IDX];
265  auto table_id = destination_chunk_key[CHUNK_KEY_TABLE_IDX];
266  auto destination_column_id = destination_chunk_key[CHUNK_KEY_COLUMN_IDX];
267  auto fragment_id = destination_chunk_key[CHUNK_KEY_FRAGMENT_IDX];
268  auto foreign_table =
269  Catalog_Namespace::Catalog::checkedGet(db_id)->getForeignTableUnlocked(table_id);
270 
271  ForeignTableSchema schema{db_id, foreign_table};
272  auto logical_column = schema.getLogicalColumn(destination_column_id);
273  auto logical_column_id = logical_column->columnId;
274 
275  for (auto column_id = logical_column_id;
276  column_id <= (logical_column_id + logical_column->columnType.get_physical_cols());
277  column_id++) {
278  auto column = schema.getColumnDescriptor(column_id);
279  if (column->columnType.is_varlen_indeed()) {
280  ChunkKey data_chunk_key = {db_id, table_id, column->columnId, fragment_id, 1};
281  chunk_keys.emplace(data_chunk_key);
282 
283  ChunkKey index_chunk_key{db_id, table_id, column->columnId, fragment_id, 2};
284  chunk_keys.emplace(index_chunk_key);
285  } else {
286  ChunkKey data_chunk_key = {db_id, table_id, column->columnId, fragment_id};
287  chunk_keys.emplace(data_chunk_key);
288  }
289  }
290  return chunk_keys;
291 }
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
static std::shared_ptr< Catalog > checkedGet(const int32_t db_id)
Definition: Catalog.cpp:3770
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41
std::vector< int > ChunkKey
Definition: types.h:37
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ get_keys_vec_from_table()

std::vector< ChunkKey > foreign_storage::get_keys_vec_from_table ( const ChunkKey &  destination_chunk_key)

Definition at line 293 of file ForeignStorageMgr.cpp.

References Catalog_Namespace::Catalog::checkedGet(), CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnId, and foreign_storage::ForeignTableSchema::getLogicalColumn().

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

293  {
294  std::vector<ChunkKey> chunk_keys;
295  auto db_id = destination_chunk_key[CHUNK_KEY_DB_IDX];
296  auto table_id = destination_chunk_key[CHUNK_KEY_TABLE_IDX];
297  auto destination_column_id = destination_chunk_key[CHUNK_KEY_COLUMN_IDX];
298  auto fragment_id = destination_chunk_key[CHUNK_KEY_FRAGMENT_IDX];
299  auto foreign_table =
300  Catalog_Namespace::Catalog::checkedGet(db_id)->getForeignTableUnlocked(table_id);
301 
302  ForeignTableSchema schema{db_id, foreign_table};
303  auto logical_column = schema.getLogicalColumn(destination_column_id);
304  auto logical_column_id = logical_column->columnId;
305 
306  for (auto column_id = logical_column_id;
307  column_id <= (logical_column_id + logical_column->columnType.get_physical_cols());
308  column_id++) {
309  auto column = schema.getColumnDescriptor(column_id);
310  if (column->columnType.is_varlen_indeed()) {
311  ChunkKey data_chunk_key = {db_id, table_id, column->columnId, fragment_id, 1};
312  chunk_keys.emplace_back(data_chunk_key);
313 
314  ChunkKey index_chunk_key{db_id, table_id, column->columnId, fragment_id, 2};
315  chunk_keys.emplace_back(index_chunk_key);
316  } else {
317  ChunkKey data_chunk_key = {db_id, table_id, column->columnId, fragment_id};
318  chunk_keys.emplace_back(data_chunk_key);
319  }
320  }
321  return chunk_keys;
322 }
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
static std::shared_ptr< Catalog > checkedGet(const int32_t db_id)
Definition: Catalog.cpp:3770
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41
std::vector< int > ChunkKey
Definition: types.h:37
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ get_next_metadata_scan_request()

std::optional<csv_file_buffer_parser::ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. A null optional is returned if there are no further requests to be processed.

Definition at line 522 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

523  {
524  std::unique_lock<std::mutex> pending_requests_lock(
525  multi_threading_params.pending_requests_mutex);
526  multi_threading_params.pending_requests_condition.wait(
527  pending_requests_lock, [&multi_threading_params] {
528  return !multi_threading_params.pending_requests.empty() ||
529  !multi_threading_params.continue_processing;
530  });
531  if (multi_threading_params.pending_requests.empty()) {
532  return {};
533  }
534  auto request = std::move(multi_threading_params.pending_requests.front());
535  multi_threading_params.pending_requests.pop();
536  pending_requests_lock.unlock();
537  multi_threading_params.pending_requests_condition.notify_all();
538  return std::move(request);
539 }
+ Here is the caller graph for this function:

◆ get_next_refresh_time()

int64_t foreign_storage::get_next_refresh_time ( const std::map< std::string, std::string, std::less<>> &  foreign_table_options)
inline

Definition at line 33 of file RefreshTimeCalculator.h.

References CHECK, dateTimeParse< kTIMESTAMP >(), get_interval_duration(), foreign_storage::ForeignTable::NULL_REFRESH_TIME, foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY, and foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY.

Referenced by Catalog_Namespace::anonymous_namespace{Catalog.cpp}::get_next_refresh_time().

34  {
35  int64_t current_time = std::chrono::duration_cast<std::chrono::seconds>(
36  std::chrono::system_clock::now().time_since_epoch())
37  .count();
38  auto start_date_entry = foreign_table_options.find(
39  foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY);
40  CHECK(start_date_entry != foreign_table_options.end());
41  auto start_date_time = dateTimeParse<kTIMESTAMP>(start_date_entry->second, 0);
42 
43  // If start date time is current or in the future, then that is the next refresh time
44  if (start_date_time >= current_time) {
45  return start_date_time;
46  }
47  auto interval_entry =
48  foreign_table_options.find(foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY);
49  if (interval_entry != foreign_table_options.end()) {
50  auto interval_duration = get_interval_duration(interval_entry->second);
51  auto num_intervals =
52  (current_time - start_date_time + interval_duration - 1) / interval_duration;
53  return start_date_time + (num_intervals * interval_duration);
54  } else {
55  // If this was a one time refresh, then there is no next refresh time
56  return foreign_storage::ForeignTable::NULL_REFRESH_TIME;
57  }
58 }
int64_t get_interval_duration(const std::string &interval)
int64_t dateTimeParse< kTIMESTAMP >(std::string_view str, unsigned const dim)
static constexpr const char * REFRESH_START_DATE_TIME_KEY
Definition: ForeignTable.h:30
static constexpr const char * REFRESH_INTERVAL_KEY
Definition: ForeignTable.h:31
#define CHECK(condition)
Definition: Logger.h:197
static constexpr int NULL_REFRESH_TIME
Definition: ForeignTable.h:37
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ get_null_value()

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 48 of file ParquetInPlaceEncoder.h.

48  {
49  return inline_int_null_value<V>();
50 }

◆ get_parquet_table_size()

std::pair< int, int > foreign_storage::get_parquet_table_size ( const std::unique_ptr< parquet::arrow::FileReader > &  reader)

Definition at line 37 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), and foreign_storage::LazyParquetImporter::metadataScan().

38  {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }
+ Here is the caller graph for this function:

◆ get_physical_type()

parquet::Type::type foreign_storage::get_physical_type ( std::unique_ptr< parquet::arrow::FileReader > &  reader,
const int  logical_column_index 
)

Definition at line 51 of file ParquetShared.cpp.

52  {
53  return reader->parquet_reader()
54  ->metadata()
55  ->schema()
56  ->Column(logical_column_index)
57  ->physical_type();
58 }

◆ get_request_from_pool()

csv_file_buffer_parser::ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool.

Definition at line 709 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

710  {
711  std::unique_lock<std::mutex> request_pool_lock(
712  multi_threading_params.request_pool_mutex);
713  multi_threading_params.request_pool_condition.wait(
714  request_pool_lock,
715  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
716  auto request = std::move(multi_threading_params.request_pool.front());
717  multi_threading_params.request_pool.pop();
718  request_pool_lock.unlock();
719  CHECK(request.buffer);
720  return request;
721 }
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the caller graph for this function:

◆ get_sub_type_column_descriptor()

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor *  column)

Definition at line 76 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), foreign_storage::anonymous_namespace{LazyParquetImporter.cpp}::read_parquet_metadata_into_import_buffer(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

77  {
78  auto column_type = column->columnType.get_elem_type();
79  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
80  column_type.set_size(4); // override default size of -1
81  }
82  return std::make_unique<ColumnDescriptor>(
83  column->tableId, column->columnId, column->columnName, column_type);
84 }
void set_size(int s)
Definition: sqltypes.h:357
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:624
SQLTypeInfo columnType
std::string columnName
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ get_thread_count() [1/2]

size_t foreign_storage::get_thread_count ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 343 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

346  {
347  size_t thread_count = copy_params.threads;
348  if (thread_count == 0) {
349  thread_count = std::thread::hardware_concurrency();
350  }
351  if (size_known) {
352  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
353  if (num_buffers_in_file < thread_count) {
354  thread_count = num_buffers_in_file;
355  }
356  }
357  CHECK(thread_count);
358  return thread_count;
359 }
#define CHECK(condition)
Definition: Logger.h:197
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31
+ Here is the caller graph for this function:

◆ get_thread_count() [2/2]

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions file_regions 
)

Definition at line 361 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

362  {
363  size_t thread_count = copy_params.threads;
364  if (thread_count == 0) {
365  thread_count =
366  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
367  }
368  CHECK(thread_count);
369  return thread_count;
370 }
#define CHECK(condition)
Definition: Logger.h:197

◆ get_time_conversion_denominator()

int64_t foreign_storage::get_time_conversion_denominator ( const parquet::LogicalType::TimeUnit::unit  time_unit)
inline

Definition at line 28 of file ParquetInPlaceEncoder.h.

References UNREACHABLE.

Referenced by foreign_storage::ParquetTimeEncoder< V, T >::ParquetTimeEncoder(), and foreign_storage::ParquetTimestampEncoder< V, T >::ParquetTimestampEncoder().

29  {
30  int64_t conversion_denominator = 0;
31  switch (time_unit) {
32  case parquet::LogicalType::TimeUnit::MILLIS:
33  conversion_denominator = 1000L;
34  break;
35  case parquet::LogicalType::TimeUnit::MICROS:
36  conversion_denominator = 1000L * 1000L;
37  break;
38  case parquet::LogicalType::TimeUnit::NANOS:
39  conversion_denominator = 1000L * 1000L * 1000L;
40  break;
41  default:
42  UNREACHABLE();
43  }
44  return conversion_denominator;
45 }
#define UNREACHABLE()
Definition: Logger.h:241
+ Here is the caller graph for this function:

◆ get_value() [1/2]

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 755 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

755  {
756  CHECK(json_val.IsObject());
757  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
758  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
759  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
760 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:

◆ get_value() [2/2]

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 975 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

975  {
976  CHECK(json_val.IsObject());
977  json_utils::get_value_from_object(
978  json_val, file_region.first_row_file_offset, "first_row_file_offset");
979  json_utils::get_value_from_object(
980  json_val, file_region.first_row_index, "first_row_index");
981  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
982  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
983 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:

◆ get_var_length_data_block_size()

size_t foreign_storage::get_var_length_data_block_size ( DataBlockPtr  data_block,
SQLTypeInfo  sql_type_info 
)

Get the total number of bytes in the given data block for a variable length column.

Definition at line 566 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, CHECK, SQLTypeInfo::is_array(), SQLTypeInfo::is_geometry(), SQLTypeInfo::is_string(), SQLTypeInfo::is_varlen(), DataBlockPtr::stringsPtr, and UNREACHABLE.

Referenced by update_metadata().

567  {
568  CHECK(sql_type_info.is_varlen());
569  size_t byte_count = 0;
570  if (sql_type_info.is_string() || sql_type_info.is_geometry()) {
571  for (const auto& str : *data_block.stringsPtr) {
572  byte_count += str.length();
573  }
574  } else if (sql_type_info.is_array()) {
575  for (const auto& array : *data_block.arraysPtr) {
576  byte_count += array.length;
577  }
578  } else {
579  UNREACHABLE();
580  }
581  return byte_count;
582 }
bool is_array() const
Definition: sqltypes.h:425
bool is_string() const
Definition: sqltypes.h:417
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:150
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:151
#define UNREACHABLE()
Definition: Logger.h:241
bool is_varlen() const
Definition: sqltypes.h:432
bool is_geometry() const
Definition: sqltypes.h:429
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ initialize_import_buffers()

void foreign_storage::initialize_import_buffers ( const std::list< const ColumnDescriptor *> &  columns,
std::shared_ptr< Catalog_Namespace::Catalog >  catalog,
std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &  import_buffers,
const std::set< int > &  column_filter_set = {} 
)

Initializes import buffers for each of the provided columns. If column_filter_set is non-empty, buffers are initialized only for columns whose IDs are in the set; a null placeholder is added for every other column.

Definition at line 376 of file CsvDataWrapper.cpp.

References IS_STRING, and kENCODING_DICT.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

380  {}) {
381  for (const auto column : columns) {
382  StringDictionary* string_dictionary = nullptr;
383  if (column->columnType.is_dict_encoded_string() ||
384  (column->columnType.is_array() && IS_STRING(column->columnType.get_subtype()) &&
385  column->columnType.get_compression() == kENCODING_DICT)) {
386  auto dict_descriptor =
387  catalog->getMetadataForDictUnlocked(column->columnType.get_comp_param(), true);
388  string_dictionary = dict_descriptor->stringDict.get();
389  }
390  if (column_filter_set.size() == 0 ||
391  column_filter_set.find(column->columnId) != column_filter_set.end()) {
392  import_buffers.emplace_back(
393  std::make_unique<import_export::TypedImportBuffer>(column, string_dictionary));
394  } else {
395  import_buffers.emplace_back(nullptr);
396  }
397  }
398 }
#define IS_STRING(T)
Definition: sqltypes.h:173
+ Here is the caller graph for this function:

◆ open_parquet_table()

void foreign_storage::open_parquet_table ( const std::string &  file_path,
std::unique_ptr< parquet::arrow::FileReader > &  reader,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 25 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), import_export::DataStreamSink::archivePlumber(), foreign_storage::ParquetDataWrapper::fetchChunkMetadata(), foreign_storage::LazyParquetImporter::metadataScan(), foreign_storage::LazyParquetImporter::partialImport(), and foreign_storage::ParquetDataWrapper::populateChunkBuffers().

27  {
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35 }
+ Here is the caller graph for this function:

◆ parse_file_regions()

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions file_regions,
const size_t  start_index,
const size_t  end_index,
CsvReader csv_reader,
std::mutex &  file_access_mutex,
csv_file_buffer_parser::ParseBufferRequest parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map 
)

Parses a set of file regions given a handle to the file and range of indexes for the file regions to be parsed.

Definition at line 278 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferRequest::begin_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::end_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::file_offset, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, foreign_storage::csv_file_buffer_parser::parse_buffer(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::process_row_count, foreign_storage::CsvReader::readRegion(), run_benchmark_import::result, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count.

Referenced by foreign_storage::CsvDataWrapper::populateChunks().

285  {
286  ParseFileRegionResult load_file_region_result{};
287  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
288  load_file_region_result.row_count = 0;
289 
290  csv_file_buffer_parser::ParseBufferResult result;
291  for (size_t i = start_index; i <= end_index; i++) {
292  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
293  size_t read_size;
294  {
295  std::lock_guard<std::mutex> lock(file_access_mutex);
296  read_size = csv_reader.readRegion(parse_file_request.buffer.get(),
297  file_regions[i].first_row_file_offset,
298  file_regions[i].region_size);
299  }
300 
301  CHECK_EQ(file_regions[i].region_size, read_size);
302  parse_file_request.begin_pos = 0;
303  parse_file_request.end_pos = file_regions[i].region_size;
304  parse_file_request.first_row_index = file_regions[i].first_row_index;
305  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
306  parse_file_request.process_row_count = file_regions[i].row_count;
307 
308  result = parse_buffer(parse_file_request);
309  CHECK_EQ(file_regions[i].row_count, result.row_count);
310  load_file_region_result.row_count += result.row_count;
311  }
312  load_file_region_result.column_id_to_data_blocks_map =
313  result.column_id_to_data_blocks_map;
314  return load_file_region_result;
315 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
ParseBufferResult parse_buffer(ParseBufferRequest &request)
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ partition_by_fragment()

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 470 of file CsvDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

472  {
473  CHECK(buffer_row_count > 0);
474  std::vector<size_t> partitions{};
475  size_t remaining_rows_in_last_fragment;
476  if (start_row_index % max_fragment_size == 0) {
477  remaining_rows_in_last_fragment = 0;
478  } else {
479  remaining_rows_in_last_fragment =
480  max_fragment_size - (start_row_index % max_fragment_size);
481  }
482  if (buffer_row_count <= remaining_rows_in_last_fragment) {
483  partitions.emplace_back(buffer_row_count);
484  } else {
485  if (remaining_rows_in_last_fragment > 0) {
486  partitions.emplace_back(remaining_rows_in_last_fragment);
487  }
488  size_t remaining_buffer_row_count =
489  buffer_row_count - remaining_rows_in_last_fragment;
490  while (remaining_buffer_row_count > 0) {
491  partitions.emplace_back(
492  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
493  remaining_buffer_row_count -= partitions.back();
494  }
495  }
496  return partitions;
497 }
#define CHECK(condition)
Definition: Logger.h:197
+ Here is the caller graph for this function:

◆ refresh_foreign_table()

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 22 of file ForeignTableRefresh.cpp.

References Data_Namespace::CPU_LEVEL, Catalog_Namespace::DBMetadata::dbId, StorageType::FOREIGN_TABLE, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), PostEvictionRefreshException::getOriginalException(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::Catalog::removeFragmenterForTable(), and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

24  {
25  auto& data_mgr = catalog.getDataMgr();
26  auto table_lock =
27  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
28  lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>::acquireTableDescriptor(
29  catalog, table_name, false));
30 
31  const TableDescriptor* td = (*table_lock)();
32  if (td->storageType != StorageType::FOREIGN_TABLE) {
33  throw std::runtime_error{
34  table_name +
35  " is not a foreign table. Refreshes are applicable to only foreign tables."};
36  }
37 
38  catalog.removeFragmenterForTable(td->tableId);
39  ChunkKey table_key{catalog.getCurrentDB().dbId, td->tableId};
40  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::CPU_LEVEL);
41  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::GPU_LEVEL);
42 
43  try {
44  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
45  table_key, evict_cached_entries);
46  catalog.updateForeignTableRefreshTimes(td->tableId);
47  } catch (PostEvictionRefreshException& e) {
48  catalog.updateForeignTableRefreshTimes(td->tableId);
49  throw e.getOriginalException();
50  }
51 }
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:209
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:208
void removeFragmenterForTable(const int table_id)
Definition: Catalog.cpp:3172
std::vector< int > ChunkKey
Definition: types.h:37
specifies the content in-memory of a row in the table metadata table
static constexpr char const * FOREIGN_TABLE
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:4329
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ resize_buffer_if_needed()

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

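Reallocates the given buffer to alloc_size bytes and updates buffer_size accordingly when the current buffer size is smaller than alloc_size; otherwise the buffer is left unchanged. The contents of the previous buffer are not copied into the new allocation.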

Definition at line 741 of file CsvDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

743  {
744  CHECK_LE(buffer_size, alloc_size);
745  if (buffer_size < alloc_size) {
746  buffer = std::make_unique<char[]>(alloc_size);
747  buffer_size = alloc_size;
748  }
749 }
#define CHECK_LE(x, y)
Definition: Logger.h:208
+ Here is the caller graph for this function:

◆ scan_metadata()

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
std::mutex &  file_region_mutex 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 667 of file CsvDataWrapper.cpp.

References add_file_region(), add_request_to_pool(), get_next_metadata_scan_request(), foreign_storage::csv_file_buffer_parser::parse_buffer(), partition_by_fragment(), run_benchmark_import::result, and update_metadata().

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

669  {
670  std::map<int, const ColumnDescriptor*> column_by_id{};
671  while (true) {
672  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
673  if (!request_opt.has_value()) {
674  break;
675  }
676  auto& request = request_opt.value();
677  if (column_by_id.empty()) {
678  for (const auto column : request.columns) {
679  column_by_id[column->columnId] = column;
680  }
681  }
682  auto partitions = partition_by_fragment(
683  request.first_row_index, request.max_fragment_rows, request.buffer_row_count);
684  request.begin_pos = 0;
685  size_t row_index = request.first_row_index;
686  for (const auto partition : partitions) {
687  request.process_row_count = partition;
688  for (const auto& import_buffer : request.import_buffers) {
689  import_buffer->clear();
690  }
691  auto result = parse_buffer(request);
692  int fragment_id = row_index / request.max_fragment_rows;
693  add_file_region(fragment_id_to_file_regions_map,
694  file_region_mutex,
695  fragment_id,
696  request.first_row_index,
697  result);
698  update_metadata(multi_threading_params, fragment_id, request, result, column_by_id);
699  row_index += result.row_count;
700  request.begin_pos = result.row_offsets.back() - request.file_offset;
701  }
702  add_request_to_pool(multi_threading_params, request);
703  }
704 }
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
ParseBufferResult parse_buffer(ParseBufferRequest &request)
void update_metadata(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, const csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor *> &column_by_id)
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, std::mutex &file_region_mutex, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result)
std::optional< csv_file_buffer_parser::ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ set_value() [1/2]

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 746 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

748  {
749  json_val.SetObject();
750  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
751  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
752  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
753 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111
+ Here is the call graph for this function:

◆ set_value() [2/2]

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 961 of file CsvDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

963  {
964  json_val.SetObject();
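965  json_utils::add_value_to_object(
966  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
967  json_utils::add_value_to_object(
968  json_val, file_region.first_row_index, "first_row_index", allocator);
969  json_utils::add_value_to_object(
970  json_val, file_region.region_size, "region_size", allocator);
971  json_utils::add_value_to_object(
972  json_val, file_region.row_count, "row_count", allocator);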
973 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111
+ Here is the call graph for this function:

◆ throw_removed_file_error()

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 29 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

29  {
30  throw std::runtime_error{
31  "Refresh of foreign table created with \"APPEND\" update type failed as "
32  "file \"" +
33  file_path + "\" was removed."};
34 }
+ Here is the caller graph for this function:

◆ throw_removed_row_error()

void foreign_storage::throw_removed_row_error ( const std::string &  file_path)
inline

Definition at line 22 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::SingleFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

22  {
23  throw std::runtime_error{
24  "Refresh of foreign table created with \"APPEND\" update type failed as file "
25  "reduced in size: " +
26  file_path};
27 }
+ Here is the caller graph for this function:

◆ update_metadata()

void foreign_storage::update_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const csv_file_buffer_parser::ParseBufferRequest &  request,
const csv_file_buffer_parser::ParseBufferResult &  result,
std::map< int, const ColumnDescriptor *> &  column_by_id 
)

Updates metadata encapsulated in encoders for all table columns given new data blocks gotten from parsing a new set of rows in a CSV file buffer.

Definition at line 605 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count, foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count_mutex, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::db_id, get_var_length_data_block_size(), foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::table_id, and update_stats().

Referenced by scan_metadata().

609  {
610  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
611  ChunkKey chunk_key{request.db_id, request.table_id, column_id, fragment_id};
612  const auto column = column_by_id[column_id];
613  size_t byte_count;
614  if (column->columnType.is_varlen_indeed()) {
615  chunk_key.emplace_back(1);
616  byte_count = get_var_length_data_block_size(data_block, column->columnType);
617  } else {
618  byte_count = column->columnType.get_size() * result.row_count;
619  }
620 
621  {
622  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_byte_count_mutex);
623  multi_threading_params.chunk_byte_count[chunk_key] += byte_count;
624  }
625 
626  {
627  std::lock_guard<std::mutex> lock(
628  multi_threading_params.chunk_encoder_buffers_mutex);
629  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
630  multi_threading_params.chunk_encoder_buffers.end()) {
631  multi_threading_params.chunk_encoder_buffers[chunk_key] =
632  std::make_unique<ForeignStorageBuffer>();
633  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
634  column->columnType);
635  }
636  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
637  column->columnType,
638  data_block,
639  result.row_count);
640  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
641  ->getEncoder()
642  ->getNumElems() +
643  result.row_count;
644  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
645  num_elements);
646  }
647  }
648 }
size_t get_var_length_data_block_size(DataBlockPtr data_block, SQLTypeInfo sql_type_info)
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
std::vector< int > ChunkKey
Definition: types.h:37
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ update_stats()

void foreign_storage::update_stats ( Encoder *  encoder,
const SQLTypeInfo &  column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 588 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by update_metadata(), and Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata().

591  {
592  if (column_type.is_array()) {
593  encoder->updateStats(data_block.arraysPtr, 0, row_count);
594  } else if (!column_type.is_varlen()) {
595  encoder->updateStats(data_block.numbersPtr, row_count);
596  } else {
597  encoder->updateStats(data_block.stringsPtr, 0, row_count);
598  }
599 }
bool is_array() const
Definition: sqltypes.h:425
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:150
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:151
bool is_varlen() const
Definition: sqltypes.h:432
int8_t * numbersPtr
Definition: sqltypes.h:149
virtual void updateStats(const int64_t val, const bool is_null)=0
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ validate_equal_column_descriptor()

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 60 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), and foreign_storage::anonymous_namespace{LazyParquetImporter.cpp}::validate_equal_schema().

64  {
65  if (!reference_descriptor->Equals(*new_descriptor)) {
66  throw std::runtime_error{"Parquet file \"" + new_file_path +
67  "\" has a different schema. Please ensure that all Parquet "
68  "files use the same schema. Reference Parquet file: " +
69  reference_file_path +
70  ", column name: " + reference_descriptor->name() +
71  ". New Parquet file: " + new_file_path +
72  ", column name: " + new_descriptor->name() + "."};
73  }
74 }
+ Here is the caller graph for this function:

◆ validate_non_foreign_table_write()

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor *  table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, OR UPDATE commands are not supported for foreign tables."};
26  }
27 }
std::string storageType
static constexpr char const * FOREIGN_TABLE
+ Here is the caller graph for this function: