OmniSciDB  bf83d84833
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvDataWrapper.cpp}
 
 anonymous_namespace{CsvReader.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 csv_file_buffer_parser
 
 json_utils
 

Classes

struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
struct  ForeignServer
 
struct  ForeignTable
 
struct  OptionsContainer
 
class  CachingForeignStorageMgr
 
struct  ParseFileRegionResult
 
struct  MetadataScanMultiThreadingParams
 
struct  FileRegion
 
class  CsvDataWrapper
 
class  CsvReader
 
class  SingleFileReader
 
class  ArchiveWrapper
 
class  CompressedFileReader
 
class  MultiFileReader
 
class  LocalMultiFileReader
 
class  ForeignDataWrapper
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageException
 
class  MockForeignDataWrapper
 
class  ForeignStorageMgr
 
class  ForeignTableRefreshScheduler
 
class  ForeignTableSchema
 
struct  ColumnType
 
struct  FragmentType
 
struct  Interval
 
class  LazyParquetChunkLoader
 
class  ParquetArrayEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateFromTimestampEncoder
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetEncoder
 
class  ParquetScalarEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetInPlaceEncoder
 
class  TypedParquetInPlaceEncoder
 
class  ParquetMetadataValidator
 
class  TimestampBoundsValidator
 
class  IntegralFixedLengthBoundsValidator
 
class  DateInSecondsBoundsValidator
 
class  FloatPointValidator
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  ParquetStringEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetVariableLengthArrayEncoder
 

Typedefs

using OptionsMap = std::map< std::string, std::string, std::less<>>
 
using FileRegions = std::vector< FileRegion >
 
using read_lock = mapd_shared_lock< mapd_shared_mutex >
 
using write_lock = mapd_unique_lock< mapd_shared_mutex >
 

Functions

template<typename T >
bool contains (const T &set, const std::string_view element)
 
int64_t get_interval_duration (const std::string &interval)
 
int64_t get_next_refresh_time (const std::map< std::string, std::string, std::less<>> &foreign_table_options)
 
ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, CsvReader &csv_reader, csv_file_buffer_parser::ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional
< csv_file_buffer_parser::ParseBufferRequest > 
get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)
 
size_t get_var_length_data_block_size (DataBlockPtr data_block, SQLTypeInfo sql_type_info)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void cache_blocks (std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
 
void process_data_blocks (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, CsvReader &csv_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
void throw_removed_row_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
void throw_number_of_columns_mismatch_error (size_t num_table_cols, size_t num_file_cols, const std::string &file_path)
 
std::vector< ChunkKey > get_keys_vec_from_table (const ChunkKey &destination_chunk_key)
 
std::set< ChunkKey > get_keys_set_from_table (const ChunkKey &destination_chunk_key)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > get_min_max_bounds ()
 
template<typename D , typename T >
bool check_bounds (const T &value)
 
template<typename D >
std::string datetime_to_string (const D &timestamp, const SQLTypeInfo &column_type)
 
void throw_parquet_metadata_out_of_bounds_error (const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
 
void open_parquet_table (const std::string &file_path, std::unique_ptr< parquet::arrow::FileReader > &reader, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const std::unique_ptr< parquet::arrow::FileReader > &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (std::unique_ptr< parquet::arrow::FileReader > &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
std::shared_ptr
< parquet::Statistics > 
validate_and_get_column_metadata_statistics (const parquet::ColumnChunkMetaData *column_metadata)
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Typedef Documentation

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 64 of file CsvDataWrapper.h.

using foreign_storage::OptionsMap = typedef std::map<std::string, std::string, std::less<>>

Definition at line 30 of file OptionsContainer.h.

Function Documentation

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
int  fragment_id,
size_t  first_row_index,
const csv_file_buffer_parser::ParseBufferResult &  result,
const std::string &  file_path 
)

Creates a new file region based on metadata from parsed CSV file buffers and adds the new region to the fragment id to file regions map.

Definition at line 533 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_offsets.

Referenced by process_data_blocks().

537  {
538  fragment_id_to_file_regions_map[fragment_id].emplace_back(
539  FileRegion(file_path,
540  result.row_offsets.front(),
541  first_row_index,
542  result.row_count,
543  result.row_offsets.back() - result.row_offsets.front()));
544 }

+ Here is the caller graph for this function:

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 709 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by scan_metadata().

710  {
711  std::unique_lock<std::mutex> completed_requests_queue_lock(
712  multi_threading_params.request_pool_mutex);
713  multi_threading_params.request_pool.emplace(std::move(request));
714  completed_requests_queue_lock.unlock();
715  multi_threading_params.request_pool_condition.notify_all();
716 }

+ Here is the caller graph for this function:

void foreign_storage::cache_blocks ( std::map< ChunkKey, Chunk_NS::Chunk > &  cached_chunks,
DataBlockPtr  data_block,
size_t  row_count,
ChunkKey &  chunk_key,
const ColumnDescriptor *  column,
bool  is_first_block,
bool  is_last_block 
)

Definition at line 599 of file CsvDataWrapper.cpp.

References Catalog_Namespace::SysCatalog::checkedGetCatalog(), CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{CsvDataWrapper.cpp}::get_cache_if_enabled(), Catalog_Namespace::SysCatalog::instance(), and SQLTypeInfo::is_varlen_indeed().

Referenced by process_data_blocks().

605  {
606  auto catalog = Catalog_Namespace::SysCatalog::instance().checkedGetCatalog(
607  chunk_key[CHUNK_KEY_DB_IDX]);
608  auto cache = get_cache_if_enabled(catalog);
609  if (cache) {
610  ChunkKey index_key = {chunk_key[CHUNK_KEY_DB_IDX],
611  chunk_key[CHUNK_KEY_TABLE_IDX],
612  chunk_key[CHUNK_KEY_COLUMN_IDX],
613  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
614  2};
615  // Create actual data chunks to prepopulate cache
616  if (cached_chunks.find(chunk_key) == cached_chunks.end()) {
617  cached_chunks[chunk_key] = Chunk_NS::Chunk{column};
618  cached_chunks[chunk_key].setBuffer(
619  cache->getChunkBufferForPrecaching(chunk_key, is_first_block));
620  if (column->columnType.is_varlen_indeed()) {
621  cached_chunks[chunk_key].setIndexBuffer(
622  cache->getChunkBufferForPrecaching(index_key, is_first_block));
623  }
624  if (is_first_block) {
625  cached_chunks[chunk_key].initEncoder();
626  }
627  }
628  cached_chunks[chunk_key].appendData(data_block, row_count, 0);
629 
630  if (is_last_block) {
631  // cache the chunks now so they are tracked by eviction algorithm
632  std::vector<ChunkKey> key_to_cache{chunk_key};
633  if (column->columnType.is_varlen_indeed()) {
634  key_to_cache.push_back(index_key);
635  }
636  cache->cacheTableChunks(key_to_cache);
637  }
638  }
639 }
std::vector< int > ChunkKey
Definition: types.h:37
foreign_storage::ForeignStorageCache * get_cache_if_enabled(std::shared_ptr< Catalog_Namespace::Catalog > &catalog)
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
static SysCatalog & instance()
Definition: SysCatalog.h:288
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
std::shared_ptr< Catalog > checkedGetCatalog(const int32_t db_id)
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:506
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename D , typename T >
bool foreign_storage::check_bounds ( const T &  value)
inline

Definition at line 53 of file ParquetMetadataValidator.h.

53  {
54  auto [min_value, max_value] = get_min_max_bounds<D>();
55  return value >= min_value && value <= max_value;
56 }
template<typename T >
bool foreign_storage::contains ( const T &  set,
const std::string_view  element 
)

Definition at line 26 of file ForeignTable.h.

26  {
27  if (std::find(set.begin(), set.end(), element) == set.end()) {
28  return false;
29  } else {
30  return true;
31  }
32 }
template<typename D >
std::string foreign_storage::datetime_to_string ( const D &  timestamp,
const SQLTypeInfo &  column_type 
)
inline

Definition at line 59 of file ParquetMetadataValidator.h.

References Datum::bigintval, CHECK, DatumToString(), SQLTypeInfo::is_date(), and SQLTypeInfo::is_timestamp().

Referenced by foreign_storage::TimestampBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::DateInSecondsBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::TimestampBoundsValidator< T >::validateValue(), and foreign_storage::DateInSecondsBoundsValidator< T >::validateValue().

60  {
61  CHECK(column_type.is_timestamp() || column_type.is_date());
62  Datum d;
63  d.bigintval = timestamp;
64  return DatumToString(d, column_type);
65 }
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:240
bool is_timestamp() const
Definition: sqltypes.h:727
int64_t bigintval
Definition: sqltypes.h:206
#define CHECK(condition)
Definition: Logger.h:197
bool is_date() const
Definition: sqltypes.h:715

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 794 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

796  {
797  {
798  std::unique_lock<std::mutex> pending_requests_lock(
799  multi_threading_params.pending_requests_mutex);
800  multi_threading_params.pending_requests.emplace(std::move(request));
801  }
802  multi_threading_params.pending_requests_condition.notify_all();
803 }

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
CsvReader &  csv_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset 
)

Reads from a CSV file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 823 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), import_export::delimited_parser::find_row_end_pos(), get_request_from_pool(), foreign_storage::CsvReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::CsvReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

830  {
831  auto alloc_size = buffer_size;
832  auto residual_buffer = std::make_unique<char[]>(alloc_size);
833  size_t residual_buffer_size = 0;
834  size_t residual_buffer_alloc_size = alloc_size;
835 
836  while (!csv_reader.isScanFinished()) {
837  {
838  std::lock_guard<std::mutex> pending_requests_lock(
839  multi_threading_params.pending_requests_mutex);
840  if (!multi_threading_params.continue_processing) {
841  break;
842  }
843  }
844  auto request = get_request_from_pool(multi_threading_params);
845  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
846 
847  if (residual_buffer_size > 0) {
848  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
849  }
850  size_t size = residual_buffer_size;
851  size += csv_reader.read(request.buffer.get() + residual_buffer_size,
852  alloc_size - residual_buffer_size);
853 
854  if (size == 0) {
855  // In some cases at the end of a file we will read 0 bytes even when
856  // csv_reader.isScanFinished() is false
857  continue;
858  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
859  // In some cases files with newlines at the end will be encoded with a second
860  // newline that can end up being the only thing in the buffer
861  current_file_offset++;
862  continue;
863  }
864  unsigned int num_rows_in_buffer = 0;
865  request.end_pos =
866  import_export::delimited_parser::find_row_end_pos(alloc_size,
867  request.buffer,
868  size,
869  copy_params,
870  first_row_index_in_buffer,
871  num_rows_in_buffer,
872  nullptr,
873  &csv_reader);
874  request.buffer_size = size;
875  request.buffer_alloc_size = alloc_size;
876  request.first_row_index = first_row_index_in_buffer;
877  request.file_offset = current_file_offset;
878  request.buffer_row_count = num_rows_in_buffer;
879 
880  residual_buffer_size = size - request.end_pos;
881  if (residual_buffer_size > 0) {
882  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
883  memcpy(residual_buffer.get(),
884  request.buffer.get() + request.end_pos,
885  residual_buffer_size);
886  }
887 
888  current_file_offset += request.end_pos;
889  first_row_index_in_buffer += num_rows_in_buffer;
890 
891  dispatch_metadata_scan_request(multi_threading_params, request);
892  }
893 
894  std::unique_lock<std::mutex> pending_requests_queue_lock(
895  multi_threading_params.pending_requests_mutex);
896  multi_threading_params.pending_requests_condition.wait(
897  pending_requests_queue_lock, [&multi_threading_params] {
898  return multi_threading_params.pending_requests.empty() ||
899  (multi_threading_params.continue_processing == false);
900  });
901  multi_threading_params.continue_processing = false;
902  pending_requests_queue_lock.unlock();
903  multi_threading_params.pending_requests_condition.notify_all();
904 }
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
size_t find_row_end_pos(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FILE *file, foreign_storage::CsvReader *csv_reader)
Finds the closest possible row ending to the end of the given buffer. The buffer is resized as needed...

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 39 of file ParquetGeospatialEncoder.h.

References omnisci.dtypes::T.

Referenced by foreign_storage::ParquetGeospatialEncoder::processGeoElement(), and foreign_storage::ParquetGeospatialEncoder::processNullGeoElement().

39  {
40  const size_t num_bytes = data.size() * sizeof(T);
41  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
42  memcpy(buffer.get(), data.data(), num_bytes);
43  return ArrayDatum(num_bytes, buffer, false);
44 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:199

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing CSV file(s).

Definition at line 331 of file CsvDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

333  {
334  size_t buffer_size = copy_params.buffer_size;
335  if (size_known && file_size < buffer_size) {
336  buffer_size = file_size + 1; // +1 for end of line character, if missing
337  }
338  return buffer_size;
339 }
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 341 of file CsvDataWrapper.cpp.

References CHECK.

341  {
342  size_t buffer_size = 0;
343  for (const auto& file_region : file_regions) {
344  buffer_size = std::max(buffer_size, file_region.region_size);
345  }
346  CHECK(buffer_size);
347  return buffer_size;
348 }
#define CHECK(condition)
Definition: Logger.h:197
const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 45 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

47  {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_interval_duration ( const std::string &  interval)
inline

Gets the interval duration in seconds.

Parameters
interval - interval string with format of {interval_count}{interval_type} (e.g. 5H for "every 5 hours")
Returns
interval duration in seconds

Definition at line 16 of file RefreshTimeCalculator.h.

References UNREACHABLE.

Referenced by get_next_refresh_time().

16  {
17  int interval_count = std::stoi(interval.substr(0, interval.length() - 1));
18  auto interval_type = std::tolower(interval[interval.length() - 1]);
19  int64_t duration{0};
20  if (interval_type == 's') {
21  duration = interval_count;
22  } else if (interval_type == 'h') {
23  duration = interval_count * 60 * 60;
24  } else if (interval_type == 'd') {
25  duration = interval_count * 60 * 60 * 24;
26  } else {
27  UNREACHABLE();
28  }
29  return duration;
30 }
#define UNREACHABLE()
Definition: Logger.h:241

+ Here is the caller graph for this function:

std::set<ChunkKey> foreign_storage::get_keys_set_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

std::vector<ChunkKey> foreign_storage::get_keys_vec_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > foreign_storage::get_min_max_bounds ( )
inline

Definition at line 41 of file ParquetMetadataValidator.h.

41  {
42  static_assert(std::is_signed<D>::value,
43  "'get_min_max_bounds' is only valid for signed types");
44  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
45 }
std::optional<csv_file_buffer_parser::ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. A null optional is returned if there are no further requests to be processed.

Definition at line 510 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

511  {
512  std::unique_lock<std::mutex> pending_requests_lock(
513  multi_threading_params.pending_requests_mutex);
514  multi_threading_params.pending_requests_condition.wait(
515  pending_requests_lock, [&multi_threading_params] {
516  return !multi_threading_params.pending_requests.empty() ||
517  !multi_threading_params.continue_processing;
518  });
519  if (multi_threading_params.pending_requests.empty()) {
520  return {};
521  }
522  auto request = std::move(multi_threading_params.pending_requests.front());
523  multi_threading_params.pending_requests.pop();
524  pending_requests_lock.unlock();
525  multi_threading_params.pending_requests_condition.notify_all();
526  return std::move(request);
527 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_next_refresh_time ( const std::map< std::string, std::string, std::less<>> &  foreign_table_options)
inline

Definition at line 33 of file RefreshTimeCalculator.h.

References CHECK, dateTimeParse< kTIMESTAMP >(), get_interval_duration(), foreign_storage::ForeignTable::NULL_REFRESH_TIME, foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY, and foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY.

Referenced by Catalog_Namespace::anonymous_namespace{Catalog.cpp}::get_next_refresh_time().

34  {
35  int64_t current_time = std::chrono::duration_cast<std::chrono::seconds>(
36  std::chrono::system_clock::now().time_since_epoch())
37  .count();
38  auto start_date_entry = foreign_table_options.find(
39  foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY);
40  CHECK(start_date_entry != foreign_table_options.end());
41  auto start_date_time = dateTimeParse<kTIMESTAMP>(start_date_entry->second, 0);
42 
43  // If start date time is current or in the future, then that is the next refresh time
44  if (start_date_time >= current_time) {
45  return start_date_time;
46  }
47  auto interval_entry =
48  foreign_table_options.find(foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY);
49  if (interval_entry != foreign_table_options.end()) {
50  auto interval_duration = get_interval_duration(interval_entry->second);
51  auto num_intervals =
52  (current_time - start_date_time + interval_duration - 1) / interval_duration;
53  return start_date_time + (num_intervals * interval_duration);
54  } else {
55  // If this was a one time refresh, then there is no next refresh time
56  return foreign_storage::ForeignTable::NULL_REFRESH_TIME;
57  }
58 }
int64_t get_interval_duration(const std::string &interval)
int64_t dateTimeParse< kTIMESTAMP >(std::string_view str, unsigned const dim)
static constexpr const char * REFRESH_START_DATE_TIME_KEY
Definition: ForeignTable.h:53
static constexpr const char * REFRESH_INTERVAL_KEY
Definition: ForeignTable.h:54
#define CHECK(condition)
Definition: Logger.h:197
static constexpr int NULL_REFRESH_TIME
Definition: ForeignTable.h:61

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 31 of file ParquetMetadataValidator.h.

31  {
32  return inline_int_null_value<V>();
33 }
std::pair< int, int > foreign_storage::get_parquet_table_size ( const std::unique_ptr< parquet::arrow::FileReader > &  reader)

Definition at line 37 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

38  {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }

+ Here is the caller graph for this function:

parquet::Type::type foreign_storage::get_physical_type ( std::unique_ptr< parquet::arrow::FileReader > &  reader,
const int  logical_column_index 
)

Definition at line 51 of file ParquetShared.cpp.

Referenced by anonymous_namespace{ArrowResultSetConverter.cpp}::get_arrow_type(), and ArrowResultSetConverter::initializeColumnBuilder().

52  {
53  return reader->parquet_reader()
54  ->metadata()
55  ->schema()
56  ->Column(logical_column_index)
57  ->physical_type();
58 }

+ Here is the caller graph for this function:

csv_file_buffer_parser::ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool.

Definition at line 776 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

777  {
778  std::unique_lock<std::mutex> request_pool_lock(
779  multi_threading_params.request_pool_mutex);
780  multi_threading_params.request_pool_condition.wait(
781  request_pool_lock,
782  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
783  auto request = std::move(multi_threading_params.request_pool.front());
784  multi_threading_params.request_pool.pop();
785  request_pool_lock.unlock();
786  CHECK(request.buffer);
787  return request;
788 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor *  column)

Definition at line 76 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

77  {
78  auto column_type = column->columnType.get_elem_type();
79  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
80  column_type.set_size(4); // override default size of -1
81  }
82  return std::make_unique<ColumnDescriptor>(
83  column->tableId, column->columnId, column->columnName, column_type);
84 }
void set_size(int s)
Definition: sqltypes.h:409
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:697
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 354 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

357  {
358  size_t thread_count = copy_params.threads;
359  if (thread_count == 0) {
360  thread_count = std::thread::hardware_concurrency();
361  }
362  if (size_known) {
363  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
364  if (num_buffers_in_file < thread_count) {
365  thread_count = num_buffers_in_file;
366  }
367  }
368  CHECK(thread_count);
369  return thread_count;
370 }
#define CHECK(condition)
Definition: Logger.h:197
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions &  file_regions 
)

Definition at line 372 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

373  {
374  size_t thread_count = copy_params.threads;
375  if (thread_count == 0) {
376  thread_count =
377  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
378  }
379  CHECK(thread_count);
380  return thread_count;
381 }
#define CHECK(condition)
Definition: Logger.h:197
void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 536 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

536  {
537  CHECK(json_val.IsObject());
538  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
539  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
540  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
541 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 1118 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

1118  {
1119  CHECK(json_val.IsObject());
1120  json_utils::get_value_from_object(
1121  json_val, file_region.first_row_file_offset, "first_row_file_offset");
1122  json_utils::get_value_from_object(
1123  json_val, file_region.first_row_index, "first_row_index");
1124  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
1125  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
1126 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

size_t foreign_storage::get_var_length_data_block_size ( DataBlockPtr  data_block,
SQLTypeInfo  sql_type_info 
)

Get the total number of bytes in the given data block for a variable length column.

Definition at line 550 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, CHECK, SQLTypeInfo::is_array(), SQLTypeInfo::is_geometry(), SQLTypeInfo::is_string(), SQLTypeInfo::is_varlen(), DataBlockPtr::stringsPtr, and UNREACHABLE.

Referenced by process_data_blocks().

551  {
552  CHECK(sql_type_info.is_varlen());
553  size_t byte_count = 0;
554  if (sql_type_info.is_string() || sql_type_info.is_geometry()) {
555  for (const auto& str : *data_block.stringsPtr) {
556  byte_count += str.length();
557  }
558  } else if (sql_type_info.is_array()) {
559  for (const auto& array : *data_block.arraysPtr) {
560  byte_count += array.length;
561  }
562  } else {
563  UNREACHABLE();
564  }
565  return byte_count;
566 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:218
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:219
bool is_varlen() const
Definition: sqltypes.h:500
#define UNREACHABLE()
Definition: Logger.h:241
#define CHECK(condition)
Definition: Logger.h:197
bool is_geometry() const
Definition: sqltypes.h:490
bool is_string() const
Definition: sqltypes.h:478
bool is_array() const
Definition: sqltypes.h:486

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::open_parquet_table ( const std::string &  file_path,
std::unique_ptr< parquet::arrow::FileReader > &  reader,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 25 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), foreign_storage::ParquetDataWrapper::fetchChunkMetadata(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

27  {
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35 }

+ Here is the caller graph for this function:

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions &  file_regions,
const size_t  start_index,
const size_t  end_index,
CsvReader &  csv_reader,
csv_file_buffer_parser::ParseBufferRequest &  parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map 
)

Parses a set of file regions given a handle to the file and range of indexes for the file regions to be parsed.

Definition at line 291 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferRequest::begin_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::end_pos, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::file_offset, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, foreign_storage::csv_file_buffer_parser::parse_buffer(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::process_row_count, foreign_storage::CsvReader::readRegion(), run_benchmark_import::result, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count.

Referenced by foreign_storage::CsvDataWrapper::populateChunks().

297  {
298  ParseFileRegionResult load_file_region_result{};
299  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
300  load_file_region_result.row_count = 0;
301 
302  csv_file_buffer_parser::ParseBufferResult result;
303  for (size_t i = start_index; i <= end_index; i++) {
304  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
305  size_t read_size;
306  {
307  read_size = csv_reader.readRegion(parse_file_request.buffer.get(),
308  file_regions[i].first_row_file_offset,
309  file_regions[i].region_size);
310  }
311 
312  CHECK_EQ(file_regions[i].region_size, read_size);
313  parse_file_request.begin_pos = 0;
314  parse_file_request.end_pos = file_regions[i].region_size;
315  parse_file_request.first_row_index = file_regions[i].first_row_index;
316  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
317  parse_file_request.process_row_count = file_regions[i].row_count;
318 
319  result = parse_buffer(parse_file_request, i == end_index);
320  CHECK_EQ(file_regions[i].row_count, result.row_count);
321  load_file_region_result.row_count += result.row_count;
322  }
323  load_file_region_result.column_id_to_data_blocks_map =
324  result.column_id_to_data_blocks_map;
325  return load_file_region_result;
326 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 457 of file CsvDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

459  {
460  CHECK(buffer_row_count > 0);
461  std::vector<size_t> partitions{};
462  size_t remaining_rows_in_last_fragment;
463  if (start_row_index % max_fragment_size == 0) {
464  remaining_rows_in_last_fragment = 0;
465  } else {
466  remaining_rows_in_last_fragment =
467  max_fragment_size - (start_row_index % max_fragment_size);
468  }
469  if (buffer_row_count <= remaining_rows_in_last_fragment) {
470  partitions.emplace_back(buffer_row_count);
471  } else {
472  if (remaining_rows_in_last_fragment > 0) {
473  partitions.emplace_back(remaining_rows_in_last_fragment);
474  }
475  size_t remaining_buffer_row_count =
476  buffer_row_count - remaining_rows_in_last_fragment;
477  while (remaining_buffer_row_count > 0) {
478  partitions.emplace_back(
479  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
480  remaining_buffer_row_count -= partitions.back();
481  }
482  }
483  return partitions;
484 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

void foreign_storage::process_data_blocks ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const csv_file_buffer_parser::ParseBufferRequest &  request,
csv_file_buffer_parser::ParseBufferResult &  result,
std::map< int, const ColumnDescriptor * > &  column_by_id,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Updates the metadata encapsulated in encoders for all table columns, given new data blocks obtained from parsing a new set of rows in a CSV file buffer. If a cache is available, the data blocks are also appended to chunks in the cache.

Definition at line 646 of file CsvDataWrapper.cpp.

References add_file_region(), cache_blocks(), foreign_storage::MetadataScanMultiThreadingParams::cached_chunks, foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count, foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count_mutex, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::db_id, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, get_var_length_data_block_size(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getFilePath(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getMaxFragRows(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getTableId(), foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and update_stats().

Referenced by scan_metadata().

651  {
652  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_encoder_buffers_mutex);
653  // File regions should be added in same order as appendData
654  add_file_region(fragment_id_to_file_regions_map,
655  fragment_id,
656  request.first_row_index,
657  result,
658  request.getFilePath());
659 
660  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
661  ChunkKey chunk_key{request.db_id, request.getTableId(), column_id, fragment_id};
662  const auto column = column_by_id[column_id];
663  size_t byte_count;
664  if (column->columnType.is_varlen_indeed()) {
665  chunk_key.emplace_back(1);
666  byte_count = get_var_length_data_block_size(data_block, column->columnType);
667  } else {
668  byte_count = column->columnType.get_size() * result.row_count;
669  }
670 
671  {
672  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_byte_count_mutex);
673  multi_threading_params.chunk_byte_count[chunk_key] += byte_count;
674  }
675 
676  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
677  multi_threading_params.chunk_encoder_buffers.end()) {
678  multi_threading_params.chunk_encoder_buffers[chunk_key] =
679  std::make_unique<ForeignStorageBuffer>();
680  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
681  column->columnType);
682  }
683  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
684  column->columnType,
685  data_block,
686  result.row_count);
687  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
688  ->getEncoder()
689  ->getNumElems() +
690  result.row_count;
691  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
692  num_elements);
693  cache_blocks(
694  multi_threading_params.cached_chunks,
695  data_block,
696  result.row_count,
697  chunk_key,
698  column,
699  (num_elements - result.row_count) == 0, // Is the first block added to this chunk
700  num_elements == request.getMaxFragRows() // Is the last block for this chunk
701  );
702  }
703 }
std::vector< int > ChunkKey
Definition: types.h:37
size_t get_var_length_data_block_size(DataBlockPtr data_block, SQLTypeInfo sql_type_info)
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
void cache_blocks(std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 22 of file ForeignTableRefresh.cpp.

References Data_Namespace::CPU_LEVEL, Catalog_Namespace::DBMetadata::dbId, StorageType::FOREIGN_TABLE, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), PostEvictionRefreshException::getOriginalException(), Data_Namespace::GPU_LEVEL, Catalog_Namespace::Catalog::removeFragmenterForTable(), and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

24  {
25  auto& data_mgr = catalog.getDataMgr();
26  auto table_lock =
27  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
29  catalog, table_name, false));
30 
31  const TableDescriptor* td = (*table_lock)();
32  if (td->storageType != StorageType::FOREIGN_TABLE) {
33  throw std::runtime_error{
34  table_name +
35  " is not a foreign table. Refreshes are applicable to only foreign tables."};
36  }
37 
38  catalog.removeFragmenterForTable(td->tableId);
39  ChunkKey table_key{catalog.getCurrentDB().dbId, td->tableId};
40  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::CPU_LEVEL);
41  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::GPU_LEVEL);
42 
43  try {
44  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
45  table_key, evict_cached_entries);
46  catalog.updateForeignTableRefreshTimes(td->tableId);
47  } catch (PostEvictionRefreshException& e) {
48  catalog.updateForeignTableRefreshTimes(td->tableId);
49  throw e.getOriginalException();
50  }
51 }
std::vector< int > ChunkKey
Definition: types.h:37
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:221
void removeFragmenterForTable(const int table_id)
Definition: Catalog.cpp:3371
static constexpr char const * FOREIGN_TABLE
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:4526

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

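Resizes the given buffer when its current size (buffer_size) is less than the requested allocation size (alloc_size): a new buffer of alloc_size bytes is allocated in place of the old one (existing contents are not copied over) and buffer_size is updated to alloc_size. Nothing is changed when the two sizes are equal. buffer_size must not exceed alloc_size; this is enforced with CHECK_LE.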

Definition at line 809 of file CsvDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

811  {
812  CHECK_LE(buffer_size, alloc_size);
813  if (buffer_size < alloc_size) {
814  buffer = std::make_unique<char[]>(alloc_size);
815  buffer_size = alloc_size;
816  }
817 }
#define CHECK_LE(x, y)
Definition: Logger.h:208

+ Here is the caller graph for this function:

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 722 of file CsvDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, get_next_metadata_scan_request(), foreign_storage::csv_file_buffer_parser::parse_buffer(), partition_by_fragment(), foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, process_data_blocks(), and run_benchmark_import::result.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

723  {
724  std::map<int, const ColumnDescriptor*> column_by_id{};
725  while (true) {
726  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
727  if (!request_opt.has_value()) {
728  break;
729  }
730  auto& request = request_opt.value();
731  try {
732  if (column_by_id.empty()) {
733  for (const auto column : request.getColumns()) {
734  column_by_id[column->columnId] = column;
735  }
736  }
737  auto partitions = partition_by_fragment(
738  request.first_row_index, request.getMaxFragRows(), request.buffer_row_count);
739  request.begin_pos = 0;
740  size_t row_index = request.first_row_index;
741  for (const auto partition : partitions) {
742  request.process_row_count = partition;
743  for (const auto& import_buffer : request.import_buffers) {
744  if (import_buffer != nullptr) {
745  import_buffer->clear();
746  }
747  }
748  auto result = parse_buffer(request, true);
749  int fragment_id = row_index / request.getMaxFragRows();
750  process_data_blocks(multi_threading_params,
751  fragment_id,
752  request,
753  result,
754  column_by_id,
755  fragment_id_to_file_regions_map);
756  row_index += result.row_count;
757  request.begin_pos = result.row_offsets.back() - request.file_offset;
758  }
759  } catch (...) {
760  // Re-add request to pool so we dont block any other threads
761  {
762  std::lock_guard<std::mutex> pending_requests_lock(
763  multi_threading_params.pending_requests_mutex);
764  multi_threading_params.continue_processing = false;
765  }
766  add_request_to_pool(multi_threading_params, request);
767  throw;
768  }
769  add_request_to_pool(multi_threading_params, request);
770  }
771 }
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks)
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
void process_data_blocks(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
std::optional< csv_file_buffer_parser::ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval &  value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 527 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

529  {
530  json_val.SetObject();
531  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
532  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
533  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
534 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion &  file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 1104 of file CsvDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

1106  {
1107  json_val.SetObject();
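1108  json_utils::add_value_to_object(
1109  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
1110  json_utils::add_value_to_object(
1111  json_val, file_region.first_row_index, "first_row_index", allocator);
1112  json_utils::add_value_to_object(
1113  json_val, file_region.region_size, "region_size", allocator);
1114  json_utils::add_value_to_object(
1115  json_val, file_region.row_count, "row_count", allocator);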
1116 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::throw_number_of_columns_mismatch_error ( size_t  num_table_cols,
size_t  num_file_cols,
const std::string &  file_path 
)
inline

Definition at line 35 of file ForeignDataWrapperShared.h.

References to_string().

Referenced by foreign_storage::csv_file_buffer_parser::validate_expected_column_count(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

37  {
38  throw std::runtime_error{"Mismatched number of logical columns: (expected " +
39  std::to_string(num_table_cols) + " columns, has " +
40  std::to_string(num_file_cols) + "): in file '" + file_path +
41  "'"};
42 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::throw_parquet_metadata_out_of_bounds_error ( const std::string &  min_value,
const std::string &  max_value,
const std::string &  encountered_value 
)
inline

Definition at line 67 of file ParquetMetadataValidator.h.

Referenced by foreign_storage::TimestampBoundsValidator< T >::validateValue(), foreign_storage::IntegralFixedLengthBoundsValidator< T >::validateValue(), foreign_storage::DateInSecondsBoundsValidator< T >::validateValue(), and foreign_storage::FloatPointValidator< T >::validateValue().

70  {
71  std::stringstream error_message;
72  error_message << "Parquet column contains values that are outside the range of the "
73  "OmniSci column "
74  "type. Consider using a wider column type. Min allowed value: "
75  << min_value << ". Max allowed value: " << max_value
76  << ". Encountered value: " << encountered_value << ".";
77  throw std::runtime_error(error_message.str());
78 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 29 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

29  {
30  throw std::runtime_error{
31  "Refresh of foreign table created with \"APPEND\" update type failed as "
32  "file \"" +
33  file_path + "\" was removed."};
34 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_error ( const std::string &  file_path)
inline

Definition at line 22 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::SingleFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

22  {
23  throw std::runtime_error{
24  "Refresh of foreign table created with \"APPEND\" update type failed as file "
25  "reduced in size: " +
26  file_path};
27 }

+ Here is the caller graph for this function:

void foreign_storage::update_stats ( Encoder *  encoder,
const SQLTypeInfo &  column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 572 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by process_data_blocks(), and Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata().

575  {
576  if (column_type.is_array()) {
577  encoder->updateStats(data_block.arraysPtr, 0, row_count);
578  } else if (!column_type.is_varlen()) {
579  encoder->updateStats(data_block.numbersPtr, row_count);
580  } else {
581  encoder->updateStats(data_block.stringsPtr, 0, row_count);
582  }
583 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:218
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:219
bool is_varlen() const
Definition: sqltypes.h:500
int8_t * numbersPtr
Definition: sqltypes.h:217
virtual void updateStats(const int64_t val, const bool is_null)=0
bool is_array() const
Definition: sqltypes.h:486

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< parquet::Statistics > foreign_storage::validate_and_get_column_metadata_statistics ( const parquet::ColumnChunkMetaData *  column_metadata)

Definition at line 86 of file ParquetShared.cpp.

References CHECK.

Referenced by foreign_storage::ParquetEncoder::getRowGroupMetadata(), and foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata().

87  {
88  CHECK(column_metadata->is_stats_set());
89  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
90  bool is_all_nulls = stats->null_count() == column_metadata->num_values();
91  CHECK(is_all_nulls || stats->HasMinMax());
92  return stats;
93 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 60 of file ParquetShared.cpp.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::append_row_groups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

64  {
65  if (!reference_descriptor->Equals(*new_descriptor)) {
66  throw std::runtime_error{"Parquet file \"" + new_file_path +
67  "\" has a different schema. Please ensure that all Parquet "
68  "files use the same schema. Reference Parquet file: " +
69  reference_file_path +
70  ", column name: " + reference_descriptor->name() +
71  ". New Parquet file: " + new_file_path +
72  ", column name: " + new_descriptor->name() + "."};
73  }
74 }

+ Here is the caller graph for this function:

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor *  table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::InsertValuesStmt::determineLeafIndex(), Parser::TruncateTableStmt::execute(), Parser::InsertValuesStmt::execute(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, TRUNCATE, OR UPDATE commands are not supported for foreign "
26  "tables."};
27  }
28 }
std::string storageType
static constexpr char const * FOREIGN_TABLE

+ Here is the caller graph for this function: