OmniSciDB  8fa3bf436f
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{AbstractFileStorageDataWrapper.cpp}
 
 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvDataWrapper.cpp}
 
 anonymous_namespace{CsvReader.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 Csv
 
 csv_file_buffer_parser
 
 json_utils
 

Classes

struct  ForeignServer
 
struct  ForeignTable
 
struct  OptionsContainer
 
class  AbstractFileStorageDataWrapper
 
class  CachingForeignStorageMgr
 
struct  ParseFileRegionResult
 
struct  MetadataScanMultiThreadingParams
 
class  CsvDataWrapper
 
class  CsvReader
 
class  SingleFileReader
 
class  ArchiveWrapper
 
class  CompressedFileReader
 
class  MultiFileReader
 
class  LocalMultiFileReader
 
struct  FileRegion
 
class  ForeignDataWrapper
 
struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
class  ForeignDataWrapperFactory
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageException
 
class  MockForeignDataWrapper
 
class  ForeignStorageMgr
 
class  ForeignTableSchema
 
struct  ColumnType
 
struct  FragmentType
 
struct  Interval
 
class  LazyParquetChunkLoader
 
class  ParquetArrayEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateFromTimestampEncoder
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetEncoder
 
class  ParquetScalarEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetInPlaceEncoder
 
class  TypedParquetInPlaceEncoder
 
class  ParquetMetadataValidator
 
class  TimestampBoundsValidator
 
class  IntegralFixedLengthBoundsValidator
 
class  DateInSecondsBoundsValidator
 
class  FloatPointValidator
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  FileReaderMap
 
class  ParquetStringEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetVariableLengthArrayEncoder
 
class  ForeignTableRefreshScheduler
 

Typedefs

using OptionsMap = std::map< std::string, std::string, std::less<>>
 
using FileRegions = std::vector< FileRegion >
 
using ChunkToBufferMap = std::map< ChunkKey, AbstractBuffer * >
 
using read_lock = mapd_shared_lock< mapd_shared_mutex >
 
using write_lock = mapd_unique_lock< mapd_shared_mutex >
 
using UniqueReaderPtr = std::unique_ptr< parquet::arrow::FileReader >
 
using ReaderPtr = parquet::arrow::FileReader *
 

Functions

int64_t get_interval_duration (const std::string &interval)
 
int64_t get_next_refresh_time (const std::map< std::string, std::string, std::less<>> &foreign_table_options)
 
ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, CsvReader &csv_reader, csv_file_buffer_parser::ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional
< csv_file_buffer_parser::ParseBufferRequest > 
get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void cache_blocks (std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
 
void process_data_blocks (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, CsvReader &csv_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
void throw_removed_row_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
void throw_number_of_columns_mismatch_error (size_t num_table_cols, size_t num_file_cols, const std::string &file_path)
 
std::vector< ChunkKey > get_keys_vec_from_table (const ChunkKey &destination_chunk_key)
 
std::set< ChunkKey > get_keys_set_from_table (const ChunkKey &destination_chunk_key)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
bool is_metadata_placeholder (const ChunkMetadata &metadata)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > get_min_max_bounds ()
 
template<typename D , typename T >
bool check_bounds (const T &value)
 
template<typename D >
std::string datetime_to_string (const D &timestamp, const SQLTypeInfo &column_type)
 
void throw_parquet_metadata_out_of_bounds_error (const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
 
UniqueReaderPtr open_parquet_table (const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const ReaderPtr &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (ReaderPtr &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
std::shared_ptr
< parquet::Statistics > 
validate_and_get_column_metadata_statistics (const parquet::ColumnChunkMetaData *column_metadata)
 
template<typename T >
auto partition_for_threads (const std::set< T > &items, size_t max_threads)
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Variables

const std::string wrapper_file_name = "wrapper_metadata.json"
 

Typedef Documentation

using foreign_storage::ChunkToBufferMap = typedef std::map<ChunkKey, AbstractBuffer*>

Definition at line 28 of file ForeignDataWrapper.h.

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 71 of file CsvShared.h.

using foreign_storage::OptionsMap = typedef std::map<std::string, std::string, std::less<>>

Definition at line 30 of file OptionsContainer.h.

using foreign_storage::ReaderPtr = typedef parquet::arrow::FileReader*

Definition at line 33 of file ParquetShared.h.

using foreign_storage::UniqueReaderPtr = typedef std::unique_ptr<parquet::arrow::FileReader>

Definition at line 32 of file ParquetShared.h.

Function Documentation

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
int  fragment_id,
size_t  first_row_index,
const csv_file_buffer_parser::ParseBufferResult &  result,
const std::string &  file_path 
)

Creates a new file region based on metadata from parsed CSV file buffers and adds the new region to the fragment id to file regions map.

Definition at line 418 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_offsets.

Referenced by process_data_blocks().

422  {
423  fragment_id_to_file_regions_map[fragment_id].emplace_back(
424  // file naming is handled by CsvReader
425  FileRegion(result.row_offsets.front(),
426  first_row_index,
427  result.row_count,
428  result.row_offsets.back() - result.row_offsets.front()));
429 }

+ Here is the caller graph for this function:

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 562 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by scan_metadata().

563  {
564  std::unique_lock<std::mutex> completed_requests_queue_lock(
565  multi_threading_params.request_pool_mutex);
566  multi_threading_params.request_pool.emplace(std::move(request));
567  completed_requests_queue_lock.unlock();
568  multi_threading_params.request_pool_condition.notify_all();
569 }

+ Here is the caller graph for this function:

void foreign_storage::cache_blocks ( std::map< ChunkKey, Chunk_NS::Chunk > &  cached_chunks,
DataBlockPtr  data_block,
size_t  row_count,
ChunkKey &  chunk_key,
const ColumnDescriptor *  column,
bool  is_first_block,
bool  is_last_block 
)

Definition at line 462 of file CsvDataWrapper.cpp.

References CHECK, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{CsvDataWrapper.cpp}::get_cache_if_enabled(), Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), and SQLTypeInfo::is_varlen_indeed().

Referenced by process_data_blocks().

468  {
469  auto catalog =
470  Catalog_Namespace::SysCatalog::instance().getCatalog(chunk_key[CHUNK_KEY_DB_IDX]);
471  CHECK(catalog);
472  auto cache = get_cache_if_enabled(catalog);
473  if (cache) {
474  ChunkKey index_key = {chunk_key[CHUNK_KEY_DB_IDX],
475  chunk_key[CHUNK_KEY_TABLE_IDX],
476  chunk_key[CHUNK_KEY_COLUMN_IDX],
477  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
478  2};
479  // Create actual data chunks to prepopulate cache
480  if (cached_chunks.find(chunk_key) == cached_chunks.end()) {
481  cached_chunks[chunk_key] = Chunk_NS::Chunk{column};
482  cached_chunks[chunk_key].setBuffer(
483  cache->getChunkBufferForPrecaching(chunk_key, is_first_block));
484  if (column->columnType.is_varlen_indeed()) {
485  cached_chunks[chunk_key].setIndexBuffer(
486  cache->getChunkBufferForPrecaching(index_key, is_first_block));
487  }
488  if (is_first_block) {
489  cached_chunks[chunk_key].initEncoder();
490  }
491  }
492  cached_chunks[chunk_key].appendData(data_block, row_count, 0);
493  if (is_last_block) {
494  // cache the chunks now so they are tracked by eviction algorithm
495  std::vector<ChunkKey> key_to_cache{chunk_key};
496  if (column->columnType.is_varlen_indeed()) {
497  key_to_cache.push_back(index_key);
498  }
499  cache->cacheTableChunks(key_to_cache);
500  }
501  }
502 }
std::vector< int > ChunkKey
Definition: types.h:37
foreign_storage::ForeignStorageCache * get_cache_if_enabled(std::shared_ptr< Catalog_Namespace::Catalog > &catalog)
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
static SysCatalog & instance()
Definition: SysCatalog.h:292
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:203
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:520
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename D , typename T >
bool foreign_storage::check_bounds ( const T &  value)
inline

Definition at line 53 of file ParquetMetadataValidator.h.

53  {
54  auto [min_value, max_value] = get_min_max_bounds<D>();
55  return value >= min_value && value <= max_value;
56 }
template<typename D >
std::string foreign_storage::datetime_to_string ( const D &  timestamp,
const SQLTypeInfo &  column_type 
)
inline

Definition at line 59 of file ParquetMetadataValidator.h.

References CHECK, test_fsi::d, DatumToString(), SQLTypeInfo::is_date(), and SQLTypeInfo::is_timestamp().

Referenced by foreign_storage::TimestampBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::DateInSecondsBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::TimestampBoundsValidator< T >::validateValue(), and foreign_storage::DateInSecondsBoundsValidator< T >::validateValue().

60  {
61  CHECK(column_type.is_timestamp() || column_type.is_date());
62  Datum d;
63  d.bigintval = timestamp;
64  return DatumToString(d, column_type);
65 }
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:356
bool is_timestamp() const
Definition: sqltypes.h:743
tuple d
Definition: test_fsi.py:9
#define CHECK(condition)
Definition: Logger.h:203
bool is_date() const
Definition: sqltypes.h:731

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 647 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

649  {
650  {
651  std::unique_lock<std::mutex> pending_requests_lock(
652  multi_threading_params.pending_requests_mutex);
653  multi_threading_params.pending_requests.emplace(std::move(request));
654  }
655  multi_threading_params.pending_requests_condition.notify_all();
656 }

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
CsvReader &  csv_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset 
)

Reads from a CSV file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 676 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), import_export::delimited_parser::find_row_end_pos(), get_request_from_pool(), foreign_storage::CsvReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::CsvReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

683  {
684  auto alloc_size = buffer_size;
685  auto residual_buffer = std::make_unique<char[]>(alloc_size);
686  size_t residual_buffer_size = 0;
687  size_t residual_buffer_alloc_size = alloc_size;
688 
689  while (!csv_reader.isScanFinished()) {
690  {
691  std::lock_guard<std::mutex> pending_requests_lock(
692  multi_threading_params.pending_requests_mutex);
693  if (!multi_threading_params.continue_processing) {
694  break;
695  }
696  }
697  auto request = get_request_from_pool(multi_threading_params);
698  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
699 
700  if (residual_buffer_size > 0) {
701  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
702  }
703  size_t size = residual_buffer_size;
704  size += csv_reader.read(request.buffer.get() + residual_buffer_size,
705  alloc_size - residual_buffer_size);
706 
707  if (size == 0) {
708  // In some cases at the end of a file we will read 0 bytes even when
709  // csv_reader.isScanFinished() is false
710  continue;
711  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
712  // In some cases files with newlines at the end will be encoded with a second
713  // newline that can end up being the only thing in the buffer
714  current_file_offset++;
715  continue;
716  }
717  unsigned int num_rows_in_buffer = 0;
718  request.end_pos =
719  import_export::delimited_parser::find_row_end_pos(alloc_size,
720  request.buffer,
721  size,
722  copy_params,
723  first_row_index_in_buffer,
724  num_rows_in_buffer,
725  nullptr,
726  &csv_reader);
727  request.buffer_size = size;
728  request.buffer_alloc_size = alloc_size;
729  request.first_row_index = first_row_index_in_buffer;
730  request.file_offset = current_file_offset;
731  request.buffer_row_count = num_rows_in_buffer;
732 
733  residual_buffer_size = size - request.end_pos;
734  if (residual_buffer_size > 0) {
735  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
736  memcpy(residual_buffer.get(),
737  request.buffer.get() + request.end_pos,
738  residual_buffer_size);
739  }
740 
741  current_file_offset += request.end_pos;
742  first_row_index_in_buffer += num_rows_in_buffer;
743 
744  dispatch_metadata_scan_request(multi_threading_params, request);
745  }
746 
747  std::unique_lock<std::mutex> pending_requests_queue_lock(
748  multi_threading_params.pending_requests_mutex);
749  multi_threading_params.pending_requests_condition.wait(
750  pending_requests_queue_lock, [&multi_threading_params] {
751  return multi_threading_params.pending_requests.empty() ||
752  (multi_threading_params.continue_processing == false);
753  });
754  multi_threading_params.continue_processing = false;
755  pending_requests_queue_lock.unlock();
756  multi_threading_params.pending_requests_condition.notify_all();
757 }
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
size_t find_row_end_pos(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FILE *file, foreign_storage::CsvReader *csv_reader)
Finds the closest possible row ending to the end of the given buffer. The buffer is resized as needed...

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 39 of file ParquetGeospatialEncoder.h.

References omnisci.dtypes::T.

Referenced by foreign_storage::ParquetGeospatialEncoder::processGeoElement(), and foreign_storage::ParquetGeospatialEncoder::processNullGeoElement().

39  {
40  const size_t num_bytes = data.size() * sizeof(T);
41  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
42  memcpy(buffer.get(), data.data(), num_bytes);
43  return ArrayDatum(num_bytes, buffer, false);
44 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:202

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing CSV file(s).

Definition at line 215 of file CsvDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

217  {
218  size_t buffer_size = copy_params.buffer_size;
219  if (size_known && file_size < buffer_size) {
220  buffer_size = file_size + 1; // +1 for end of line character, if missing
221  }
222  return buffer_size;
223 }
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 225 of file CsvDataWrapper.cpp.

References CHECK.

225  {
226  size_t buffer_size = 0;
227  for (const auto& file_region : file_regions) {
228  buffer_size = std::max(buffer_size, file_region.region_size);
229  }
230  CHECK(buffer_size);
231  return buffer_size;
232 }
#define CHECK(condition)
Definition: Logger.h:203
const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 45 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

47  {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_interval_duration ( const std::string &  interval)
inline

Gets the interval duration in seconds.

Parameters
interval - interval string with format of {interval_count}{interval_type} (e.g. 5H for "every 5 hours")
Returns
interval duration in seconds

Definition at line 16 of file RefreshTimeCalculator.h.

References UNREACHABLE.

Referenced by get_next_refresh_time().

16  {
17  int interval_count = std::stoi(interval.substr(0, interval.length() - 1));
18  auto interval_type = std::tolower(interval[interval.length() - 1]);
19  int64_t duration{0};
20  if (interval_type == 's') {
21  duration = interval_count;
22  } else if (interval_type == 'h') {
23  duration = interval_count * 60 * 60;
24  } else if (interval_type == 'd') {
25  duration = interval_count * 60 * 60 * 24;
26  } else {
27  UNREACHABLE();
28  }
29  return duration;
30 }
#define UNREACHABLE()
Definition: Logger.h:247

+ Here is the caller graph for this function:

std::set<ChunkKey> foreign_storage::get_keys_set_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

std::vector<ChunkKey> foreign_storage::get_keys_vec_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > foreign_storage::get_min_max_bounds ( )
inline

Definition at line 41 of file ParquetMetadataValidator.h.

41  {
42  static_assert(std::is_signed<D>::value,
43  "'get_min_max_bounds' is only valid for signed types");
44  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
45 }
std::optional<csv_file_buffer_parser::ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. A null optional is returned if there are no further requests to be processed.

Definition at line 395 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

396  {
397  std::unique_lock<std::mutex> pending_requests_lock(
398  multi_threading_params.pending_requests_mutex);
399  multi_threading_params.pending_requests_condition.wait(
400  pending_requests_lock, [&multi_threading_params] {
401  return !multi_threading_params.pending_requests.empty() ||
402  !multi_threading_params.continue_processing;
403  });
404  if (multi_threading_params.pending_requests.empty()) {
405  return {};
406  }
407  auto request = std::move(multi_threading_params.pending_requests.front());
408  multi_threading_params.pending_requests.pop();
409  pending_requests_lock.unlock();
410  multi_threading_params.pending_requests_condition.notify_all();
411  return std::move(request);
412 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_next_refresh_time ( const std::map< std::string, std::string, std::less<>> &  foreign_table_options)
inline

Definition at line 33 of file RefreshTimeCalculator.h.

References CHECK, count, get_interval_duration(), foreign_storage::ForeignTable::NULL_REFRESH_TIME, foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY, and foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY.

Referenced by Catalog_Namespace::anonymous_namespace{Catalog.cpp}::get_next_refresh_time().

34  {
35  int64_t current_time = std::chrono::duration_cast<std::chrono::seconds>(
36  std::chrono::system_clock::now().time_since_epoch())
37  .count();
38  auto start_date_entry = foreign_table_options.find(
39  foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY);
40  CHECK(start_date_entry != foreign_table_options.end());
41  auto start_date_time = dateTimeParse<kTIMESTAMP>(start_date_entry->second, 0);
42 
43  // If start date time is current or in the future, then that is the next refresh time
44  if (start_date_time >= current_time) {
45  return start_date_time;
46  }
47  auto interval_entry =
48  foreign_table_options.find(foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY);
49  if (interval_entry != foreign_table_options.end()) {
50  auto interval_duration = get_interval_duration(interval_entry->second);
51  auto num_intervals =
52  (current_time - start_date_time + interval_duration - 1) / interval_duration;
53  return start_date_time + (num_intervals * interval_duration);
54  } else {
55  // If this was a one time refresh, then there is no next refresh time
56  return foreign_storage::ForeignTable::NULL_REFRESH_TIME;
57  }
58 }
int64_t get_interval_duration(const std::string &interval)
static constexpr const char * REFRESH_START_DATE_TIME_KEY
Definition: ForeignTable.h:43
int count
static constexpr const char * REFRESH_INTERVAL_KEY
Definition: ForeignTable.h:44
#define CHECK(condition)
Definition: Logger.h:203
static constexpr int NULL_REFRESH_TIME
Definition: ForeignTable.h:51

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 31 of file ParquetMetadataValidator.h.

31  {
32  return inline_int_null_value<V>();
33 }
std::pair< int, int > foreign_storage::get_parquet_table_size ( const ReaderPtr &  reader)

Definition at line 38 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

38  {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }

+ Here is the caller graph for this function:

parquet::Type::type foreign_storage::get_physical_type ( ReaderPtr &  reader,
const int  logical_column_index 
)

Definition at line 51 of file ParquetShared.cpp.

Referenced by anonymous_namespace{ArrowResultSetConverter.cpp}::get_arrow_type(), and ArrowResultSetConverter::initializeColumnBuilder().

51  {
52  return reader->parquet_reader()
53  ->metadata()
54  ->schema()
55  ->Column(logical_column_index)
56  ->physical_type();
57 }

+ Here is the caller graph for this function:

csv_file_buffer_parser::ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool.

Definition at line 629 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

630  {
631  std::unique_lock<std::mutex> request_pool_lock(
632  multi_threading_params.request_pool_mutex);
633  multi_threading_params.request_pool_condition.wait(
634  request_pool_lock,
635  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
636  auto request = std::move(multi_threading_params.request_pool.front());
637  multi_threading_params.request_pool.pop();
638  request_pool_lock.unlock();
639  CHECK(request.buffer);
640  return request;
641 }
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the caller graph for this function:

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor *  column)

Definition at line 75 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

76  {
77  auto column_type = column->columnType.get_elem_type();
78  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
79  column_type.set_size(4); // override default size of -1
80  }
81  return std::make_unique<ColumnDescriptor>(
82  column->tableId, column->columnId, column->columnName, column_type);
83 }
void set_size(int s)
Definition: sqltypes.h:412
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:713
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 238 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

241  {
242  size_t thread_count = copy_params.threads;
243  if (thread_count == 0) {
244  thread_count = std::thread::hardware_concurrency();
245  }
246  if (size_known) {
247  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
248  if (num_buffers_in_file < thread_count) {
249  thread_count = num_buffers_in_file;
250  }
251  }
252  CHECK(thread_count);
253  return thread_count;
254 }
#define CHECK(condition)
Definition: Logger.h:203
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions &  file_regions 
)

Definition at line 256 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

257  {
258  size_t thread_count = copy_params.threads;
259  if (thread_count == 0) {
260  thread_count =
261  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
262  }
263  CHECK(thread_count);
264  return thread_count;
265 }
#define CHECK(condition)
Definition: Logger.h:203
void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 44 of file CsvShared.cpp.

References CHECK, foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

44  {
45  CHECK(json_val.IsObject());
46  json_utils::get_value_from_object(
47  json_val, file_region.first_row_file_offset, "first_row_file_offset");
48  json_utils::get_value_from_object(
49  json_val, file_region.first_row_index, "first_row_index");
50  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
51  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
52  if (json_val.HasMember("filename")) {
53  json_utils::get_value_from_object(json_val, file_region.filename, "filename");
54  }
55 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the call graph for this function:

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 505 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

505  {
506  CHECK(json_val.IsObject());
507  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
508  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
509  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
510 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the call graph for this function:

bool foreign_storage::is_metadata_placeholder ( const ChunkMetadata metadata)
inline

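Checks whether the given ChunkMetadata is a placeholder, i.e. corresponds to a chunk for which metadata must still be populated. Only dictionary-encoded types are currently recognized as placeholders.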

Definition at line 27 of file MetadataPlaceholder.h.

References ChunkMetadata::chunkStats, Datum::intval, SQLTypeInfo::is_dict_encoded_type(), ChunkStats::max, ChunkStats::min, and ChunkMetadata::sqlType.

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::prepare_string_dictionaries().

27  {
28  return metadata.chunkStats.min.intval > metadata.chunkStats.max.intval &&
29  metadata.sqlType.is_dict_encoded_type(); // Only supported type for now
30 }
int32_t intval
Definition: sqltypes.h:208
ChunkStats chunkStats
Definition: ChunkMetadata.h:35
bool is_dict_encoded_type() const
Definition: sqltypes.h:530
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

UniqueReaderPtr foreign_storage::open_parquet_table ( const std::string &  file_path,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 25 of file ParquetShared.cpp.

Referenced by foreign_storage::FileReaderMap::getOrInsert(), and foreign_storage::FileReaderMap::insert().

26  {
27  UniqueReaderPtr reader;
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35  return reader;
36 }
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32

+ Here is the caller graph for this function:

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions &  file_regions,
const size_t  start_index,
const size_t  end_index,
CsvReader &  csv_reader,
csv_file_buffer_parser::ParseBufferRequest &  parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map 
)

Parses a set of file regions, given a reader for the file and an inclusive range of indexes (start_index through end_index) identifying the file regions to be parsed.

Definition at line 175 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferRequest::begin_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::end_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::file_offset, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, i, foreign_storage::csv_file_buffer_parser::parse_buffer(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::process_row_count, foreign_storage::CsvReader::readRegion(), run_benchmark_import::result, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count.

Referenced by foreign_storage::CsvDataWrapper::populateChunks().

181  {
182  ParseFileRegionResult load_file_region_result{};
183  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
184  load_file_region_result.row_count = 0;
185 
186  csv_file_buffer_parser::ParseBufferResult result;
187  for (size_t i = start_index; i <= end_index; i++) {
188  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
189  size_t read_size;
190  {
191  read_size = csv_reader.readRegion(parse_file_request.buffer.get(),
192  file_regions[i].first_row_file_offset,
193  file_regions[i].region_size);
194  }
195 
196  CHECK_EQ(file_regions[i].region_size, read_size);
197  parse_file_request.begin_pos = 0;
198  parse_file_request.end_pos = file_regions[i].region_size;
199  parse_file_request.first_row_index = file_regions[i].first_row_index;
200  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
201  parse_file_request.process_row_count = file_regions[i].row_count;
202 
203  result = parse_buffer(parse_file_request, i == end_index);
204  CHECK_EQ(file_regions[i].row_count, result.row_count);
205  load_file_region_result.row_count += result.row_count;
206  }
207  load_file_region_result.column_id_to_data_blocks_map =
208  result.column_id_to_data_blocks_map;
209  return load_file_region_result;
210 }
#define CHECK_EQ(x, y)
Definition: Logger.h:211
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered)
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 344 of file CsvDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

346  {
347  CHECK(buffer_row_count > 0);
348  std::vector<size_t> partitions{};
349  size_t remaining_rows_in_last_fragment;
350  if (start_row_index % max_fragment_size == 0) {
351  remaining_rows_in_last_fragment = 0;
352  } else {
353  remaining_rows_in_last_fragment =
354  max_fragment_size - (start_row_index % max_fragment_size);
355  }
356  if (buffer_row_count <= remaining_rows_in_last_fragment) {
357  partitions.emplace_back(buffer_row_count);
358  } else {
359  if (remaining_rows_in_last_fragment > 0) {
360  partitions.emplace_back(remaining_rows_in_last_fragment);
361  }
362  size_t remaining_buffer_row_count =
363  buffer_row_count - remaining_rows_in_last_fragment;
364  while (remaining_buffer_row_count > 0) {
365  partitions.emplace_back(
366  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
367  remaining_buffer_row_count -= partitions.back();
368  }
369  }
370  return partitions;
371 }
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the caller graph for this function:

template<typename T >
auto foreign_storage::partition_for_threads ( const std::set< T > &  items,
size_t  max_threads 
)

Definition at line 41 of file ParquetShared.h.

References i.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::ParquetDataWrapper::populateChunkBuffers().

41  {
42  const size_t items_per_thread = (items.size() + (max_threads - 1)) / max_threads;
43  std::list<std::set<T>> items_by_thread;
44  auto i = 0U;
45  for (auto item : items) {
46  if (i++ % items_per_thread == 0) {
47  items_by_thread.emplace_back(std::set<T>{});
48  }
49  items_by_thread.back().emplace(item);
50  }
51  return items_by_thread;
52 }

+ Here is the caller graph for this function:

void foreign_storage::process_data_blocks ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const csv_file_buffer_parser::ParseBufferRequest &  request,
csv_file_buffer_parser::ParseBufferResult &  result,
std::map< int, const ColumnDescriptor * > &  column_by_id,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Updates metadata encapsulated in encoders for all table columns, given new data blocks obtained from parsing a new set of rows in a CSV file buffer. If a cache is available, also appends the data blocks to chunks in the cache.

Definition at line 509 of file CsvDataWrapper.cpp.

References add_file_region(), cache_blocks(), foreign_storage::MetadataScanMultiThreadingParams::cached_chunks, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::db_id, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getFilePath(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getMaxFragRows(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getTableId(), foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and update_stats().

Referenced by scan_metadata().

514  {
515  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_encoder_buffers_mutex);
516  // File regions should be added in same order as appendData
517  add_file_region(fragment_id_to_file_regions_map,
518  fragment_id,
519  request.first_row_index,
520  result,
521  request.getFilePath());
522 
523  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
524  ChunkKey chunk_key{request.db_id, request.getTableId(), column_id, fragment_id};
525  const auto column = column_by_id[column_id];
526  if (column->columnType.is_varlen_indeed()) {
527  chunk_key.emplace_back(1);
528  }
529  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
530  multi_threading_params.chunk_encoder_buffers.end()) {
531  multi_threading_params.chunk_encoder_buffers[chunk_key] =
532  std::make_unique<ForeignStorageBuffer>();
533  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
534  column->columnType);
535  }
536  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
537  column->columnType,
538  data_block,
539  result.row_count);
540  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
541  ->getEncoder()
542  ->getNumElems() +
543  result.row_count;
544  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
545  num_elements);
546  cache_blocks(
547  multi_threading_params.cached_chunks,
548  data_block,
549  result.row_count,
550  chunk_key,
551  column,
552  (num_elements - result.row_count) == 0, // Is the first block added to this chunk
553  num_elements == request.getMaxFragRows() // Is the last block for this chunk
554  );
555  }
556 }
std::vector< int > ChunkKey
Definition: types.h:37
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
void cache_blocks(std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 22 of file ForeignTableRefresh.cpp.

References CHUNK_KEY_FRAGMENT_IDX, Data_Namespace::CPU_LEVEL, Catalog_Namespace::DBMetadata::dbId, StorageType::FOREIGN_TABLE, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Catalog_Namespace::Catalog::getForeignTableUnlocked(), PostEvictionRefreshException::getOriginalException(), Data_Namespace::GPU_LEVEL, foreign_storage::ForeignTable::isAppendMode(), Catalog_Namespace::Catalog::removeFragmenterForTable(), and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

24  {
25  auto& data_mgr = catalog.getDataMgr();
26  auto table_lock =
27  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
29  catalog, table_name, false));
30 
31  const TableDescriptor* td = (*table_lock)();
32  if (td->storageType != StorageType::FOREIGN_TABLE) {
33  throw std::runtime_error{
34  table_name +
35  " is not a foreign table. Refreshes are applicable to only foreign tables."};
36  }
37 
38  catalog.removeFragmenterForTable(td->tableId);
39  ChunkKey table_key{catalog.getCurrentDB().dbId, td->tableId};
40 
41  if (catalog.getForeignTableUnlocked(td->tableId)->isAppendMode() &&
42  !evict_cached_entries) {
43  ChunkMetadataVector metadata_vec;
44  data_mgr.getChunkMetadataVecForKeyPrefix(metadata_vec, table_key);
45  int last_fragment_id = 0;
46  for (const auto& [key, metadata] : metadata_vec) {
47  if (key[CHUNK_KEY_FRAGMENT_IDX] > last_fragment_id) {
48  last_fragment_id = key[CHUNK_KEY_FRAGMENT_IDX];
49  }
50  }
51  for (const auto& [key, metadata] : metadata_vec) {
52  if (key[CHUNK_KEY_FRAGMENT_IDX] == last_fragment_id) {
53  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::CPU_LEVEL);
54  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::GPU_LEVEL);
55  }
56  }
57  } else {
58  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::CPU_LEVEL);
59  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::GPU_LEVEL);
60  }
61 
62  try {
63  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
64  table_key, evict_cached_entries);
65  catalog.updateForeignTableRefreshTimes(td->tableId);
66  } catch (PostEvictionRefreshException& e) {
67  catalog.updateForeignTableRefreshTimes(td->tableId);
68  throw e.getOriginalException();
69  }
70 }
std::vector< int > ChunkKey
Definition: types.h:37
const foreign_storage::ForeignTable * getForeignTableUnlocked(int tableId) const
Definition: Catalog.cpp:1727
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:223
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:222
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
bool isAppendMode() const
Checks if the table is in append mode.
void removeFragmenterForTable(const int table_id) const
Definition: Catalog.cpp:3444
static constexpr char const * FOREIGN_TABLE
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:4727

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

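Reallocates the given buffer to alloc_size when the current buffer size is smaller than the requested allocation size, and updates buffer_size to match. The previous contents of the buffer are not preserved.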

Definition at line 662 of file CsvDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

664  {
665  CHECK_LE(buffer_size, alloc_size);
666  if (buffer_size < alloc_size) {
667  buffer = std::make_unique<char[]>(alloc_size);
668  buffer_size = alloc_size;
669  }
670 }
#define CHECK_LE(x, y)
Definition: Logger.h:214

+ Here is the caller graph for this function:

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 575 of file CsvDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, get_next_metadata_scan_request(), foreign_storage::csv_file_buffer_parser::parse_buffer(), partition_by_fragment(), foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, process_data_blocks(), and run_benchmark_import::result.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

576  {
577  std::map<int, const ColumnDescriptor*> column_by_id{};
578  while (true) {
579  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
580  if (!request_opt.has_value()) {
581  break;
582  }
583  auto& request = request_opt.value();
584  try {
585  if (column_by_id.empty()) {
586  for (const auto column : request.getColumns()) {
587  column_by_id[column->columnId] = column;
588  }
589  }
590  auto partitions = partition_by_fragment(
591  request.first_row_index, request.getMaxFragRows(), request.buffer_row_count);
592  request.begin_pos = 0;
593  size_t row_index = request.first_row_index;
594  for (const auto partition : partitions) {
595  request.process_row_count = partition;
596  for (const auto& import_buffer : request.import_buffers) {
597  if (import_buffer != nullptr) {
598  import_buffer->clear();
599  }
600  }
601  auto result = parse_buffer(request, true);
602  int fragment_id = row_index / request.getMaxFragRows();
603  process_data_blocks(multi_threading_params,
604  fragment_id,
605  request,
606  result,
607  column_by_id,
608  fragment_id_to_file_regions_map);
609  row_index += result.row_count;
610  request.begin_pos = result.row_offsets.back() - request.file_offset;
611  }
612  } catch (...) {
613  // Re-add request to pool so we dont block any other threads
614  {
615  std::lock_guard<std::mutex> pending_requests_lock(
616  multi_threading_params.pending_requests_mutex);
617  multi_threading_params.continue_processing = false;
618  }
619  add_request_to_pool(multi_threading_params, request);
620  throw;
621  }
622  add_request_to_pool(multi_threading_params, request);
623  }
624 }
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered)
void process_data_blocks(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
std::optional< csv_file_buffer_parser::ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion &  file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 26 of file CsvShared.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

28  {
29  json_val.SetObject();
30  json_utils::add_value_to_object(
31  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
32  json_utils::add_value_to_object(
33  json_val, file_region.first_row_index, "first_row_index", allocator);
34  json_utils::add_value_to_object(
35  json_val, file_region.region_size, "region_size", allocator);
36  json_utils::add_value_to_object(
37  json_val, file_region.row_count, "row_count", allocator);
38  if (file_region.filename.size()) {
39  json_utils::add_value_to_object(
40  json_val, file_region.filename, "filename", allocator);
41  }
42 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval &  value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 496 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

498  {
499  json_val.SetObject();
500  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
501  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
502  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
503 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::throw_number_of_columns_mismatch_error ( size_t  num_table_cols,
size_t  num_file_cols,
const std::string &  file_path 
)
inline

Definition at line 36 of file ForeignDataWrapperShared.h.

References to_string().

Referenced by foreign_storage::csv_file_buffer_parser::validate_expected_column_count(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

38  {
39  throw std::runtime_error{"Mismatched number of logical columns: (expected " +
40  std::to_string(num_table_cols) + " columns, has " +
41  std::to_string(num_file_cols) + "): in file '" + file_path +
42  "'"};
43 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::throw_parquet_metadata_out_of_bounds_error ( const std::string &  min_value,
const std::string &  max_value,
const std::string &  encountered_value 
)
inline

Definition at line 67 of file ParquetMetadataValidator.h.

Referenced by foreign_storage::TimestampBoundsValidator< T >::validateValue(), foreign_storage::IntegralFixedLengthBoundsValidator< T >::validateValue(), foreign_storage::DateInSecondsBoundsValidator< T >::validateValue(), and foreign_storage::FloatPointValidator< T >::validateValue().

70  {
71  std::stringstream error_message;
72  error_message << "Parquet column contains values that are outside the range of the "
73  "OmniSci column "
74  "type. Consider using a wider column type. Min allowed value: "
75  << min_value << ". Max allowed value: " << max_value
76  << ". Encountered value: " << encountered_value << ".";
77  throw std::runtime_error(error_message.str());
78 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 30 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

30  {
31  throw std::runtime_error{
32  "Refresh of foreign table created with \"APPEND\" update type failed as "
33  "file \"" +
34  file_path + "\" was removed."};
35 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_error ( const std::string &  file_path)
inline

Definition at line 23 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::SingleFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

23  {
24  throw std::runtime_error{
25  "Refresh of foreign table created with \"APPEND\" update type failed as file "
26  "reduced in size: " +
27  file_path};
28 }

+ Here is the caller graph for this function:

void foreign_storage::update_stats ( Encoder encoder,
const SQLTypeInfo column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 435 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by process_data_blocks(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumn(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata(), and StorageIOFacility::yieldUpdateCallback().

438  {
439  if (column_type.is_array()) {
440  encoder->updateStats(data_block.arraysPtr, 0, row_count);
441  } else if (!column_type.is_varlen()) {
442  encoder->updateStats(data_block.numbersPtr, row_count);
443  } else {
444  encoder->updateStats(data_block.stringsPtr, 0, row_count);
445  }
446 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:221
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:222
bool is_varlen() const
Definition: sqltypes.h:514
int8_t * numbersPtr
Definition: sqltypes.h:220
virtual void updateStats(const int64_t val, const bool is_null)=0
bool is_array() const
Definition: sqltypes.h:497

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< parquet::Statistics > foreign_storage::validate_and_get_column_metadata_statistics ( const parquet::ColumnChunkMetaData *  column_metadata)

Definition at line 85 of file ParquetShared.cpp.

References CHECK.

Referenced by foreign_storage::ParquetEncoder::getRowGroupMetadata(), and foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata().

86  {
87  CHECK(column_metadata->is_stats_set());
88  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
89  bool is_all_nulls = stats->null_count() == column_metadata->num_values();
90  CHECK(is_all_nulls || stats->HasMinMax());
91  return stats;
92 }
#define CHECK(condition)
Definition: Logger.h:203

+ Here is the caller graph for this function:

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 59 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

63  {
64  if (!reference_descriptor->Equals(*new_descriptor)) {
65  throw std::runtime_error{"Parquet file \"" + new_file_path +
66  "\" has a different schema. Please ensure that all Parquet "
67  "files use the same schema. Reference Parquet file: " +
68  reference_file_path +
69  ", column name: " + reference_descriptor->name() +
70  ". New Parquet file: " + new_file_path +
71  ", column name: " + new_descriptor->name() + "."};
72  }
73 }

+ Here is the caller graph for this function:

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::InsertValuesStmt::determineLeafIndex(), Parser::TruncateTableStmt::execute(), Parser::InsertValuesStmt::execute(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, TRUNCATE, OR UPDATE commands are not supported for foreign "
26  "tables."};
27  }
28 }
std::string storageType
static constexpr char const * FOREIGN_TABLE

+ Here is the caller graph for this function:

Variable Documentation