OmniSciDB  a667adc9c8
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{AbstractFileStorageDataWrapper.cpp}
 
 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvDataWrapper.cpp}
 
 anonymous_namespace{CsvReader.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 Csv
 
 csv_file_buffer_parser
 
 json_utils
 

Classes

struct  ForeignServer
 
struct  ForeignTable
 
struct  OptionsContainer
 
class  AbstractFileStorageDataWrapper
 
class  CachingForeignStorageMgr
 
struct  ParseFileRegionResult
 
struct  MetadataScanMultiThreadingParams
 
class  CsvDataWrapper
 
class  CsvReader
 
class  SingleFileReader
 
class  ArchiveWrapper
 
class  CompressedFileReader
 
class  MultiFileReader
 
class  LocalMultiFileReader
 
struct  FileRegion
 
class  ForeignDataWrapper
 
struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
class  ForeignDataWrapperFactory
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageException
 
class  MockForeignDataWrapper
 
class  ForeignStorageMgr
 
class  ForeignTableSchema
 
struct  ColumnType
 
struct  FragmentType
 
struct  Interval
 
class  LazyParquetChunkLoader
 
class  ParquetArrayEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateFromTimestampEncoder
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetEncoder
 
class  ParquetScalarEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetInPlaceEncoder
 
class  TypedParquetInPlaceEncoder
 
class  ParquetMetadataValidator
 
class  TimestampBoundsValidator
 
class  IntegralFixedLengthBoundsValidator
 
class  DateInSecondsBoundsValidator
 
class  FloatPointValidator
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  FileReaderMap
 
class  ParquetStringEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetVariableLengthArrayEncoder
 
class  ForeignTableRefreshScheduler
 

Typedefs

using OptionsMap = std::map< std::string, std::string, std::less<>>
 
using FileRegions = std::vector< FileRegion >
 
using ChunkToBufferMap = std::map< ChunkKey, AbstractBuffer * >
 
using read_lock = mapd_shared_lock< mapd_shared_mutex >
 
using write_lock = mapd_unique_lock< mapd_shared_mutex >
 
using UniqueReaderPtr = std::unique_ptr< parquet::arrow::FileReader >
 
using ReaderPtr = parquet::arrow::FileReader *
 

Functions

int64_t get_interval_duration (const std::string &interval)
 
int64_t get_next_refresh_time (const std::map< std::string, std::string, std::less<>> &foreign_table_options)
 
ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, CsvReader &csv_reader, csv_file_buffer_parser::ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional
< csv_file_buffer_parser::ParseBufferRequest > 
get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)
 
size_t get_var_length_data_block_size (DataBlockPtr data_block, SQLTypeInfo sql_type_info)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void cache_blocks (std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
 
void process_data_blocks (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, CsvReader &csv_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
void throw_removed_row_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
void throw_number_of_columns_mismatch_error (size_t num_table_cols, size_t num_file_cols, const std::string &file_path)
 
std::vector< ChunkKey > get_keys_vec_from_table (const ChunkKey &destination_chunk_key)
 
std::set< ChunkKey > get_keys_set_from_table (const ChunkKey &destination_chunk_key)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
bool is_metadata_placeholder (const ChunkMetadata &metadata)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > get_min_max_bounds ()
 
template<typename D , typename T >
bool check_bounds (const T &value)
 
template<typename D >
std::string datetime_to_string (const D &timestamp, const SQLTypeInfo &column_type)
 
void throw_parquet_metadata_out_of_bounds_error (const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
 
UniqueReaderPtr open_parquet_table (const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const ReaderPtr &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (ReaderPtr &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
std::shared_ptr
< parquet::Statistics > 
validate_and_get_column_metadata_statistics (const parquet::ColumnChunkMetaData *column_metadata)
 
template<typename T >
auto partition_for_threads (const std::set< T > &items, size_t max_threads)
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Variables

const std::string wrapper_file_name = "wrapper_metadata.json"
 

Typedef Documentation

using foreign_storage::ChunkToBufferMap = typedef std::map<ChunkKey, AbstractBuffer*>

Definition at line 28 of file ForeignDataWrapper.h.

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 71 of file CsvShared.h.

using foreign_storage::OptionsMap = typedef std::map<std::string, std::string, std::less<>>

Definition at line 30 of file OptionsContainer.h.

using foreign_storage::ReaderPtr = typedef parquet::arrow::FileReader*

Definition at line 33 of file ParquetShared.h.

using foreign_storage::UniqueReaderPtr = typedef std::unique_ptr<parquet::arrow::FileReader>

Definition at line 32 of file ParquetShared.h.

Function Documentation

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
int  fragment_id,
size_t  first_row_index,
const csv_file_buffer_parser::ParseBufferResult &  result,
const std::string &  file_path 
)

Creates a new file region based on metadata from parsed CSV file buffers and adds the new region to the fragment id to file regions map.

Definition at line 420 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_offsets.

Referenced by process_data_blocks().

424  {
425  fragment_id_to_file_regions_map[fragment_id].emplace_back(
426  // file naming is handled by CsvReader
427  FileRegion(result.row_offsets.front(),
428  first_row_index,
429  result.row_count,
430  result.row_offsets.back() - result.row_offsets.front()));
431 }

+ Here is the caller graph for this function:

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 596 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by scan_metadata().

597  {
598  std::unique_lock<std::mutex> completed_requests_queue_lock(
599  multi_threading_params.request_pool_mutex);
600  multi_threading_params.request_pool.emplace(std::move(request));
601  completed_requests_queue_lock.unlock();
602  multi_threading_params.request_pool_condition.notify_all();
603 }

+ Here is the caller graph for this function:

void foreign_storage::cache_blocks ( std::map< ChunkKey, Chunk_NS::Chunk > &  cached_chunks,
DataBlockPtr  data_block,
size_t  row_count,
ChunkKey &  chunk_key,
const ColumnDescriptor *  column,
bool  is_first_block,
bool  is_last_block 
)

Definition at line 486 of file CsvDataWrapper.cpp.

References CHECK, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{CsvDataWrapper.cpp}::get_cache_if_enabled(), Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), and SQLTypeInfo::is_varlen_indeed().

Referenced by process_data_blocks().

492  {
493  auto catalog =
494  Catalog_Namespace::SysCatalog::instance().getCatalog(chunk_key[CHUNK_KEY_DB_IDX]);
495  CHECK(catalog);
496  auto cache = get_cache_if_enabled(catalog);
497  if (cache) {
498  ChunkKey index_key = {chunk_key[CHUNK_KEY_DB_IDX],
499  chunk_key[CHUNK_KEY_TABLE_IDX],
500  chunk_key[CHUNK_KEY_COLUMN_IDX],
501  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
502  2};
503  // Create actual data chunks to prepopulate cache
504  if (cached_chunks.find(chunk_key) == cached_chunks.end()) {
505  cached_chunks[chunk_key] = Chunk_NS::Chunk{column};
506  cached_chunks[chunk_key].setBuffer(
507  cache->getChunkBufferForPrecaching(chunk_key, is_first_block));
508  if (column->columnType.is_varlen_indeed()) {
509  cached_chunks[chunk_key].setIndexBuffer(
510  cache->getChunkBufferForPrecaching(index_key, is_first_block));
511  }
512  if (is_first_block) {
513  cached_chunks[chunk_key].initEncoder();
514  }
515  }
516  cached_chunks[chunk_key].appendData(data_block, row_count, 0);
517  if (is_last_block) {
518  // cache the chunks now so they are tracked by eviction algorithm
519  std::vector<ChunkKey> key_to_cache{chunk_key};
520  if (column->columnType.is_varlen_indeed()) {
521  key_to_cache.push_back(index_key);
522  }
523  cache->cacheTableChunks(key_to_cache);
524  }
525  }
526 }
std::vector< int > ChunkKey
Definition: types.h:37
foreign_storage::ForeignStorageCache * get_cache_if_enabled(std::shared_ptr< Catalog_Namespace::Catalog > &catalog)
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
static SysCatalog & instance()
Definition: SysCatalog.h:292
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:197
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:519
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename D , typename T >
bool foreign_storage::check_bounds ( const T &  value)
inline

Definition at line 53 of file ParquetMetadataValidator.h.

53  {
54  auto [min_value, max_value] = get_min_max_bounds<D>();
55  return value >= min_value && value <= max_value;
56 }
template<typename D >
std::string foreign_storage::datetime_to_string ( const D &  timestamp,
const SQLTypeInfo &  column_type 
)
inline

Definition at line 59 of file ParquetMetadataValidator.h.

References CHECK, test_fsi::d, DatumToString(), SQLTypeInfo::is_date(), and SQLTypeInfo::is_timestamp().

Referenced by foreign_storage::TimestampBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::DateInSecondsBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::TimestampBoundsValidator< T >::validateValue(), and foreign_storage::DateInSecondsBoundsValidator< T >::validateValue().

60  {
61  CHECK(column_type.is_timestamp() || column_type.is_date());
62  Datum d;
63  d.bigintval = timestamp;
64  return DatumToString(d, column_type);
65 }
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:356
bool is_timestamp() const
Definition: sqltypes.h:742
tuple d
Definition: test_fsi.py:9
#define CHECK(condition)
Definition: Logger.h:197
bool is_date() const
Definition: sqltypes.h:730

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
csv_file_buffer_parser::ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 681 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

683  {
684  {
685  std::unique_lock<std::mutex> pending_requests_lock(
686  multi_threading_params.pending_requests_mutex);
687  multi_threading_params.pending_requests.emplace(std::move(request));
688  }
689  multi_threading_params.pending_requests_condition.notify_all();
690 }

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
CsvReader &  csv_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset 
)

Reads from a CSV file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 710 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), import_export::delimited_parser::find_row_end_pos(), get_request_from_pool(), foreign_storage::CsvReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::CsvReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

717  {
718  auto alloc_size = buffer_size;
719  auto residual_buffer = std::make_unique<char[]>(alloc_size);
720  size_t residual_buffer_size = 0;
721  size_t residual_buffer_alloc_size = alloc_size;
722 
723  while (!csv_reader.isScanFinished()) {
724  {
725  std::lock_guard<std::mutex> pending_requests_lock(
726  multi_threading_params.pending_requests_mutex);
727  if (!multi_threading_params.continue_processing) {
728  break;
729  }
730  }
731  auto request = get_request_from_pool(multi_threading_params);
732  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
733 
734  if (residual_buffer_size > 0) {
735  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
736  }
737  size_t size = residual_buffer_size;
738  size += csv_reader.read(request.buffer.get() + residual_buffer_size,
739  alloc_size - residual_buffer_size);
740 
741  if (size == 0) {
742  // In some cases at the end of a file we will read 0 bytes even when
743  // csv_reader.isScanFinished() is false
744  continue;
745  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
746  // In some cases files with newlines at the end will be encoded with a second
747  // newline that can end up being the only thing in the buffer
748  current_file_offset++;
749  continue;
750  }
751  unsigned int num_rows_in_buffer = 0;
752  request.end_pos =
753  import_export::delimited_parser::find_row_end_pos(alloc_size,
754  request.buffer,
755  size,
756  copy_params,
757  first_row_index_in_buffer,
758  num_rows_in_buffer,
759  nullptr,
760  &csv_reader);
761  request.buffer_size = size;
762  request.buffer_alloc_size = alloc_size;
763  request.first_row_index = first_row_index_in_buffer;
764  request.file_offset = current_file_offset;
765  request.buffer_row_count = num_rows_in_buffer;
766 
767  residual_buffer_size = size - request.end_pos;
768  if (residual_buffer_size > 0) {
769  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
770  memcpy(residual_buffer.get(),
771  request.buffer.get() + request.end_pos,
772  residual_buffer_size);
773  }
774 
775  current_file_offset += request.end_pos;
776  first_row_index_in_buffer += num_rows_in_buffer;
777 
778  dispatch_metadata_scan_request(multi_threading_params, request);
779  }
780 
781  std::unique_lock<std::mutex> pending_requests_queue_lock(
782  multi_threading_params.pending_requests_mutex);
783  multi_threading_params.pending_requests_condition.wait(
784  pending_requests_queue_lock, [&multi_threading_params] {
785  return multi_threading_params.pending_requests.empty() ||
786  (multi_threading_params.continue_processing == false);
787  });
788  multi_threading_params.continue_processing = false;
789  pending_requests_queue_lock.unlock();
790  multi_threading_params.pending_requests_condition.notify_all();
791 }
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
csv_file_buffer_parser::ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
size_t find_row_end_pos(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FILE *file, foreign_storage::CsvReader *csv_reader)
Finds the closest possible row ending to the end of the given buffer. The buffer is resized as needed...

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 39 of file ParquetGeospatialEncoder.h.

References omnisci.dtypes::T.

Referenced by foreign_storage::ParquetGeospatialEncoder::processGeoElement(), and foreign_storage::ParquetGeospatialEncoder::processNullGeoElement().

39  {
40  const size_t num_bytes = data.size() * sizeof(T);
41  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
42  memcpy(buffer.get(), data.data(), num_bytes);
43  return ArrayDatum(num_bytes, buffer, false);
44 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:202

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing CSV file(s).

Definition at line 215 of file CsvDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

217  {
218  size_t buffer_size = copy_params.buffer_size;
219  if (size_known && file_size < buffer_size) {
220  buffer_size = file_size + 1; // +1 for end of line character, if missing
221  }
222  return buffer_size;
223 }
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 225 of file CsvDataWrapper.cpp.

References CHECK.

225  {
226  size_t buffer_size = 0;
227  for (const auto& file_region : file_regions) {
228  buffer_size = std::max(buffer_size, file_region.region_size);
229  }
230  CHECK(buffer_size);
231  return buffer_size;
232 }
#define CHECK(condition)
Definition: Logger.h:197
const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 45 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

47  {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_interval_duration ( const std::string &  interval)
inline

Gets the interval duration in seconds.

Parameters
interval - interval string with format of {interval_count}{interval_type} (e.g. 5H for "every 5 hours")
Returns
interval duration in seconds

Definition at line 16 of file RefreshTimeCalculator.h.

References UNREACHABLE.

Referenced by get_next_refresh_time().

16  {
17  int interval_count = std::stoi(interval.substr(0, interval.length() - 1));
18  auto interval_type = std::tolower(interval[interval.length() - 1]);
19  int64_t duration{0};
20  if (interval_type == 's') {
21  duration = interval_count;
22  } else if (interval_type == 'h') {
23  duration = interval_count * 60 * 60;
24  } else if (interval_type == 'd') {
25  duration = interval_count * 60 * 60 * 24;
26  } else {
27  UNREACHABLE();
28  }
29  return duration;
30 }
#define UNREACHABLE()
Definition: Logger.h:241

+ Here is the caller graph for this function:

std::set<ChunkKey> foreign_storage::get_keys_set_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

std::vector<ChunkKey> foreign_storage::get_keys_vec_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > foreign_storage::get_min_max_bounds ( )
inline

Definition at line 41 of file ParquetMetadataValidator.h.

41  {
42  static_assert(std::is_signed<D>::value,
43  "'get_min_max_bounds' is only valid for signed types");
44  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
45 }
std::optional<csv_file_buffer_parser::ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. A null optional is returned if there are no further requests to be processed.

Definition at line 397 of file CsvDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

398  {
399  std::unique_lock<std::mutex> pending_requests_lock(
400  multi_threading_params.pending_requests_mutex);
401  multi_threading_params.pending_requests_condition.wait(
402  pending_requests_lock, [&multi_threading_params] {
403  return !multi_threading_params.pending_requests.empty() ||
404  !multi_threading_params.continue_processing;
405  });
406  if (multi_threading_params.pending_requests.empty()) {
407  return {};
408  }
409  auto request = std::move(multi_threading_params.pending_requests.front());
410  multi_threading_params.pending_requests.pop();
411  pending_requests_lock.unlock();
412  multi_threading_params.pending_requests_condition.notify_all();
413  return std::move(request);
414 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_next_refresh_time ( const std::map< std::string, std::string, std::less<>> &  foreign_table_options)
inline

Definition at line 33 of file RefreshTimeCalculator.h.

References CHECK, count, get_interval_duration(), foreign_storage::ForeignTable::NULL_REFRESH_TIME, foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY, and foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY.

Referenced by Catalog_Namespace::anonymous_namespace{Catalog.cpp}::get_next_refresh_time().

34  {
35  int64_t current_time = std::chrono::duration_cast<std::chrono::seconds>(
36  std::chrono::system_clock::now().time_since_epoch())
37  .count();
38  auto start_date_entry = foreign_table_options.find(
39  foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY);
40  CHECK(start_date_entry != foreign_table_options.end());
41  auto start_date_time = dateTimeParse<kTIMESTAMP>(start_date_entry->second, 0);
42 
43  // If start date time is current or in the future, then that is the next refresh time
44  if (start_date_time >= current_time) {
45  return start_date_time;
46  }
47  auto interval_entry =
48  foreign_table_options.find(foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY);
49  if (interval_entry != foreign_table_options.end()) {
50  auto interval_duration = get_interval_duration(interval_entry->second);
51  auto num_intervals =
52  (current_time - start_date_time + interval_duration - 1) / interval_duration;
53  return start_date_time + (num_intervals * interval_duration);
54  } else {
55  // If this was a one time refresh, then there is no next refresh time
56  return foreign_storage::ForeignTable::NULL_REFRESH_TIME;
57  }
58 }
int64_t get_interval_duration(const std::string &interval)
static constexpr const char * REFRESH_START_DATE_TIME_KEY
Definition: ForeignTable.h:43
int count
static constexpr const char * REFRESH_INTERVAL_KEY
Definition: ForeignTable.h:44
#define CHECK(condition)
Definition: Logger.h:197
static constexpr int NULL_REFRESH_TIME
Definition: ForeignTable.h:51

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 31 of file ParquetMetadataValidator.h.

31  {
32  return inline_int_null_value<V>();
33 }
std::pair< int, int > foreign_storage::get_parquet_table_size ( const ReaderPtr &  reader)

Definition at line 38 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

38  {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }

+ Here is the caller graph for this function:

parquet::Type::type foreign_storage::get_physical_type ( ReaderPtr &  reader,
const int  logical_column_index 
)

Definition at line 51 of file ParquetShared.cpp.

Referenced by anonymous_namespace{ArrowResultSetConverter.cpp}::get_arrow_type(), and ArrowResultSetConverter::initializeColumnBuilder().

51  {
52  return reader->parquet_reader()
53  ->metadata()
54  ->schema()
55  ->Column(logical_column_index)
56  ->physical_type();
57 }

+ Here is the caller graph for this function:

csv_file_buffer_parser::ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool.

Definition at line 663 of file CsvDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

664  {
665  std::unique_lock<std::mutex> request_pool_lock(
666  multi_threading_params.request_pool_mutex);
667  multi_threading_params.request_pool_condition.wait(
668  request_pool_lock,
669  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
670  auto request = std::move(multi_threading_params.request_pool.front());
671  multi_threading_params.request_pool.pop();
672  request_pool_lock.unlock();
673  CHECK(request.buffer);
674  return request;
675 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor *  column)

Definition at line 75 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

76  {
77  auto column_type = column->columnType.get_elem_type();
78  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
79  column_type.set_size(4); // override default size of -1
80  }
81  return std::make_unique<ColumnDescriptor>(
82  column->tableId, column->columnId, column->columnName, column_type);
83 }
void set_size(int s)
Definition: sqltypes.h:412
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:712
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 238 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata(), and foreign_storage::CsvDataWrapper::populateChunks().

241  {
242  size_t thread_count = copy_params.threads;
243  if (thread_count == 0) {
244  thread_count = std::thread::hardware_concurrency();
245  }
246  if (size_known) {
247  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
248  if (num_buffers_in_file < thread_count) {
249  thread_count = num_buffers_in_file;
250  }
251  }
252  CHECK(thread_count);
253  return thread_count;
254 }
#define CHECK(condition)
Definition: Logger.h:197
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions &  file_regions 
)

Definition at line 256 of file CsvDataWrapper.cpp.

References CHECK, and import_export::CopyParams::threads.

257  {
258  size_t thread_count = copy_params.threads;
259  if (thread_count == 0) {
260  thread_count =
261  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
262  }
263  CHECK(thread_count);
264  return thread_count;
265 }
#define CHECK(condition)
Definition: Logger.h:197
void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 44 of file CsvShared.cpp.

References CHECK, foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

44  {
45  CHECK(json_val.IsObject());
46  json_utils::get_value_from_object(
47  json_val, file_region.first_row_file_offset, "first_row_file_offset");
48  json_utils::get_value_from_object(
49  json_val, file_region.first_row_index, "first_row_index");
50  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
51  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
52  if (json_val.HasMember("filename")) {
53  json_utils::get_value_from_object(json_val, file_region.filename, "filename");
54  }
55 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 505 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

505  {
506  CHECK(json_val.IsObject());
507  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
508  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
509  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
510 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:126
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

size_t foreign_storage::get_var_length_data_block_size ( DataBlockPtr  data_block,
SQLTypeInfo  sql_type_info 
)

Get the total number of bytes in the given data block for a variable length column.

Definition at line 437 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, CHECK, SQLTypeInfo::is_array(), SQLTypeInfo::is_geometry(), SQLTypeInfo::is_string(), SQLTypeInfo::is_varlen(), DataBlockPtr::stringsPtr, and UNREACHABLE.

Referenced by process_data_blocks().

438  {
439  CHECK(sql_type_info.is_varlen());
440  size_t byte_count = 0;
441  if (sql_type_info.is_string() || sql_type_info.is_geometry()) {
442  for (const auto& str : *data_block.stringsPtr) {
443  byte_count += str.length();
444  }
445  } else if (sql_type_info.is_array()) {
446  for (const auto& array : *data_block.arraysPtr) {
447  byte_count += array.length;
448  }
449  } else {
450  UNREACHABLE();
451  }
452  return byte_count;
453 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:221
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:222
bool is_varlen() const
Definition: sqltypes.h:513
#define UNREACHABLE()
Definition: Logger.h:241
#define CHECK(condition)
Definition: Logger.h:197
bool is_geometry() const
Definition: sqltypes.h:500
bool is_string() const
Definition: sqltypes.h:488
bool is_array() const
Definition: sqltypes.h:496

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_metadata_placeholder ( const ChunkMetadata metadata)
inline

Check if ChunkMetadata corresponds to a chunk for which metadata must be populated.

Definition at line 27 of file MetadataPlaceholder.h.

References ChunkMetadata::chunkStats, Datum::intval, SQLTypeInfo::is_dict_encoded_type(), ChunkStats::max, ChunkStats::min, and ChunkMetadata::sqlType.

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::prepare_string_dictionaries().

27  {
28  return metadata.chunkStats.min.intval > metadata.chunkStats.max.intval &&
29  metadata.sqlType.is_dict_encoded_type(); // Only supported type for now
30 }
int32_t intval
Definition: sqltypes.h:208
ChunkStats chunkStats
Definition: ChunkMetadata.h:35
bool is_dict_encoded_type() const
Definition: sqltypes.h:529
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

UniqueReaderPtr foreign_storage::open_parquet_table ( const std::string &  file_path,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 25 of file ParquetShared.cpp.

Referenced by foreign_storage::FileReaderMap::getOrInsert(), and foreign_storage::FileReaderMap::insert().

26  {
27  UniqueReaderPtr reader;
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35  return reader;
36 }
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32

+ Here is the caller graph for this function:

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions &  file_regions,
const size_t  start_index,
const size_t  end_index,
CsvReader &  csv_reader,
csv_file_buffer_parser::ParseBufferRequest &  parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map 
)

Parses a set of file regions given a handle to the file and range of indexes for the file regions to be parsed.

Definition at line 175 of file CsvDataWrapper.cpp.

References foreign_storage::csv_file_buffer_parser::ParseBufferRequest::begin_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::end_pos, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::file_offset, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, i, foreign_storage::csv_file_buffer_parser::parse_buffer(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::process_row_count, foreign_storage::CsvReader::readRegion(), run_benchmark_import::result, and foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count.

Referenced by foreign_storage::CsvDataWrapper::populateChunks().

181  {
182  ParseFileRegionResult load_file_region_result{};
183  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
184  load_file_region_result.row_count = 0;
185 
186  csv_file_buffer_parser::ParseBufferResult result;
187  for (size_t i = start_index; i <= end_index; i++) {
188  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
189  size_t read_size;
190  {
191  read_size = csv_reader.readRegion(parse_file_request.buffer.get(),
192  file_regions[i].first_row_file_offset,
193  file_regions[i].region_size);
194  }
195 
196  CHECK_EQ(file_regions[i].region_size, read_size);
197  parse_file_request.begin_pos = 0;
198  parse_file_request.end_pos = file_regions[i].region_size;
199  parse_file_request.first_row_index = file_regions[i].first_row_index;
200  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
201  parse_file_request.process_row_count = file_regions[i].row_count;
202 
203  result = parse_buffer(parse_file_request, i == end_index);
204  CHECK_EQ(file_regions[i].row_count, result.row_count);
205  load_file_region_result.row_count += result.row_count;
206  }
207  load_file_region_result.column_id_to_data_blocks_map =
208  result.column_id_to_data_blocks_map;
209  return load_file_region_result;
210 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered)
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 344 of file CsvDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

346  {
347  CHECK(buffer_row_count > 0);
348  std::vector<size_t> partitions{};
349  size_t remaining_rows_in_last_fragment;
350  if (start_row_index % max_fragment_size == 0) {
351  remaining_rows_in_last_fragment = 0;
352  } else {
353  remaining_rows_in_last_fragment =
354  max_fragment_size - (start_row_index % max_fragment_size);
355  }
356  if (buffer_row_count <= remaining_rows_in_last_fragment) {
357  partitions.emplace_back(buffer_row_count);
358  } else {
359  if (remaining_rows_in_last_fragment > 0) {
360  partitions.emplace_back(remaining_rows_in_last_fragment);
361  }
362  size_t remaining_buffer_row_count =
363  buffer_row_count - remaining_rows_in_last_fragment;
364  while (remaining_buffer_row_count > 0) {
365  partitions.emplace_back(
366  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
367  remaining_buffer_row_count -= partitions.back();
368  }
369  }
370  return partitions;
371 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

template<typename T >
auto foreign_storage::partition_for_threads ( const std::set< T > &  items,
size_t  max_threads 
)

Definition at line 41 of file ParquetShared.h.

References i.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::ParquetDataWrapper::populateChunkBuffers().

41  {
42  const size_t items_per_thread = (items.size() + (max_threads - 1)) / max_threads;
43  std::list<std::set<T>> items_by_thread;
44  auto i = 0U;
45  for (auto item : items) {
46  if (i++ % items_per_thread == 0) {
47  items_by_thread.emplace_back(std::set<T>{});
48  }
49  items_by_thread.back().emplace(item);
50  }
51  return items_by_thread;
52 }

+ Here is the caller graph for this function:

void foreign_storage::process_data_blocks ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const csv_file_buffer_parser::ParseBufferRequest &  request,
csv_file_buffer_parser::ParseBufferResult &  result,
std::map< int, const ColumnDescriptor * > &  column_by_id,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Updates the metadata encapsulated in encoders for all table columns, given new data blocks obtained from parsing a new set of rows in a CSV file buffer. If a cache is available, also appends the data blocks to chunks in the cache.

Definition at line 533 of file CsvDataWrapper.cpp.

References add_file_region(), cache_blocks(), foreign_storage::MetadataScanMultiThreadingParams::cached_chunks, foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count, foreign_storage::MetadataScanMultiThreadingParams::chunk_byte_count_mutex, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::csv_file_buffer_parser::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::db_id, foreign_storage::csv_file_buffer_parser::ParseBufferRequest::first_row_index, get_var_length_data_block_size(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getFilePath(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getMaxFragRows(), foreign_storage::csv_file_buffer_parser::ParseBufferRequest::getTableId(), foreign_storage::csv_file_buffer_parser::ParseBufferResult::row_count, and update_stats().

Referenced by scan_metadata().

538  {
539  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_encoder_buffers_mutex);
540  // File regions should be added in same order as appendData
541  add_file_region(fragment_id_to_file_regions_map,
542  fragment_id,
543  request.first_row_index,
544  result,
545  request.getFilePath());
546 
547  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
548  ChunkKey chunk_key{request.db_id, request.getTableId(), column_id, fragment_id};
549  const auto column = column_by_id[column_id];
550  size_t byte_count;
551  if (column->columnType.is_varlen_indeed()) {
552  chunk_key.emplace_back(1);
553  byte_count = get_var_length_data_block_size(data_block, column->columnType);
554  } else {
555  byte_count = column->columnType.get_size() * result.row_count;
556  }
557 
558  {
559  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_byte_count_mutex);
560  multi_threading_params.chunk_byte_count[chunk_key] += byte_count;
561  }
562 
563  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
564  multi_threading_params.chunk_encoder_buffers.end()) {
565  multi_threading_params.chunk_encoder_buffers[chunk_key] =
566  std::make_unique<ForeignStorageBuffer>();
567  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
568  column->columnType);
569  }
570  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
571  column->columnType,
572  data_block,
573  result.row_count);
574  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
575  ->getEncoder()
576  ->getNumElems() +
577  result.row_count;
578  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
579  num_elements);
580  cache_blocks(
581  multi_threading_params.cached_chunks,
582  data_block,
583  result.row_count,
584  chunk_key,
585  column,
586  (num_elements - result.row_count) == 0, // Is the first block added to this chunk
587  num_elements == request.getMaxFragRows() // Is the last block for this chunk
588  );
589  }
590 }
std::vector< int > ChunkKey
Definition: types.h:37
size_t get_var_length_data_block_size(DataBlockPtr data_block, SQLTypeInfo sql_type_info)
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
void cache_blocks(std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const csv_file_buffer_parser::ParseBufferResult &result, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 22 of file ForeignTableRefresh.cpp.

References CHUNK_KEY_FRAGMENT_IDX, Data_Namespace::CPU_LEVEL, Catalog_Namespace::DBMetadata::dbId, StorageType::FOREIGN_TABLE, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Catalog_Namespace::Catalog::getForeignTableUnlocked(), PostEvictionRefreshException::getOriginalException(), Data_Namespace::GPU_LEVEL, foreign_storage::ForeignTable::isAppendMode(), Catalog_Namespace::Catalog::removeFragmenterForTable(), and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

24  {
25  auto& data_mgr = catalog.getDataMgr();
26  auto table_lock =
27  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
29  catalog, table_name, false));
30 
31  const TableDescriptor* td = (*table_lock)();
32  if (td->storageType != StorageType::FOREIGN_TABLE) {
33  throw std::runtime_error{
34  table_name +
35  " is not a foreign table. Refreshes are applicable to only foreign tables."};
36  }
37 
38  catalog.removeFragmenterForTable(td->tableId);
39  ChunkKey table_key{catalog.getCurrentDB().dbId, td->tableId};
40 
41  if (catalog.getForeignTableUnlocked(td->tableId)->isAppendMode() &&
42  !evict_cached_entries) {
43  ChunkMetadataVector metadata_vec;
44  data_mgr.getChunkMetadataVecForKeyPrefix(metadata_vec, table_key);
45  int last_fragment_id = 0;
46  for (const auto& [key, metadata] : metadata_vec) {
47  if (key[CHUNK_KEY_FRAGMENT_IDX] > last_fragment_id) {
48  last_fragment_id = key[CHUNK_KEY_FRAGMENT_IDX];
49  }
50  }
51  for (const auto& [key, metadata] : metadata_vec) {
52  if (key[CHUNK_KEY_FRAGMENT_IDX] == last_fragment_id) {
53  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::CPU_LEVEL);
54  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::GPU_LEVEL);
55  }
56  }
57  } else {
58  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::CPU_LEVEL);
59  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::GPU_LEVEL);
60  }
61 
62  try {
63  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
64  table_key, evict_cached_entries);
65  catalog.updateForeignTableRefreshTimes(td->tableId);
66  } catch (PostEvictionRefreshException& e) {
67  catalog.updateForeignTableRefreshTimes(td->tableId);
68  throw e.getOriginalException();
69  }
70 }
std::vector< int > ChunkKey
Definition: types.h:37
const foreign_storage::ForeignTable * getForeignTableUnlocked(int tableId) const
Definition: Catalog.cpp:1677
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:222
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:221
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
bool isAppendMode() const
Checks if the table is in append mode.
void removeFragmenterForTable(const int table_id) const
Definition: Catalog.cpp:3394
static constexpr char const * FOREIGN_TABLE
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:4677

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

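Resizes the given buffer if the buffer size is less than the current buffer allocation size (alloc_size). When a resize occurs, a new buffer of alloc_size bytes is allocated in place of the old one (existing contents are not carried over) and buffer_size is updated to alloc_size.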

Definition at line 696 of file CsvDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

698  {
699  CHECK_LE(buffer_size, alloc_size);
700  if (buffer_size < alloc_size) {
701  buffer = std::make_unique<char[]>(alloc_size);
702  buffer_size = alloc_size;
703  }
704 }
#define CHECK_LE(x, y)
Definition: Logger.h:208

+ Here is the caller graph for this function:

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 609 of file CsvDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, get_next_metadata_scan_request(), foreign_storage::csv_file_buffer_parser::parse_buffer(), partition_by_fragment(), foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, process_data_blocks(), and run_benchmark_import::result.

Referenced by foreign_storage::CsvDataWrapper::populateChunkMetadata().

610  {
611  std::map<int, const ColumnDescriptor*> column_by_id{};
612  while (true) {
613  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
614  if (!request_opt.has_value()) {
615  break;
616  }
617  auto& request = request_opt.value();
618  try {
619  if (column_by_id.empty()) {
620  for (const auto column : request.getColumns()) {
621  column_by_id[column->columnId] = column;
622  }
623  }
624  auto partitions = partition_by_fragment(
625  request.first_row_index, request.getMaxFragRows(), request.buffer_row_count);
626  request.begin_pos = 0;
627  size_t row_index = request.first_row_index;
628  for (const auto partition : partitions) {
629  request.process_row_count = partition;
630  for (const auto& import_buffer : request.import_buffers) {
631  if (import_buffer != nullptr) {
632  import_buffer->clear();
633  }
634  }
635  auto result = parse_buffer(request, true);
636  int fragment_id = row_index / request.getMaxFragRows();
637  process_data_blocks(multi_threading_params,
638  fragment_id,
639  request,
640  result,
641  column_by_id,
642  fragment_id_to_file_regions_map);
643  row_index += result.row_count;
644  request.begin_pos = result.row_offsets.back() - request.file_offset;
645  }
646  } catch (...) {
647  // Re-add request to pool so we dont block any other threads
648  {
649  std::lock_guard<std::mutex> pending_requests_lock(
650  multi_threading_params.pending_requests_mutex);
651  multi_threading_params.continue_processing = false;
652  }
653  add_request_to_pool(multi_threading_params, request);
654  throw;
655  }
656  add_request_to_pool(multi_threading_params, request);
657  }
658 }
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, csv_file_buffer_parser::ParseBufferRequest &request)
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
ParseBufferResult parse_buffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered)
void process_data_blocks(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const csv_file_buffer_parser::ParseBufferRequest &request, csv_file_buffer_parser::ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
std::optional< csv_file_buffer_parser::ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion &  file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 26 of file CsvShared.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

28  {
29  json_val.SetObject();
30  json_utils::add_value_to_object(
31  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
32  json_utils::add_value_to_object(
33  json_val, file_region.first_row_index, "first_row_index", allocator);
34  json_utils::add_value_to_object(
35  json_val, file_region.region_size, "region_size", allocator);
36  json_utils::add_value_to_object(
37  json_val, file_region.row_count, "row_count", allocator);
38  if (file_region.filename.size()) {
39  json_utils::add_value_to_object(
40  json_val, file_region.filename, "filename", allocator);
41  }
42 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval &  value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 496 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

498  {
499  json_val.SetObject();
500  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
501  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
502  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
503 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:111

+ Here is the call graph for this function:

void foreign_storage::throw_number_of_columns_mismatch_error ( size_t  num_table_cols,
size_t  num_file_cols,
const std::string &  file_path 
)
inline

Definition at line 35 of file ForeignDataWrapperShared.h.

References to_string().

Referenced by foreign_storage::csv_file_buffer_parser::validate_expected_column_count(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

37  {
38  throw std::runtime_error{"Mismatched number of logical columns: (expected " +
39  std::to_string(num_table_cols) + " columns, has " +
40  std::to_string(num_file_cols) + "): in file '" + file_path +
41  "'"};
42 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::throw_parquet_metadata_out_of_bounds_error ( const std::string &  min_value,
const std::string &  max_value,
const std::string &  encountered_value 
)
inline

Definition at line 67 of file ParquetMetadataValidator.h.

Referenced by foreign_storage::TimestampBoundsValidator< T >::validateValue(), foreign_storage::IntegralFixedLengthBoundsValidator< T >::validateValue(), foreign_storage::DateInSecondsBoundsValidator< T >::validateValue(), and foreign_storage::FloatPointValidator< T >::validateValue().

70  {
71  std::stringstream error_message;
72  error_message << "Parquet column contains values that are outside the range of the "
73  "OmniSci column "
74  "type. Consider using a wider column type. Min allowed value: "
75  << min_value << ". Max allowed value: " << max_value
76  << ". Encountered value: " << encountered_value << ".";
77  throw std::runtime_error(error_message.str());
78 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 29 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

29  {
30  throw std::runtime_error{
31  "Refresh of foreign table created with \"APPEND\" update type failed as "
32  "file \"" +
33  file_path + "\" was removed."};
34 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_error ( const std::string &  file_path)
inline

Definition at line 22 of file ForeignDataWrapperShared.h.

Referenced by foreign_storage::SingleFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

22  {
23  throw std::runtime_error{
24  "Refresh of foreign table created with \"APPEND\" update type failed as file "
25  "reduced in size: " +
26  file_path};
27 }

+ Here is the caller graph for this function:

void foreign_storage::update_stats ( Encoder encoder,
const SQLTypeInfo column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 459 of file CsvDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by process_data_blocks(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumn(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata(), and StorageIOFacility::yieldUpdateCallback().

462  {
463  if (column_type.is_array()) {
464  encoder->updateStats(data_block.arraysPtr, 0, row_count);
465  } else if (!column_type.is_varlen()) {
466  encoder->updateStats(data_block.numbersPtr, row_count);
467  } else {
468  encoder->updateStats(data_block.stringsPtr, 0, row_count);
469  }
470 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:221
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:222
bool is_varlen() const
Definition: sqltypes.h:513
int8_t * numbersPtr
Definition: sqltypes.h:220
virtual void updateStats(const int64_t val, const bool is_null)=0
bool is_array() const
Definition: sqltypes.h:496

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< parquet::Statistics > foreign_storage::validate_and_get_column_metadata_statistics ( const parquet::ColumnChunkMetaData *  column_metadata)

Definition at line 85 of file ParquetShared.cpp.

References CHECK.

Referenced by foreign_storage::ParquetEncoder::getRowGroupMetadata(), and foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata().

86  {
87  CHECK(column_metadata->is_stats_set());
88  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
89  bool is_all_nulls = stats->null_count() == column_metadata->num_values();
90  CHECK(is_all_nulls || stats->HasMinMax());
91  return stats;
92 }
#define CHECK(condition)
Definition: Logger.h:197

+ Here is the caller graph for this function:

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 59 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

63  {
64  if (!reference_descriptor->Equals(*new_descriptor)) {
65  throw std::runtime_error{"Parquet file \"" + new_file_path +
66  "\" has a different schema. Please ensure that all Parquet "
67  "files use the same schema. Reference Parquet file: " +
68  reference_file_path +
69  ", column name: " + reference_descriptor->name() +
70  ". New Parquet file: " + new_file_path +
71  ", column name: " + new_descriptor->name() + "."};
72  }
73 }

+ Here is the caller graph for this function:

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::InsertValuesStmt::determineLeafIndex(), Parser::TruncateTableStmt::execute(), Parser::InsertValuesStmt::execute(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, TRUNCATE, OR UPDATE commands are not supported for foreign "
26  "tables."};
27  }
28 }
std::string storageType
static constexpr char const * FOREIGN_TABLE

+ Here is the caller graph for this function:

Variable Documentation

const std::string foreign_storage::wrapper_file_name = "wrapper_metadata.json"