OmniSciDB  eb3a3d0a03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{AbstractFileStorageDataWrapper.cpp}
 
 anonymous_namespace{AbstractTextFileDataWrapper.cpp}
 
 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvFileBufferParser.cpp}
 
 anonymous_namespace{FileReader.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 Csv
 
 json_utils
 

Classes

struct  ForeignServer
 
struct  ForeignTable
 
struct  OptionsContainer
 
class  AbstractFileStorageDataWrapper
 
struct  ParseFileRegionResult
 
struct  MetadataScanMultiThreadingParams
 
class  AbstractTextFileDataWrapper
 
class  CachingForeignStorageMgr
 
class  CsvDataWrapper
 
class  CsvFileBufferParser
 
class  FileReader
 
class  SingleFileReader
 
class  ArchiveWrapper
 
class  CompressedFileReader
 
class  MultiFileReader
 
class  LocalMultiFileReader
 
struct  FileRegion
 
class  ForeignDataWrapper
 
struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
class  ForeignDataWrapperFactory
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageException
 
class  MockForeignDataWrapper
 
class  ForeignStorageMgr
 
class  ForeignTableSchema
 
class  GeospatialEncoder
 
struct  ColumnType
 
struct  FragmentType
 
struct  Interval
 
class  LazyParquetChunkLoader
 
class  OdbcGeospatialEncoder
 
class  ParquetArrayEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateFromTimestampEncoder
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetEncoder
 
class  ParquetScalarEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetInPlaceEncoder
 
class  TypedParquetInPlaceEncoder
 
class  ParquetMetadataValidator
 
class  TimestampBoundsValidator
 
class  IntegralFixedLengthBoundsValidator
 
class  DateInSecondsBoundsValidator
 
class  FloatPointValidator
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  FileReaderMap
 
class  ParquetStringEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetVariableLengthArrayEncoder
 
struct  ParseBufferRequest
 
struct  ParseBufferResult
 
class  TextFileBufferParser
 
class  ForeignTableRefreshScheduler
 

Typedefs

using OptionsMap = std::map< std::string, std::string, std::less<>>
 
using FileRegions = std::vector< FileRegion >
 
using ChunkToBufferMap = std::map< ChunkKey, AbstractBuffer * >
 
using read_lock = mapd_shared_lock< mapd_shared_mutex >
 
using write_lock = mapd_unique_lock< mapd_shared_mutex >
 
using UniqueReaderPtr = std::unique_ptr< parquet::arrow::FileReader >
 
using ReaderPtr = parquet::arrow::FileReader *
 

Functions

int64_t get_interval_duration (const std::string &interval)
 
int64_t get_next_refresh_time (const std::map< std::string, std::string, std::less<>> &foreign_table_options)
 
ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, FileReader &file_reader, ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, const TextFileBufferParser &parser)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional< ParseBufferRequest > get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const ParseBufferResult &result, const std::string &file_path)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void cache_blocks (std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
 
void process_data_blocks (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const ParseBufferRequest &request, ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map, const TextFileBufferParser &parser)
 
ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, FileReader &file_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset, const TextFileBufferParser &parser)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
void throw_removed_row_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
void throw_number_of_columns_mismatch_error (size_t num_table_cols, size_t num_file_cols, const std::string &file_path)
 
void throw_file_access_error (const std::string &file_path, const std::string &message)
 
void throw_file_not_found_error (const std::string &file_path)
 
std::vector< ChunkKey > get_keys_vec_from_table (const ChunkKey &destination_chunk_key)
 
std::set< ChunkKey > get_keys_set_from_table (const ChunkKey &destination_chunk_key)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
void init_chunk_for_column (const ChunkKey &chunk_key, const std::map< ChunkKey, std::shared_ptr< ChunkMetadata >> &chunk_metadata_map, const std::map< ChunkKey, AbstractBuffer * > &buffers, Chunk_NS::Chunk &chunk)
 
std::shared_ptr< ChunkMetadata > get_placeholder_metadata (const ColumnDescriptor *column, size_t num_elements)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
bool is_metadata_placeholder (const ChunkMetadata &metadata)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename D , typename T >
bool check_bounds (const T &value)
 
template<typename D >
std::string datetime_to_string (const D &timestamp, const SQLTypeInfo &column_type)
 
void throw_parquet_metadata_out_of_bounds_error (const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
 
UniqueReaderPtr open_parquet_table (const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const ReaderPtr &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (ReaderPtr &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics (const parquet::ColumnChunkMetaData *column_metadata)
 
template<typename T >
auto partition_for_threads (const std::set< T > &items, size_t max_threads)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > get_min_max_bounds ()
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Typedef Documentation

using foreign_storage::ChunkToBufferMap = typedef std::map<ChunkKey, AbstractBuffer*>

Definition at line 28 of file ForeignDataWrapper.h.

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 67 of file FileRegions.h.

using foreign_storage::OptionsMap = typedef std::map<std::string, std::string, std::less<>>

Definition at line 30 of file OptionsContainer.h.

using foreign_storage::ReaderPtr = typedef parquet::arrow::FileReader*

Definition at line 33 of file ParquetShared.h.

using foreign_storage::UniqueReaderPtr = typedef std::unique_ptr<parquet::arrow::FileReader>

Definition at line 32 of file ParquetShared.h.

Function Documentation

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
int  fragment_id,
size_t  first_row_index,
const ParseBufferResult &  result,
const std::string &  file_path 
)

Creates a new file region based on metadata from parsed file buffers and adds the new region to the fragment id to file regions map.

Definition at line 403 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::ParseBufferResult::row_count, and foreign_storage::ParseBufferResult::row_offsets.

Referenced by process_data_blocks().

407  {
408  fragment_id_to_file_regions_map[fragment_id].emplace_back(
409  // file naming is handled by FileReader
410  FileRegion(result.row_offsets.front(),
411  first_row_index,
412  result.row_count,
413  result.row_offsets.back() - result.row_offsets.front()));
414 }

+ Here is the caller graph for this function:

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 539 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by scan_metadata().

540  {
541  std::unique_lock<std::mutex> completed_requests_queue_lock(
542  multi_threading_params.request_pool_mutex);
543  multi_threading_params.request_pool.emplace(std::move(request));
544  completed_requests_queue_lock.unlock();
545  multi_threading_params.request_pool_condition.notify_all();
546 }

+ Here is the caller graph for this function:

void foreign_storage::cache_blocks ( std::map< ChunkKey, Chunk_NS::Chunk > &  cached_chunks,
DataBlockPtr  data_block,
size_t  row_count,
ChunkKey &  chunk_key,
const ColumnDescriptor *  column,
bool  is_first_block,
bool  is_last_block 
)

Definition at line 447 of file AbstractTextFileDataWrapper.cpp.

References CHECK, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::get_cache_if_enabled(), Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), and SQLTypeInfo::is_varlen_indeed().

Referenced by process_data_blocks().

453  {
454  auto catalog =
455  Catalog_Namespace::SysCatalog::instance().getCatalog(chunk_key[CHUNK_KEY_DB_IDX]);
456  CHECK(catalog);
457  auto cache = get_cache_if_enabled(catalog);
458  if (cache) {
459  ChunkKey index_key = {chunk_key[CHUNK_KEY_DB_IDX],
460  chunk_key[CHUNK_KEY_TABLE_IDX],
461  chunk_key[CHUNK_KEY_COLUMN_IDX],
462  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
463  2};
464  // Create actual data chunks to prepopulate cache
465  if (cached_chunks.find(chunk_key) == cached_chunks.end()) {
466  cached_chunks[chunk_key] = Chunk_NS::Chunk{column};
467  cached_chunks[chunk_key].setBuffer(
468  cache->getChunkBufferForPrecaching(chunk_key, is_first_block));
469  if (column->columnType.is_varlen_indeed()) {
470  cached_chunks[chunk_key].setIndexBuffer(
471  cache->getChunkBufferForPrecaching(index_key, is_first_block));
472  }
473  if (is_first_block) {
474  cached_chunks[chunk_key].initEncoder();
475  }
476  }
477  cached_chunks[chunk_key].appendData(data_block, row_count, 0);
478  }
479 }
std::vector< int > ChunkKey
Definition: types.h:37
foreign_storage::ForeignStorageCache * get_cache_if_enabled(std::shared_ptr< Catalog_Namespace::Catalog > &catalog)
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
static SysCatalog & instance()
Definition: SysCatalog.h:318
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:209
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:535
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename D , typename T >
bool foreign_storage::check_bounds ( const T &  value)
inline

Definition at line 32 of file ParquetMetadataValidator.h.

32  {
33  auto [min_value, max_value] = get_min_max_bounds<D>();
34  return value >= min_value && value <= max_value;
35 }
template<typename D >
std::string foreign_storage::datetime_to_string ( const D &  timestamp,
const SQLTypeInfo &  column_type 
)
inline

Definition at line 38 of file ParquetMetadataValidator.h.

References Datum::bigintval, CHECK, DatumToString(), SQLTypeInfo::is_date(), and SQLTypeInfo::is_timestamp().

Referenced by foreign_storage::TimestampBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::DateInSecondsBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::TimestampBoundsValidator< T >::validateValue(), and foreign_storage::DateInSecondsBoundsValidator< T >::validateValue().

39  {
40  CHECK(column_type.is_timestamp() || column_type.is_date());
41  Datum d;
42  d.bigintval = timestamp;
43  return DatumToString(d, column_type);
44 }
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:388
bool is_timestamp() const
Definition: sqltypes.h:762
int64_t bigintval
Definition: sqltypes.h:215
#define CHECK(condition)
Definition: Logger.h:209
bool is_date() const
Definition: sqltypes.h:750

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 625 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

627  {
628  {
629  std::unique_lock<std::mutex> pending_requests_lock(
630  multi_threading_params.pending_requests_mutex);
631  multi_threading_params.pending_requests.emplace(std::move(request));
632  }
633  multi_threading_params.pending_requests_condition.notify_all();
634 }

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
FileReader &  file_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset,
const TextFileBufferParser &  parser 
)

Reads from a text file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 654 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), foreign_storage::TextFileBufferParser::findRowEndPosition(), get_request_from_pool(), foreign_storage::FileReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::FileReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata().

662  {
663  auto alloc_size = buffer_size;
664  auto residual_buffer = std::make_unique<char[]>(alloc_size);
665  size_t residual_buffer_size = 0;
666  size_t residual_buffer_alloc_size = alloc_size;
667 
668  while (!file_reader.isScanFinished()) {
669  {
670  std::lock_guard<std::mutex> pending_requests_lock(
671  multi_threading_params.pending_requests_mutex);
672  if (!multi_threading_params.continue_processing) {
673  break;
674  }
675  }
676  auto request = get_request_from_pool(multi_threading_params);
677  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
678 
679  if (residual_buffer_size > 0) {
680  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
681  }
682  size_t size = residual_buffer_size;
683  size += file_reader.read(request.buffer.get() + residual_buffer_size,
684  alloc_size - residual_buffer_size);
685 
686  if (size == 0) {
687  // In some cases at the end of a file we will read 0 bytes even when
688  // file_reader.isScanFinished() is false
689  continue;
690  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
691  // In some cases files with newlines at the end will be encoded with a second
692  // newline that can end up being the only thing in the buffer
693  current_file_offset++;
694  continue;
695  }
696  unsigned int num_rows_in_buffer = 0;
697  request.end_pos = parser.findRowEndPosition(alloc_size,
698  request.buffer,
699  size,
700  copy_params,
701  first_row_index_in_buffer,
702  num_rows_in_buffer,
703  &file_reader);
704  request.buffer_size = size;
705  request.buffer_alloc_size = alloc_size;
706  request.first_row_index = first_row_index_in_buffer;
707  request.file_offset = current_file_offset;
708  request.buffer_row_count = num_rows_in_buffer;
709 
710  residual_buffer_size = size - request.end_pos;
711  if (residual_buffer_size > 0) {
712  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
713  memcpy(residual_buffer.get(),
714  request.buffer.get() + request.end_pos,
715  residual_buffer_size);
716  }
717 
718  current_file_offset += request.end_pos;
719  first_row_index_in_buffer += num_rows_in_buffer;
720 
721  dispatch_metadata_scan_request(multi_threading_params, request);
722  }
723 
724  std::unique_lock<std::mutex> pending_requests_queue_lock(
725  multi_threading_params.pending_requests_mutex);
726  multi_threading_params.pending_requests_condition.wait(
727  pending_requests_queue_lock, [&multi_threading_params] {
728  return multi_threading_params.pending_requests.empty() ||
729  (multi_threading_params.continue_processing == false);
730  });
731  multi_threading_params.continue_processing = false;
732  pending_requests_queue_lock.unlock();
733  multi_threading_params.pending_requests_condition.notify_all();
734 }
ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 32 of file GeospatialEncoder.h.

References omnisci.dtypes::T.

Referenced by foreign_storage::GeospatialEncoder::processGeoElement(), and foreign_storage::GeospatialEncoder::processNullGeoElement().

32  {
33  const size_t num_bytes = data.size() * sizeof(T);
34  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
35  memcpy(buffer.get(), data.data(), num_bytes);
36  return ArrayDatum(num_bytes, buffer, false);
37 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:208

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams &  copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing file(s).

Definition at line 199 of file AbstractTextFileDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), and foreign_storage::AbstractTextFileDataWrapper::populateChunks().

201  {
202  size_t buffer_size = copy_params.buffer_size;
203  if (size_known && file_size < buffer_size) {
204  buffer_size = file_size + 1; // +1 for end of line character, if missing
205  }
206  return buffer_size;
207 }
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 209 of file AbstractTextFileDataWrapper.cpp.

References CHECK.

209  {
210  size_t buffer_size = 0;
211  for (const auto& file_region : file_regions) {
212  buffer_size = std::max(buffer_size, file_region.region_size);
213  }
214  CHECK(buffer_size);
215  return buffer_size;
216 }
#define CHECK(condition)
Definition: Logger.h:209
const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 45 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

47  {
48  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
49 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_interval_duration ( const std::string &  interval)
inline

Gets the interval duration in seconds.

Parameters
interval - interval string with format of {interval_count}{interval_type} (e.g. 5H for "every 5 hours")
Returns
interval duration in seconds

Definition at line 16 of file RefreshTimeCalculator.h.

References UNREACHABLE.

Referenced by get_next_refresh_time().

16  {
17  int interval_count = std::stoi(interval.substr(0, interval.length() - 1));
18  auto interval_type = std::tolower(interval[interval.length() - 1]);
19  int64_t duration{0};
20  if (interval_type == 's') {
21  duration = interval_count;
22  } else if (interval_type == 'h') {
23  duration = interval_count * 60 * 60;
24  } else if (interval_type == 'd') {
25  duration = interval_count * 60 * 60 * 24;
26  } else {
27  UNREACHABLE();
28  }
29  return duration;
30 }
#define UNREACHABLE()
Definition: Logger.h:253

+ Here is the caller graph for this function:

std::set<ChunkKey> foreign_storage::get_keys_set_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

std::vector<ChunkKey> foreign_storage::get_keys_vec_from_table ( const ChunkKey &  destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer().

+ Here is the caller graph for this function:

template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > foreign_storage::get_min_max_bounds ( )
inline

Definition at line 31 of file SharedMetadataValidator.h.

31  {
32  static_assert(std::is_signed<D>::value,
33  "'get_min_max_bounds' is only valid for signed types");
34  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
35 }
std::optional<ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. A null optional is returned if there are no further requests to be processed.

Definition at line 380 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

381  {
382  std::unique_lock<std::mutex> pending_requests_lock(
383  multi_threading_params.pending_requests_mutex);
384  multi_threading_params.pending_requests_condition.wait(
385  pending_requests_lock, [&multi_threading_params] {
386  return !multi_threading_params.pending_requests.empty() ||
387  !multi_threading_params.continue_processing;
388  });
389  if (multi_threading_params.pending_requests.empty()) {
390  return {};
391  }
392  auto request = std::move(multi_threading_params.pending_requests.front());
393  multi_threading_params.pending_requests.pop();
394  pending_requests_lock.unlock();
395  multi_threading_params.pending_requests_condition.notify_all();
396  return std::move(request);
397 }

+ Here is the caller graph for this function:

int64_t foreign_storage::get_next_refresh_time ( const std::map< std::string, std::string, std::less<>> &  foreign_table_options)
inline

Definition at line 33 of file RefreshTimeCalculator.h.

References CHECK, count, get_interval_duration(), foreign_storage::ForeignTable::NULL_REFRESH_TIME, foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY, and foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY.

Referenced by Catalog_Namespace::anonymous_namespace{Catalog.cpp}::get_next_refresh_time().

34  {
35  int64_t current_time = std::chrono::duration_cast<std::chrono::seconds>(
36  std::chrono::system_clock::now().time_since_epoch())
37  .count();
38  auto start_date_entry = foreign_table_options.find(
39  foreign_storage::ForeignTable::REFRESH_START_DATE_TIME_KEY);
40  CHECK(start_date_entry != foreign_table_options.end());
41  auto start_date_time = dateTimeParse<kTIMESTAMP>(start_date_entry->second, 0);
42 
43  // If start date time is current or in the future, then that is the next refresh time
44  if (start_date_time >= current_time) {
45  return start_date_time;
46  }
47  auto interval_entry =
48  foreign_table_options.find(foreign_storage::ForeignTable::REFRESH_INTERVAL_KEY);
49  if (interval_entry != foreign_table_options.end()) {
50  auto interval_duration = get_interval_duration(interval_entry->second);
51  auto num_intervals =
52  (current_time - start_date_time + interval_duration - 1) / interval_duration;
53  return start_date_time + (num_intervals * interval_duration);
54  } else {
55  // If this was a one time refresh, then there is no next refresh time
56  return foreign_storage::ForeignTable::NULL_REFRESH_TIME;
57  }
58 }
int64_t get_interval_duration(const std::string &interval)
static constexpr const char * REFRESH_START_DATE_TIME_KEY
Definition: ForeignTable.h:43
int count
static constexpr const char * REFRESH_INTERVAL_KEY
Definition: ForeignTable.h:44
#define CHECK(condition)
Definition: Logger.h:209
static constexpr int NULL_REFRESH_TIME
Definition: ForeignTable.h:52

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 21 of file SharedMetadataValidator.h.

21  {
22  return inline_int_null_value<V>();
23 }
std::pair< int, int > foreign_storage::get_parquet_table_size ( const ReaderPtr &  reader)

Definition at line 38 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

38  {
39  auto file_metadata = reader->parquet_reader()->metadata();
40  const auto num_row_groups = file_metadata->num_row_groups();
41  const auto num_columns = file_metadata->num_columns();
42  return std::make_pair(num_row_groups, num_columns);
43 }

+ Here is the caller graph for this function:

parquet::Type::type foreign_storage::get_physical_type ( ReaderPtr &  reader,
const int  logical_column_index 
)

Definition at line 51 of file ParquetShared.cpp.

Referenced by anonymous_namespace{ArrowResultSetConverter.cpp}::get_arrow_type(), and ArrowResultSetConverter::initializeColumnBuilder().

51  {
52  return reader->parquet_reader()
53  ->metadata()
54  ->schema()
55  ->Column(logical_column_index)
56  ->physical_type();
57 }

+ Here is the caller graph for this function:

std::shared_ptr< ChunkMetadata > foreign_storage::get_placeholder_metadata ( const ColumnDescriptor *  column,
size_t  num_elements 
)

Definition at line 75 of file FsiChunkUtils.cpp.

References ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::get_size(), Data_Namespace::AbstractBuffer::getEncoder(), Encoder::getMetadata(), Data_Namespace::AbstractBuffer::initEncoder(), SQLTypeInfo::is_array(), and SQLTypeInfo::is_varlen_indeed().

Referenced by foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::add_placeholder_metadata().

76  {
77  ForeignStorageBuffer empty_buffer;
78  // Use default encoder metadata as in parquet wrapper
79  empty_buffer.initEncoder(column->columnType);
80  auto chunk_metadata = empty_buffer.getEncoder()->getMetadata(column->columnType);
81  chunk_metadata->numElements = num_elements;
82 
83  if (!column->columnType.is_varlen_indeed()) {
84  chunk_metadata->numBytes = column->columnType.get_size() * num_elements;
85  }
86  // min/max not set by default for arrays, so get from elem type encoder
87  if (column->columnType.is_array()) {
88  ForeignStorageBuffer scalar_buffer;
89  scalar_buffer.initEncoder(column->columnType.get_elem_type());
90  auto scalar_metadata =
91  scalar_buffer.getEncoder()->getMetadata(column->columnType.get_elem_type());
92  chunk_metadata->chunkStats.min = scalar_metadata->chunkStats.min;
93  chunk_metadata->chunkStats.max = scalar_metadata->chunkStats.max;
94  }
95  chunk_metadata->chunkStats.has_nulls = true;
96  return chunk_metadata;
97 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
void initEncoder(const SQLTypeInfo &tmp_sql_type)
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:535
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:732
bool is_array() const
Definition: sqltypes.h:512

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool, blocking until the pool is non-empty.

Definition at line 607 of file AbstractTextFileDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

608  {
609  std::unique_lock<std::mutex> request_pool_lock(
610  multi_threading_params.request_pool_mutex);
611  multi_threading_params.request_pool_condition.wait(
612  request_pool_lock,
613  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
614  auto request = std::move(multi_threading_params.request_pool.front());
615  multi_threading_params.request_pool.pop();
616  request_pool_lock.unlock();
617  CHECK(request.buffer);
618  return request;
619 }
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the caller graph for this function:

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor column)

Definition at line 75 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

76  {
77  auto column_type = column->columnType.get_elem_type();
78  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
79  column_type.set_size(4); // override default size of -1
80  }
81  return std::make_unique<ColumnDescriptor>(
82  column->tableId, column->columnId, column->columnName, column_type);
83 }
void set_size(int s)
Definition: sqltypes.h:427
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:732
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 222 of file AbstractTextFileDataWrapper.cpp.

References CHECK_GT, and import_export::CopyParams::threads.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), and foreign_storage::AbstractTextFileDataWrapper::populateChunks().

225  {
226  size_t thread_count = copy_params.threads;
227  if (thread_count == 0) {
228  thread_count = std::thread::hardware_concurrency();
229  }
230  if (size_known) {
231  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
232  if (num_buffers_in_file < thread_count) {
233  thread_count = num_buffers_in_file;
234  }
235  }
236  CHECK_GT(thread_count, static_cast<size_t>(0));
237  return thread_count;
238 }
#define CHECK_GT(x, y)
Definition: Logger.h:221
size_t file_size(const int fd)
Definition: omnisci_fs.cpp:31

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions &  file_regions 
)

Definition at line 240 of file AbstractTextFileDataWrapper.cpp.

References CHECK_GT, and import_export::CopyParams::threads.

241  {
242  size_t thread_count = copy_params.threads;
243  if (thread_count == 0) {
244  thread_count =
245  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
246  }
247  CHECK_GT(thread_count, static_cast<size_t>(0));
248  return thread_count;
249 }
#define CHECK_GT(x, y)
Definition: Logger.h:221
void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 44 of file CsvShared.cpp.

References CHECK, foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

44  {
45  CHECK(json_val.IsObject());
46  json_utils::get_value_from_object(
47  json_val, file_region.first_row_file_offset, "first_row_file_offset");
48  json_utils::get_value_from_object(
49  json_val, file_region.first_row_index, "first_row_index");
50  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
51  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
52  if (json_val.HasMember("filename")) {
53  json_utils::get_value_from_object(json_val, file_region.filename, "filename");
54  }
55 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:134
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 498 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

498  {
499  CHECK(json_val.IsObject());
500  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
501  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
502  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
503 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:134
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

void foreign_storage::init_chunk_for_column ( const ChunkKey chunk_key,
const std::map< ChunkKey, std::shared_ptr< ChunkMetadata >> &  chunk_metadata_map,
const std::map< ChunkKey, AbstractBuffer * > &  buffers,
Chunk_NS::Chunk chunk 
)

Definition at line 22 of file FsiChunkUtils.cpp.

References CHECK, CHECK_EQ, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_TABLE_IDX, Catalog_Namespace::SysCatalog::getCatalog(), Chunk_NS::Chunk::initEncoder(), Catalog_Namespace::SysCatalog::instance(), Data_Namespace::AbstractBuffer::reserve(), Chunk_NS::Chunk::setBuffer(), Chunk_NS::Chunk::setColumnDesc(), Chunk_NS::Chunk::setIndexBuffer(), Data_Namespace::AbstractBuffer::size(), and UNREACHABLE.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMapForColumns().

26  {
27  auto catalog =
29  CHECK(catalog);
30 
31  ChunkKey data_chunk_key = chunk_key;
32  AbstractBuffer* data_buffer = nullptr;
33  AbstractBuffer* index_buffer = nullptr;
34  const auto column = catalog->getMetadataForColumnUnlocked(
35  chunk_key[CHUNK_KEY_TABLE_IDX], chunk_key[CHUNK_KEY_COLUMN_IDX]);
36 
37  if (column->columnType.is_varlen_indeed()) {
38  data_chunk_key.push_back(1);
39  ChunkKey index_chunk_key = chunk_key;
40  index_chunk_key.push_back(2);
41 
42  CHECK(buffers.find(data_chunk_key) != buffers.end());
43  CHECK(buffers.find(index_chunk_key) != buffers.end());
44 
45  data_buffer = buffers.find(data_chunk_key)->second;
46  index_buffer = buffers.find(index_chunk_key)->second;
47  CHECK_EQ(data_buffer->size(), static_cast<size_t>(0));
48  CHECK_EQ(index_buffer->size(), static_cast<size_t>(0));
49 
50  size_t index_offset_size{0};
51  if (column->columnType.is_string() || column->columnType.is_geometry()) {
52  index_offset_size = sizeof(StringOffsetT);
53  } else if (column->columnType.is_array()) {
54  index_offset_size = sizeof(ArrayOffsetT);
55  } else {
56  UNREACHABLE();
57  }
58  CHECK(chunk_metadata_map.find(data_chunk_key) != chunk_metadata_map.end());
59  index_buffer->reserve(index_offset_size *
60  (chunk_metadata_map.at(data_chunk_key)->numElements + 1));
61  } else {
62  data_chunk_key = chunk_key;
63  CHECK(buffers.find(data_chunk_key) != buffers.end());
64  data_buffer = buffers.find(data_chunk_key)->second;
65  }
66  CHECK(chunk_metadata_map.find(data_chunk_key) != chunk_metadata_map.end());
67  data_buffer->reserve(chunk_metadata_map.at(data_chunk_key)->numBytes);
68 
69  chunk.setColumnDesc(column);
70  chunk.setBuffer(data_buffer);
71  chunk.setIndexBuffer(index_buffer);
72  chunk.initEncoder();
73 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
std::vector< int > ChunkKey
Definition: types.h:37
void setIndexBuffer(AbstractBuffer *ib)
Definition: Chunk.h:113
#define CHUNK_KEY_DB_IDX
Definition: types.h:39
#define UNREACHABLE()
Definition: Logger.h:253
void setBuffer(AbstractBuffer *b)
Definition: Chunk.h:111
int32_t StringOffsetT
Definition: sqltypes.h:957
static SysCatalog & instance()
Definition: SysCatalog.h:318
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:40
An AbstractBuffer is a unit of data management for a data manager.
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
int32_t ArrayOffsetT
Definition: sqltypes.h:958
void initEncoder()
Definition: Chunk.cpp:225
#define CHECK(condition)
Definition: Logger.h:209
void setColumnDesc(const ColumnDescriptor *cd)
Definition: Chunk.h:56
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:41
virtual void reserve(size_t num_bytes)=0

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_metadata_placeholder ( const ChunkMetadata metadata)
inline

Check if ChunkMetadata corresponds to a chunk for which metadata must be populated.

Definition at line 27 of file MetadataPlaceholder.h.

References ChunkMetadata::chunkStats, Datum::intval, SQLTypeInfo::is_dict_encoded_type(), ChunkStats::max, ChunkStats::min, and ChunkMetadata::sqlType.

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::prepare_string_dictionaries().

27  {
28  return metadata.chunkStats.min.intval > metadata.chunkStats.max.intval &&
29  metadata.sqlType.is_dict_encoded_type(); // Only supported type for now
30 }
int32_t intval
Definition: sqltypes.h:214
ChunkStats chunkStats
Definition: ChunkMetadata.h:35
bool is_dict_encoded_type() const
Definition: sqltypes.h:549
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:32

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

UniqueReaderPtr foreign_storage::open_parquet_table ( const std::string &  file_path,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 25 of file ParquetShared.cpp.

Referenced by foreign_storage::FileReaderMap::getOrInsert(), and foreign_storage::FileReaderMap::insert().

26  {
27  UniqueReaderPtr reader;
28  auto file_result = file_system->OpenInputFile(file_path);
29  if (!file_result.ok()) {
30  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
31  file_path + ". " + file_result.status().message()};
32  }
33  auto infile = file_result.ValueOrDie();
34  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
35  return reader;
36 }
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32

+ Here is the caller graph for this function:

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions &  file_regions,
const size_t  start_index,
const size_t  end_index,
FileReader &  file_reader,
ParseBufferRequest &  parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map,
const TextFileBufferParser &  parser 
)

Parses a set of file regions given a handle to the file and a range of indexes (start_index through end_index, both inclusive) for the file regions to be parsed.

Definition at line 162 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::ParseBufferRequest::begin_pos, foreign_storage::ParseBufferRequest::buffer, foreign_storage::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::ParseBufferRequest::end_pos, foreign_storage::ParseBufferRequest::file_offset, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::ParseBufferRequest::first_row_index, i, foreign_storage::TextFileBufferParser::parseBuffer(), foreign_storage::ParseBufferRequest::process_row_count, foreign_storage::FileReader::readRegion(), run_benchmark_import::result, and foreign_storage::ParseBufferResult::row_count.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunks().

169  {
170  ParseFileRegionResult load_file_region_result{};
171  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
172  load_file_region_result.row_count = 0;
173 
174  ParseBufferResult result;
175  for (size_t i = start_index; i <= end_index; i++) {
176  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
177  auto read_size = file_reader.readRegion(parse_file_request.buffer.get(),
178  file_regions[i].first_row_file_offset,
179  file_regions[i].region_size);
180  CHECK_EQ(file_regions[i].region_size, read_size);
181  parse_file_request.begin_pos = 0;
182  parse_file_request.end_pos = file_regions[i].region_size;
183  parse_file_request.first_row_index = file_regions[i].first_row_index;
184  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
185  parse_file_request.process_row_count = file_regions[i].row_count;
186 
187  result = parser.parseBuffer(parse_file_request, i == end_index);
188  CHECK_EQ(file_regions[i].row_count, result.row_count);
189  load_file_region_result.row_count += result.row_count;
190  }
191  load_file_region_result.column_id_to_data_blocks_map =
192  result.column_id_to_data_blocks_map;
193  return load_file_region_result;
194 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 329 of file AbstractTextFileDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

331  {
332  CHECK(buffer_row_count > 0);
333  std::vector<size_t> partitions{};
334  size_t remaining_rows_in_last_fragment;
335  if (start_row_index % max_fragment_size == 0) {
336  remaining_rows_in_last_fragment = 0;
337  } else {
338  remaining_rows_in_last_fragment =
339  max_fragment_size - (start_row_index % max_fragment_size);
340  }
341  if (buffer_row_count <= remaining_rows_in_last_fragment) {
342  partitions.emplace_back(buffer_row_count);
343  } else {
344  if (remaining_rows_in_last_fragment > 0) {
345  partitions.emplace_back(remaining_rows_in_last_fragment);
346  }
347  size_t remaining_buffer_row_count =
348  buffer_row_count - remaining_rows_in_last_fragment;
349  while (remaining_buffer_row_count > 0) {
350  partitions.emplace_back(
351  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
352  remaining_buffer_row_count -= partitions.back();
353  }
354  }
355  return partitions;
356 }
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the caller graph for this function:

template<typename T >
auto foreign_storage::partition_for_threads ( const std::set< T > &  items,
size_t  max_threads 
)

Definition at line 41 of file ParquetShared.h.

References i.

Referenced by foreign_storage::LazyParquetChunkLoader::metadataScan(), and foreign_storage::ParquetDataWrapper::populateChunkBuffers().

41  {
42  const size_t items_per_thread = (items.size() + (max_threads - 1)) / max_threads;
43  std::list<std::set<T>> items_by_thread;
44  auto i = 0U;
45  for (auto item : items) {
46  if (i++ % items_per_thread == 0) {
47  items_by_thread.emplace_back(std::set<T>{});
48  }
49  items_by_thread.back().emplace(item);
50  }
51  return items_by_thread;
52 }

+ Here is the caller graph for this function:

void foreign_storage::process_data_blocks ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const ParseBufferRequest &  request,
ParseBufferResult &  result,
std::map< int, const ColumnDescriptor * > &  column_by_id,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Updates metadata encapsulated in encoders for all table columns given new data blocks obtained from parsing a new set of rows in a file buffer. If a cache is available, the data blocks are also appended to chunks in the cache.

Definition at line 486 of file AbstractTextFileDataWrapper.cpp.

References add_file_region(), cache_blocks(), foreign_storage::MetadataScanMultiThreadingParams::cached_chunks, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::ParseBufferRequest::db_id, foreign_storage::ParseBufferRequest::first_row_index, foreign_storage::ParseBufferRequest::getFilePath(), foreign_storage::ParseBufferRequest::getMaxFragRows(), foreign_storage::ParseBufferRequest::getTableId(), foreign_storage::ParseBufferResult::row_count, and update_stats().

Referenced by scan_metadata().

491  {
492  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_encoder_buffers_mutex);
493  // File regions should be added in same order as appendData
494  add_file_region(fragment_id_to_file_regions_map,
495  fragment_id,
496  request.first_row_index,
497  result,
498  request.getFilePath());
499 
500  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
501  ChunkKey chunk_key{request.db_id, request.getTableId(), column_id, fragment_id};
502  const auto column = column_by_id[column_id];
503  if (column->columnType.is_varlen_indeed()) {
504  chunk_key.emplace_back(1);
505  }
506  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
507  multi_threading_params.chunk_encoder_buffers.end()) {
508  multi_threading_params.chunk_encoder_buffers[chunk_key] =
509  std::make_unique<ForeignStorageBuffer>();
510  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
511  column->columnType);
512  }
513  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
514  column->columnType,
515  data_block,
516  result.row_count);
517  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
518  ->getEncoder()
519  ->getNumElems() +
520  result.row_count;
521  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
522  num_elements);
523  cache_blocks(
524  multi_threading_params.cached_chunks,
525  data_block,
526  result.row_count,
527  chunk_key,
528  column,
529  (num_elements - result.row_count) == 0, // Is the first block added to this chunk
530  num_elements == request.getMaxFragRows() // Is the last block for this chunk
531  );
532  }
533 }
std::vector< int > ChunkKey
Definition: types.h:37
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
void cache_blocks(std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const ParseBufferResult &result, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 22 of file ForeignTableRefresh.cpp.

References CHUNK_KEY_FRAGMENT_IDX, Data_Namespace::CPU_LEVEL, Catalog_Namespace::DBMetadata::dbId, StorageType::FOREIGN_TABLE, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Catalog_Namespace::Catalog::getForeignTableUnlocked(), PostEvictionRefreshException::getOriginalException(), Data_Namespace::GPU_LEVEL, foreign_storage::ForeignTable::isAppendMode(), Catalog_Namespace::Catalog::removeFragmenterForTable(), and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

24  {
25  auto& data_mgr = catalog.getDataMgr();
26  auto table_lock =
27  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
29  catalog, table_name, false));
30 
31  const TableDescriptor* td = (*table_lock)();
32  if (td->storageType != StorageType::FOREIGN_TABLE) {
33  throw std::runtime_error{
34  table_name +
35  " is not a foreign table. Refreshes are applicable to only foreign tables."};
36  }
37 
38  catalog.removeFragmenterForTable(td->tableId);
39  ChunkKey table_key{catalog.getCurrentDB().dbId, td->tableId};
40 
41  if (catalog.getForeignTableUnlocked(td->tableId)->isAppendMode() &&
42  !evict_cached_entries) {
43  ChunkMetadataVector metadata_vec;
44  data_mgr.getChunkMetadataVecForKeyPrefix(metadata_vec, table_key);
45  int last_fragment_id = 0;
46  for (const auto& [key, metadata] : metadata_vec) {
47  if (key[CHUNK_KEY_FRAGMENT_IDX] > last_fragment_id) {
48  last_fragment_id = key[CHUNK_KEY_FRAGMENT_IDX];
49  }
50  }
51  for (const auto& [key, metadata] : metadata_vec) {
52  if (key[CHUNK_KEY_FRAGMENT_IDX] == last_fragment_id) {
53  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::CPU_LEVEL);
54  data_mgr.deleteChunksWithPrefix(key, MemoryLevel::GPU_LEVEL);
55  }
56  }
57  } else {
58  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::CPU_LEVEL);
59  data_mgr.deleteChunksWithPrefix(table_key, MemoryLevel::GPU_LEVEL);
60  }
61 
62  try {
63  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
64  table_key, evict_cached_entries);
65  catalog.updateForeignTableRefreshTimes(td->tableId);
66  } catch (PostEvictionRefreshException& e) {
67  catalog.updateForeignTableRefreshTimes(td->tableId);
68  throw e.getOriginalException();
69  }
70 }
std::vector< int > ChunkKey
Definition: types.h:37
const foreign_storage::ForeignTable * getForeignTableUnlocked(int tableId) const
Definition: Catalog.cpp:1757
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:223
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:42
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:222
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
bool isAppendMode() const
Checks if the table is in append mode.
void removeFragmenterForTable(const int table_id) const
Definition: Catalog.cpp:3502
static constexpr char const * FOREIGN_TABLE
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:4791

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

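Reallocates the given buffer to the current buffer allocation size (alloc_size) and updates buffer_size accordingly if the buffer size is less than that allocation size; otherwise the buffer is left unchanged. The reallocated buffer is a fresh allocation, so previous contents are not carried over.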

Definition at line 640 of file AbstractTextFileDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

642  {
643  CHECK_LE(buffer_size, alloc_size);
644  if (buffer_size < alloc_size) {
645  buffer = std::make_unique<char[]>(alloc_size);
646  buffer_size = alloc_size;
647  }
648 }
#define CHECK_LE(x, y)
Definition: Logger.h:220

+ Here is the caller graph for this function:

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
const TextFileBufferParser &  parser 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 552 of file AbstractTextFileDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, get_next_metadata_scan_request(), foreign_storage::TextFileBufferParser::parseBuffer(), partition_by_fragment(), foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, process_data_blocks(), and run_benchmark_import::result.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata().

554  {
555  std::map<int, const ColumnDescriptor*> column_by_id{};
556  while (true) {
557  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
558  if (!request_opt.has_value()) {
559  break;
560  }
561  auto& request = request_opt.value();
562  try {
563  if (column_by_id.empty()) {
564  for (const auto column : request.getColumns()) {
565  column_by_id[column->columnId] = column;
566  }
567  }
568  auto partitions = partition_by_fragment(
569  request.first_row_index, request.getMaxFragRows(), request.buffer_row_count);
570  request.begin_pos = 0;
571  size_t row_index = request.first_row_index;
572  for (const auto partition : partitions) {
573  request.process_row_count = partition;
574  for (const auto& import_buffer : request.import_buffers) {
575  if (import_buffer != nullptr) {
576  import_buffer->clear();
577  }
578  }
579  auto result = parser.parseBuffer(request, true);
580  int fragment_id = row_index / request.getMaxFragRows();
581  process_data_blocks(multi_threading_params,
582  fragment_id,
583  request,
584  result,
585  column_by_id,
586  fragment_id_to_file_regions_map);
587  row_index += result.row_count;
588  request.begin_pos = result.row_offsets.back() - request.file_offset;
589  }
590  } catch (...) {
591  // Re-add request to pool so we dont block any other threads
592  {
593  std::lock_guard<std::mutex> pending_requests_lock(
594  multi_threading_params.pending_requests_mutex);
595  multi_threading_params.continue_processing = false;
596  }
597  add_request_to_pool(multi_threading_params, request);
598  throw;
599  }
600  add_request_to_pool(multi_threading_params, request);
601  }
602 }
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
std::optional< ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)
void process_data_blocks(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const ParseBufferRequest &request, ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion &  file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 26 of file CsvShared.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::filename, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

28  {
29  json_val.SetObject();
30  json_utils::add_value_to_object(
31  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
32  json_utils::add_value_to_object(
33  json_val, file_region.first_row_index, "first_row_index", allocator);
34  json_utils::add_value_to_object(
35  json_val, file_region.region_size, "region_size", allocator);
36  json_utils::add_value_to_object(
37  json_val, file_region.row_count, "row_count", allocator);
38  if (file_region.filename.size()) {
39  json_utils::add_value_to_object(
40  json_val, file_region.filename, "filename", allocator);
41  }
42 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:119

+ Here is the call graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval &  value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 489 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

491  {
492  json_val.SetObject();
493  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
494  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
495  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
496 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:119

+ Here is the call graph for this function:

void foreign_storage::throw_file_access_error ( const std::string &  file_path,
const std::string &  message 
)
inline

Definition at line 52 of file ForeignStorageException.h.

53  {
54  std::string error_message{"Unable to access file \"" + file_path + "\". " + message};
55  throw ForeignStorageException{error_message};
56 }
void foreign_storage::throw_file_not_found_error ( const std::string &  file_path)
inline

Definition at line 58 of file ForeignStorageException.h.

58  {
59  throw ForeignStorageException{"File or directory \"" + file_path +
60  "\" does not exist."};
61 }
void foreign_storage::throw_number_of_columns_mismatch_error ( size_t  num_table_cols,
size_t  num_file_cols,
const std::string &  file_path 
)
inline

Definition at line 43 of file ForeignStorageException.h.

References to_string().

Referenced by foreign_storage::anonymous_namespace{CsvFileBufferParser.cpp}::validate_expected_column_count(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

45  {
46  throw ForeignStorageException{"Mismatched number of logical columns: (expected " +
47  std::to_string(num_table_cols) + " columns, has " +
48  std::to_string(num_file_cols) + "): in file '" +
49  file_path + "'"};
50 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::throw_parquet_metadata_out_of_bounds_error ( const std::string &  min_value,
const std::string &  max_value,
const std::string &  encountered_value 
)
inline

Definition at line 46 of file ParquetMetadataValidator.h.

Referenced by foreign_storage::TimestampBoundsValidator< T >::validateValue(), foreign_storage::IntegralFixedLengthBoundsValidator< T >::validateValue(), foreign_storage::DateInSecondsBoundsValidator< T >::validateValue(), and foreign_storage::FloatPointValidator< T >::validateValue().

49  {
50  std::stringstream error_message;
51  error_message << "Parquet column contains values that are outside the range of the "
52  "OmniSci column "
53  "type. Consider using a wider column type. Min allowed value: "
54  << min_value << ". Max allowed value: " << max_value
55  << ". Encountered value: " << encountered_value << ".";
56  throw std::runtime_error(error_message.str());
57 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 36 of file ForeignStorageException.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

36  {
37  throw ForeignStorageException{
38  "Refresh of foreign table created with \"APPEND\" update type failed as "
39  "file \"" +
40  file_path + "\" was removed."};
41 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_error ( const std::string &  file_path)
inline

Definition at line 29 of file ForeignStorageException.h.

Referenced by foreign_storage::SingleFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

29  {
30  throw ForeignStorageException{
31  "Refresh of foreign table created with \"APPEND\" update type failed as file "
32  "reduced in size: " +
33  file_path};
34 }

+ Here is the caller graph for this function:

void foreign_storage::update_stats ( Encoder *  encoder,
const SQLTypeInfo column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 420 of file AbstractTextFileDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by process_data_blocks(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumn(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata(), and StorageIOFacility::yieldUpdateCallback().

423  {
424  if (column_type.is_array()) {
425  encoder->updateStats(data_block.arraysPtr, 0, row_count);
426  } else if (!column_type.is_varlen()) {
427  encoder->updateStats(data_block.numbersPtr, row_count);
428  } else {
429  encoder->updateStats(data_block.stringsPtr, 0, row_count);
430  }
431 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:227
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:228
bool is_varlen() const
Definition: sqltypes.h:529
int8_t * numbersPtr
Definition: sqltypes.h:226
virtual void updateStats(const int64_t val, const bool is_null)=0
bool is_array() const
Definition: sqltypes.h:512

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< parquet::Statistics > foreign_storage::validate_and_get_column_metadata_statistics ( const parquet::ColumnChunkMetaData *  column_metadata)

Definition at line 85 of file ParquetShared.cpp.

References CHECK.

Referenced by foreign_storage::ParquetEncoder::getRowGroupMetadata(), foreign_storage::TypedParquetInPlaceEncoder< int64_t, int32_t >::getRowGroupMetadata(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels().

86  {
87  CHECK(column_metadata->is_stats_set());
88  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
89  return stats;
90 }
#define CHECK(condition)
Definition: Logger.h:209

+ Here is the caller graph for this function:

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 59 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

63  {
64  if (!reference_descriptor->Equals(*new_descriptor)) {
65  throw std::runtime_error{"Parquet file \"" + new_file_path +
66  "\" has a different schema. Please ensure that all Parquet "
67  "files use the same schema. Reference Parquet file: " +
68  reference_file_path +
69  ", column name: " + reference_descriptor->name() +
70  ". New Parquet file: " + new_file_path +
71  ", column name: " + new_descriptor->name() + "."};
72  }
73 }

+ Here is the caller graph for this function:

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor *  table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::InsertValuesStmt::determineLeafIndex(), Parser::TruncateTableStmt::execute(), Parser::InsertValuesStmt::execute(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, TRUNCATE, OR UPDATE commands are not supported for foreign "
26  "tables."};
27  }
28 }
std::string storageType
static constexpr char const * FOREIGN_TABLE

+ Here is the caller graph for this function: