OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage Namespace Reference

Namespaces

 anonymous_namespace{AbstractFileStorageDataWrapper.cpp}
 
 anonymous_namespace{AbstractTextFileDataWrapper.cpp}
 
 anonymous_namespace{CachingForeignStorageMgr.cpp}
 
 anonymous_namespace{CsvFileBufferParser.cpp}
 
 anonymous_namespace{FileReader.cpp}
 
 anonymous_namespace{ForeignDataWrapperFactory.cpp}
 
 anonymous_namespace{ForeignStorageCache.cpp}
 
 anonymous_namespace{ForeignTableRefresh.cpp}
 
 anonymous_namespace{InternalCatalogDataWrapper.cpp}
 
 anonymous_namespace{InternalMemoryStatsDataWrapper.cpp}
 
 anonymous_namespace{InternalStorageStatsDataWrapper.cpp}
 
 anonymous_namespace{InternalSystemDataWrapper.cpp}
 
 anonymous_namespace{LazyParquetChunkLoader.cpp}
 
 anonymous_namespace{LogFileBufferParser.cpp}
 
 anonymous_namespace{ParquetDataWrapper.cpp}
 
 anonymous_namespace{RefreshTimeCalculator.cpp}
 
 anonymous_namespace{RegexFileBufferParser.cpp}
 
 anonymous_namespace{RegexParserDataWrapper.cpp}
 
 anonymous_namespace{S3FilePathUtil.cpp}
 
 anonymous_namespace{TextFileBufferParser.cpp}
 
 Csv
 
 json_utils
 

Classes

struct  ForeignServer
 
struct  ForeignTable
 
struct  OptionsContainer
 
struct  UserMappingType
 
struct  UserMapping
 
class  RefreshTimeCalculator
 
class  AbstractFileStorageDataWrapper
 
struct  ParseFileRegionResult
 
struct  MetadataScanMultiThreadingParams
 
class  AbstractTextFileDataWrapper
 
class  CachingForeignStorageMgr
 
class  CsvDataWrapper
 
class  CsvFileBufferParser
 
struct  DataPreview
 
class  FileReader
 
class  SingleFileReader
 
class  SingleTextFileReader
 
class  ArchiveWrapper
 
class  CompressedFileReader
 
class  MultiFileReader
 
class  LocalMultiFileReader
 
struct  FileRegion
 
class  ForeignDataWrapper
 
struct  DataWrapperType
 Encapsulates an enumeration of foreign data wrapper type strings. More...
 
class  ForeignDataWrapperFactory
 
class  ForeignStorageBuffer
 
class  ForeignStorageCache
 
class  ForeignStorageException
 
class  MetadataScanInfeasibleFragmentSizeException
 
class  ChunkSizeValidator
 
class  MockForeignDataWrapper
 
class  ForeignStorageMgr
 
class  ForeignTableSchema
 
class  GeospatialEncoder
 
class  InternalCatalogDataWrapper
 
class  InternalLogsDataWrapper
 
class  InternalMemoryStatsDataWrapper
 
struct  StorageDetails
 
class  InternalStorageStatsDataWrapper
 
class  InternalSystemDataWrapper
 
struct  ColumnType
 
struct  FragmentType
 
struct  Interval
 
struct  ParquetBatchData
 
class  ParquetRowGroupReader
 
struct  PreviewContext
 
class  LazyParquetChunkLoader
 
class  LogFileBufferParser
 
class  OdbcGeospatialEncoder
 
class  ParquetArrayDetectEncoder
 
class  ParquetArrayEncoder
 
class  ParquetArrayImportEncoder
 
class  ParquetDataWrapper
 
class  ParquetDateInDaysFromTimestampEncoder
 
class  ParquetDateInSecondsEncoder
 
class  ParquetDecimalEncoder
 
class  ParquetDetectStringEncoder
 
class  ParquetEncoder
 
class  ParquetImportEncoder
 
class  ParquetScalarEncoder
 
class  ParquetFixedLengthArrayEncoder
 
class  ParquetFixedLengthEncoder
 
class  ParquetUnsignedFixedLengthEncoder
 
class  ParquetGeospatialEncoder
 
class  ParquetGeospatialImportEncoder
 
class  RowGroupIntervalTracker
 
class  ParquetImportBatchResult
 
class  AbstractRowGroupIntervalTracker
 
class  ParquetImporter
 
class  ParquetInPlaceEncoder
 
class  TypedParquetInPlaceEncoder
 
class  ParquetMetadataValidator
 
class  TimestampBoundsValidator
 
class  IntegralFixedLengthBoundsValidator
 
class  BaseDateBoundsValidator
 
class  FloatPointValidator
 
struct  RowGroupInterval
 
struct  RowGroupMetadata
 
class  FileReaderMap
 
class  ParquetStringEncoder
 
class  ParquetStringImportEncoder
 
class  ParquetStringNoneEncoder
 
class  ParquetTimeEncoder
 
class  ParquetTimestampEncoder
 
class  ParquetVariableLengthArrayEncoder
 
class  PassThroughBuffer
 
class  RegexFileBufferParser
 
class  RegexParserDataWrapper
 
class  FileOrderS3
 
struct  ParseBufferRequest
 
struct  ParseBufferResult
 
class  TextFileBufferParser
 
class  TypedParquetDetectBuffer
 
class  TypedParquetStorageBuffer
 
class  ForeignTableRefreshScheduler
 

Typedefs

using OptionsMap = std::map< std::string, std::string, std::less<>>
 
using SampleRows = std::vector< std::vector< std::string >>
 
using FirstLineByFilePath = std::map< std::string, std::string >
 
using FileRegions = std::vector< FileRegion >
 
using ChunkToBufferMap = std::map< ChunkKey, AbstractBuffer * >
 
using RenderGroupAnalyzerMap = std::map< int, std::unique_ptr< import_export::RenderGroupAnalyzer >>
 
using read_lock = heavyai::shared_lock< heavyai::shared_mutex >
 
using write_lock = heavyai::unique_lock< heavyai::shared_mutex >
 
using FilePathAndRowGroup = std::pair< std::string, int32_t >
 
using RejectedRowIndices = std::set< int64_t >
 
using InvalidRowGroupIndices = std::set< int64_t >
 
template<typename T >
using DateInSecondsBoundsValidator = BaseDateBoundsValidator< T, true >
 
template<typename T >
using DateInDaysBoundsValidator = BaseDateBoundsValidator< T, false >
 
using UniqueReaderPtr = std::unique_ptr< parquet::arrow::FileReader >
 
using ReaderPtr = parquet::arrow::FileReader *
 
template<typename V , typename T , T conversion_denominator, typename NullType = V>
using ParquetDateInSecondsFromTimestampEncoder = ParquetTimestampEncoder< V, T, conversion_denominator, NullType >
 
using S3ObjectComparator = std::function< bool(const Aws::S3::Model::Object &, const Aws::S3::Model::Object &)>
 

Functions

ParseFileRegionResult parse_file_regions (const FileRegions &file_regions, const size_t start_index, const size_t end_index, FileReader &file_reader, ParseBufferRequest &parse_file_request, const std::map< int, Chunk_NS::Chunk > &column_id_to_chunk_map, const TextFileBufferParser &parser)
 
size_t get_buffer_size (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size)
 
size_t get_buffer_size (const FileRegions &file_regions)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const bool size_known, const size_t file_size, const size_t buffer_size)
 
size_t get_thread_count (const import_export::CopyParams &copy_params, const FileRegions &file_regions)
 
std::vector< size_t > partition_by_fragment (const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
 
std::optional< ParseBufferRequest > get_next_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params)
 
void add_file_region (std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const ParseBufferResult &result, const std::string &file_path)
 
void update_stats (Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
 
void cache_blocks (std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block, bool disable_cache)
 
void process_data_blocks (MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const ParseBufferRequest &request, ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)
 
void add_request_to_pool (MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
 
void scan_metadata (MetadataScanMultiThreadingParams &multi_threading_params, std::map< int, FileRegions > &fragment_id_to_file_regions_map, const TextFileBufferParser &parser)
 
ParseBufferRequest get_request_from_pool (MetadataScanMultiThreadingParams &multi_threading_params)
 
void dispatch_metadata_scan_request (MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
 
void resize_buffer_if_needed (std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
 
void dispatch_metadata_scan_requests (const size_t &buffer_size, const std::string &file_path, FileReader &file_reader, const import_export::CopyParams &copy_params, MetadataScanMultiThreadingParams &multi_threading_params, size_t &first_row_index_in_buffer, size_t &current_file_offset, const TextFileBufferParser &parser)
 
void set_value (rapidjson::Value &json_val, const FileRegion &file_region, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, FileRegion &file_region)
 
std::optional< SQLTypes > detect_geo_type (const SampleRows &sample_rows, size_t column_index)
 
bool is_s3_uri (const std::string &file_path)
 
std::tuple< std::unique_ptr
< foreign_storage::ForeignServer >
, std::unique_ptr
< foreign_storage::UserMapping >
, std::unique_ptr
< foreign_storage::ForeignTable > > 
create_proxy_fsi_objects (const std::string &copy_from_source, const import_export::CopyParams &copy_params, const int db_id, const TableDescriptor *table, const int32_t user_id)
 Create proxy fsi objects for use outside FSI. More...
 
std::tuple< std::unique_ptr
< foreign_storage::ForeignServer >
, std::unique_ptr
< foreign_storage::UserMapping >
, std::unique_ptr
< foreign_storage::ForeignTable > > 
create_proxy_fsi_objects (const std::string &copy_from_source, const import_export::CopyParams &copy_params, const TableDescriptor *table)
 Create proxy fsi objects for use outside FSI. NOTE: parameters mirror function above. More...
 
void validate_regex_parser_options (const import_export::CopyParams &copy_params)
 
bool is_valid_source_type (const import_export::CopyParams &copy_params)
 
std::string bool_to_option_value (const bool value)
 
void throw_unexpected_number_of_items (const size_t &num_expected, const size_t &num_loaded, const std::string &item_type)
 
void throw_removed_row_in_result_set_error (const std::string &select_statement)
 
void throw_removed_row_in_file_error (const std::string &file_path)
 
void throw_removed_file_error (const std::string &file_path)
 
void throw_number_of_columns_mismatch_error (size_t num_table_cols, size_t num_file_cols, const std::string &file_path)
 
void throw_file_access_error (const std::string &file_path, const std::string &message)
 
void throw_file_not_found_error (const std::string &file_path)
 
void throw_s3_compressed_mime_type (const std::string &file_path, const std::string &mime_type)
 
void throw_s3_compressed_extension (const std::string &file_path, const std::string &ext_type)
 
bool is_append_table_chunk_key (const ChunkKey &chunk_key)
 
bool set_comp (const ChunkKey &left, const ChunkKey &right)
 
std::vector< ChunkKey > get_column_key_vec (const ChunkKey &destination_chunk_key)
 
std::set< ChunkKey > get_column_key_set (const ChunkKey &destination_chunk_key)
 
size_t get_max_chunk_size (const ChunkKey &key)
 
bool contains_fragment_key (const std::set< ChunkKey > &key_set, const ChunkKey &target_key)
 
bool is_table_enabled_on_node (const ChunkKey &key)
 
void refresh_foreign_table_unlocked (Catalog_Namespace::Catalog &catalog, const ForeignTable &td, const bool evict_cached_entries)
 
void refresh_foreign_table (Catalog_Namespace::Catalog &catalog, const std::string &table_name, const bool evict_cached_entries)
 
void init_chunk_for_column (const ChunkKey &chunk_key, const std::map< ChunkKey, std::shared_ptr< ChunkMetadata >> &chunk_metadata_map, const std::map< ChunkKey, AbstractBuffer * > &buffers, Chunk_NS::Chunk &chunk)
 
std::shared_ptr< ChunkMetadata > get_placeholder_metadata (const SQLTypeInfo &type, size_t num_elements)
 
const
foreign_storage::ForeignTable & 
get_foreign_table_for_key (const ChunkKey &key)
 
bool is_system_table_chunk_key (const ChunkKey &chunk_key)
 
bool is_replicated_table_chunk_key (const ChunkKey &chunk_key)
 
bool is_shardable_key (const ChunkKey &key)
 
bool fragment_maps_to_leaf (const ChunkKey &key)
 
bool key_does_not_shard_to_leaf (const ChunkKey &key)
 
template<typename T >
auto partition_for_threads (const std::set< T > &items, size_t max_threads)
 
template<typename T >
auto partition_for_threads (const std::vector< T > &items, size_t max_threads)
 
template<typename Container >
std::vector< std::future< void > > create_futures_for_workers (const Container &items, size_t max_threads, std::function< void(const Container &)> lambda)
 
template<typename T >
ArrayDatum encode_as_array_datum (const std::vector< T > &data)
 
std::string get_db_name (int32_t db_id)
 
std::string get_table_name (int32_t db_id, int32_t table_id)
 
void set_node_name (std::map< std::string, import_export::TypedImportBuffer * > &import_buffers)
 
void set_value (rapidjson::Value &json_val, const RowGroupInterval &value, rapidjson::Document::AllocatorType &allocator)
 
void get_value (const rapidjson::Value &json_val, RowGroupInterval &value)
 
template<typename D , typename T >
bool check_bounds (const T &value)
 
template<typename D >
std::string datetime_to_string (const D &timestamp, const SQLTypeInfo &column_type)
 
void throw_parquet_metadata_out_of_bounds_error (const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
 
UniqueReaderPtr open_parquet_table (const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
 
std::pair< int, int > get_parquet_table_size (const ReaderPtr &reader)
 
const parquet::ColumnDescriptor * get_column_descriptor (const parquet::arrow::FileReader *reader, const int logical_column_index)
 
parquet::Type::type get_physical_type (ReaderPtr &reader, const int logical_column_index)
 
void validate_equal_column_descriptor (const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
 
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor (const ColumnDescriptor *column)
 
std::shared_ptr
< parquet::Statistics > 
validate_and_get_column_metadata_statistics (const parquet::ColumnChunkMetaData *column_metadata)
 
std::vector
< Aws::S3::Model::Object > 
s3_objects_filter_sort_files (const std::vector< Aws::S3::Model::Object > &file_paths, const shared::FilePathOptions &options)
 
template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > get_min_max_bounds ()
 
void populate_string_dictionary (const int32_t table_id, const int32_t col_id, const Catalog_Namespace::Catalog &catalog)
 
void validate_non_foreign_table_write (const TableDescriptor *table_descriptor)
 

Variables

constexpr const char * kDeletedValueIndicator {"<DELETED>"}
 

Typedef Documentation

using foreign_storage::ChunkToBufferMap = typedef std::map<ChunkKey, AbstractBuffer*>

Definition at line 34 of file ForeignDataWrapper.h.

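template<typename T >
using foreign_storage::DateInDaysBoundsValidator = typedef BaseDateBoundsValidator<T, false>

Definition at line 277 of file ParquetMetadataValidator.h.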

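template<typename T >
using foreign_storage::DateInSecondsBoundsValidator = typedef BaseDateBoundsValidator<T, true>

Definition at line 274 of file ParquetMetadataValidator.h.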

using foreign_storage::FilePathAndRowGroup = typedef std::pair<std::string, int32_t>

Definition at line 40 of file ParquetDataWrapper.h.

using foreign_storage::FileRegions = typedef std::vector<FileRegion>

Definition at line 60 of file FileRegions.h.

using foreign_storage::FirstLineByFilePath = typedef std::map<std::string, std::string>

Definition at line 37 of file FileReader.h.

using foreign_storage::InvalidRowGroupIndices = typedef std::set<int64_t>

Definition at line 123 of file ParquetEncoder.h.

using foreign_storage::OptionsMap = typedef std::map<std::string, std::string, std::less<>>

Definition at line 30 of file OptionsContainer.h.

template<typename V , typename T , T conversion_denominator, typename NullType = V>
using foreign_storage::ParquetDateInSecondsFromTimestampEncoder = typedef ParquetTimestampEncoder<V, T, conversion_denominator, NullType>

Definition at line 89 of file ParquetTimestampEncoder.h.

using foreign_storage::ReaderPtr = typedef parquet::arrow::FileReader*

Definition at line 33 of file ParquetShared.h.

using foreign_storage::RejectedRowIndices = typedef std::set<int64_t>

Definition at line 28 of file ParquetEncoder.h.

using foreign_storage::RenderGroupAnalyzerMap = typedef std::map<int, std::unique_ptr<import_export::RenderGroupAnalyzer>>

Definition at line 37 of file ForeignDataWrapper.h.

using foreign_storage::S3ObjectComparator = typedef std::function<bool(const Aws::S3::Model::Object&, const Aws::S3::Model::Object&)>

Definition at line 17 of file S3FilePathUtil.h.

using foreign_storage::SampleRows = typedef std::vector<std::vector<std::string>>

Definition at line 26 of file DataPreview.h.

using foreign_storage::UniqueReaderPtr = typedef std::unique_ptr<parquet::arrow::FileReader>

Definition at line 32 of file ParquetShared.h.

Function Documentation

void foreign_storage::add_file_region ( std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
int  fragment_id,
size_t  first_row_index,
const ParseBufferResult &  result,
const std::string &  file_path 
)

Creates a new file region based on metadata from parsed file buffers and adds the new region to the fragment id to file regions map.

Definition at line 478 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::ParseBufferResult::row_count, and foreign_storage::ParseBufferResult::row_offsets.

Referenced by process_data_blocks().

482  {
483  fragment_id_to_file_regions_map[fragment_id].emplace_back(
484  // file naming is handled by FileReader
485  FileRegion(file_path,
486  result.row_offsets.front(),
487  first_row_index,
488  result.row_count,
489  result.row_offsets.back() - result.row_offsets.front()));
490 }

+ Here is the caller graph for this function:

void foreign_storage::add_request_to_pool ( MetadataScanMultiThreadingParams &  multi_threading_params,
ParseBufferRequest &  request 
)

Adds the request object for a processed request back to the request pool for reuse in subsequent requests.

Definition at line 624 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests(), and scan_metadata().

625  {
626  std::unique_lock<std::mutex> completed_requests_queue_lock(
627  multi_threading_params.request_pool_mutex);
628  multi_threading_params.request_pool.emplace(std::move(request));
629  completed_requests_queue_lock.unlock();
630  multi_threading_params.request_pool_condition.notify_all();
631 }

+ Here is the caller graph for this function:

std::string foreign_storage::bool_to_option_value ( const bool  value)

Definition at line 115 of file ForeignDataWrapperFactory.cpp.

Referenced by foreign_storage::ForeignDataWrapperFactory::createForeignTableProxy().

115  {
116  return value ? "TRUE" : "FALSE";
117 }

+ Here is the caller graph for this function:

void foreign_storage::cache_blocks ( std::map< ChunkKey, Chunk_NS::Chunk > &  cached_chunks,
DataBlockPtr  data_block,
size_t  row_count,
ChunkKey &  chunk_key,
const ColumnDescriptor *  column,
bool  is_first_block,
bool  is_last_block,
bool  disable_cache 
)

Definition at line 524 of file AbstractTextFileDataWrapper.cpp.

References CHECK, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_FRAGMENT_IDX, CHUNK_KEY_TABLE_IDX, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::get_cache_if_enabled(), Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), SQLTypeInfo::is_varlen_indeed(), and key_does_not_shard_to_leaf().

Referenced by process_data_blocks().

531  {
532  auto catalog =
533  Catalog_Namespace::SysCatalog::instance().getCatalog(chunk_key[CHUNK_KEY_DB_IDX]);
534  CHECK(catalog);
535  auto cache = get_cache_if_enabled(catalog, disable_cache);
536  if (cache) {
537  // This extra filter needs to be here because this wrapper is the only one that
538  // accesses the cache directly and it should not be inserting chunks which are not
539  // mapped to the current leaf (in distributed mode).
540  if (key_does_not_shard_to_leaf(chunk_key)) {
541  return;
542  }
543 
544  ChunkKey index_key = {chunk_key[CHUNK_KEY_DB_IDX],
545  chunk_key[CHUNK_KEY_TABLE_IDX],
546  chunk_key[CHUNK_KEY_COLUMN_IDX],
547  chunk_key[CHUNK_KEY_FRAGMENT_IDX],
548  2};
549  // Create actual data chunks to prepopulate cache
550  if (cached_chunks.find(chunk_key) == cached_chunks.end()) {
551  cached_chunks[chunk_key] = Chunk_NS::Chunk{column, false};
552  cached_chunks[chunk_key].setBuffer(
553  cache->getChunkBufferForPrecaching(chunk_key, is_first_block));
554  if (column->columnType.is_varlen_indeed()) {
555  cached_chunks[chunk_key].setIndexBuffer(
556  cache->getChunkBufferForPrecaching(index_key, is_first_block));
557  }
558  if (is_first_block) {
559  cached_chunks[chunk_key].initEncoder();
560  }
561  }
562  cached_chunks[chunk_key].appendData(data_block, row_count, 0);
563  }
564 }
std::vector< int > ChunkKey
Definition: types.h:36
foreign_storage::ForeignStorageCache * get_cache_if_enabled(std::shared_ptr< Catalog_Namespace::Catalog > &catalog, const bool disable_cache)
#define CHUNK_KEY_DB_IDX
Definition: types.h:38
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:41
bool key_does_not_shard_to_leaf(const ChunkKey &key)
static SysCatalog & instance()
Definition: SysCatalog.h:337
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:39
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:222
SQLTypeInfo columnType
bool is_varlen_indeed() const
Definition: sqltypes.h:542
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:40

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename D , typename T >
bool foreign_storage::check_bounds ( const T &  value)
inline

Definition at line 32 of file ParquetMetadataValidator.h.

32  {
33  auto [min_value, max_value] = get_min_max_bounds<D>();
34  return value >= min_value && value <= max_value;
35 }
bool foreign_storage::contains_fragment_key ( const std::set< ChunkKey > &  key_set,
const ChunkKey &  target_key 
)
template<typename Container >
std::vector<std::future<void> > foreign_storage::create_futures_for_workers ( const Container &  items,
size_t  max_threads,
std::function< void(const Container &)>  lambda 
)

Definition at line 74 of file FsiChunkUtils.h.

References threading_serial::async(), and partition_for_threads().

Referenced by import_export::ForeignDataImporter::importGeneralS3(), foreign_storage::ParquetDataWrapper::populateChunkBuffers(), and foreign_storage::LazyParquetChunkLoader::previewFiles().

77  {
78  auto items_per_thread = partition_for_threads(items, max_threads);
79  std::vector<std::future<void>> futures;
80  for (const auto& items : items_per_thread) {
81  futures.emplace_back(std::async(std::launch::async, lambda, items));
82  }
83 
84  return futures;
85 }
auto partition_for_threads(const std::set< T > &items, size_t max_threads)
Definition: FsiChunkUtils.h:41
future< Result > async(Fn &&fn, Args &&...args)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::tuple< std::unique_ptr< foreign_storage::ForeignServer >, std::unique_ptr< foreign_storage::UserMapping >, std::unique_ptr< foreign_storage::ForeignTable > > foreign_storage::create_proxy_fsi_objects ( const std::string &  copy_from_source,
const import_export::CopyParams &  copy_params,
const int  db_id,
const TableDescriptor *  table,
const int32_t  user_id 
)

Create proxy fsi objects for use outside FSI.

Parameters
copy_from_source - the source that will be copied
copy_params - CopyParams that specify parameters around use case
db_id - db id of database in use case
table - the table descriptor of the table in use case
user_id - the user id of user in use case
Returns
tuple of FSI objects that can be used for particular use case (for example import or detect)

Definition at line 44 of file ForeignDataWrapperFactory.cpp.

References CHECK, foreign_storage::ForeignDataWrapperFactory::createForeignServerProxy(), foreign_storage::ForeignDataWrapperFactory::createForeignTableProxy(), and foreign_storage::ForeignDataWrapperFactory::createUserMappingProxyIfApplicable().

Referenced by create_proxy_fsi_objects(), and import_export::ForeignDataImporter::importGeneral().

48  {
49  auto server = foreign_storage::ForeignDataWrapperFactory::createForeignServerProxy(
50  db_id, user_id, copy_from_source, copy_params);
51 
52  CHECK(server);
53  server->validate();
54 
55  auto user_mapping =
56  foreign_storage::ForeignDataWrapperFactory::createUserMappingProxyIfApplicable(
57  db_id, user_id, copy_from_source, copy_params, server.get());
58 
59  if (user_mapping) {
60  user_mapping->validate(server.get());
61  }
62 
63  auto foreign_table =
64  foreign_storage::ForeignDataWrapperFactory::createForeignTableProxy(
65  db_id, table, copy_from_source, copy_params, server.get());
66 
67  CHECK(foreign_table);
68  foreign_table->validateOptionValues();
69 
70  return {std::move(server), std::move(user_mapping), std::move(foreign_table)};
71 }
static std::unique_ptr< ForeignServer > createForeignServerProxy(const int db_id, const int user_id, const std::string &file_path, const import_export::CopyParams &copy_params)
static std::unique_ptr< ForeignTable > createForeignTableProxy(const int db_id, const TableDescriptor *table, const std::string &file_path, const import_export::CopyParams &copy_params, const ForeignServer *server)
#define CHECK(condition)
Definition: Logger.h:222
static std::unique_ptr< UserMapping > createUserMappingProxyIfApplicable(const int db_id, const int user_id, const std::string &file_path, const import_export::CopyParams &copy_params, const ForeignServer *server)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::tuple< std::unique_ptr< foreign_storage::ForeignServer >, std::unique_ptr< foreign_storage::UserMapping >, std::unique_ptr< foreign_storage::ForeignTable > > foreign_storage::create_proxy_fsi_objects ( const std::string &  copy_from_source,
const import_export::CopyParams &  copy_params,
const TableDescriptor *  table 
)

Create proxy fsi objects for use outside FSI. NOTE: parameters mirror function above.

Definition at line 76 of file ForeignDataWrapperFactory.cpp.

References create_proxy_fsi_objects().

78  {
79  return create_proxy_fsi_objects(copy_from_source, copy_params, -1, table, -1);
80 }
std::tuple< std::unique_ptr< foreign_storage::ForeignServer >, std::unique_ptr< foreign_storage::UserMapping >, std::unique_ptr< foreign_storage::ForeignTable > > create_proxy_fsi_objects(const std::string &copy_from_source, const import_export::CopyParams &copy_params, const int db_id, const TableDescriptor *table, const int32_t user_id)
Create proxy fsi objects for use outside FSI.

+ Here is the call graph for this function:

template<typename D >
std::string foreign_storage::datetime_to_string ( const D &  timestamp,
const SQLTypeInfo &  column_type 
)
inline

Definition at line 38 of file ParquetMetadataValidator.h.

References Datum::bigintval, CHECK, DatumToString(), SQLTypeInfo::is_date(), and SQLTypeInfo::is_timestamp().

Referenced by foreign_storage::TimestampBoundsValidator< T >::getMinMaxBoundsAsStrings(), foreign_storage::BaseDateBoundsValidator< T, is_in_seconds >::getMinMaxBoundsAsStrings(), foreign_storage::TimestampBoundsValidator< T >::validateValue(), and foreign_storage::BaseDateBoundsValidator< T, is_in_seconds >::validateValue().

39  {
40  CHECK(column_type.is_timestamp() || column_type.is_date());
41  Datum d;
42  d.bigintval = timestamp;
43  return DatumToString(d, column_type);
44 }
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:392
bool is_timestamp() const
Definition: sqltypes.h:895
int64_t bigintval
Definition: sqltypes.h:215
#define CHECK(condition)
Definition: Logger.h:222
bool is_date() const
Definition: sqltypes.h:883

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::optional< SQLTypes > foreign_storage::detect_geo_type ( const SampleRows &  sample_rows,
size_t  column_index 
)

Definition at line 22 of file DataPreview.cpp.

References CHECK_EQ, CHECK_LT, kLINESTRING, kMULTIPOLYGON, kNULLT, kPOINT, kPOLYGON, and UNREACHABLE.

Referenced by foreign_storage::LazyParquetChunkLoader::previewFiles().

23  {
24  std::optional<SQLTypes> tentative_geo_type{};
25  for (const auto& row : sample_rows) {
26  static std::regex geo_regex{
27  "\\s*(POINT|LINESTRING|POLYGON|MULTIPOLYGON)\\s*\\(.+\\)\\s*"};
28  std::smatch match;
29  CHECK_LT(column_index, row.size());
30  if (std::regex_match(row[column_index], match, geo_regex)) {
31  CHECK_EQ(match.size(), static_cast<size_t>(2));
32  SQLTypes geo_type{kNULLT};
33  const auto& geo_type_str = match[1];
34  if (geo_type_str == "POINT") {
35  geo_type = kPOINT;
36  } else if (geo_type_str == "LINESTRING") {
37  geo_type = kLINESTRING;
38  } else if (geo_type_str == "POLYGON") {
39  geo_type = kPOLYGON;
40  } else if (geo_type_str == "MULTIPOLYGON") {
41  geo_type = kMULTIPOLYGON;
42  } else {
43  UNREACHABLE() << "Unexpected geo type match: " << geo_type_str;
44  }
45  if (tentative_geo_type.has_value()) {
46  if (tentative_geo_type.value() != geo_type) {
47  return {}; // geo type does not match between rows, can not be imported
48  }
49  } else {
50  tentative_geo_type = geo_type;
51  }
52  }
53  }
54  return tentative_geo_type;
55 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
SQLTypes
Definition: sqltypes.h:38
#define UNREACHABLE()
Definition: Logger.h:266
#define CHECK_LT(x, y)
Definition: Logger.h:232

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params,
ParseBufferRequest &  request 
)

Dispatches a new metadata scan request by adding the request to the pending requests queue to be consumed by a worker thread.

Definition at line 710 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by dispatch_metadata_scan_requests().

712  {
713  {
714  std::unique_lock<std::mutex> pending_requests_lock(
715  multi_threading_params.pending_requests_mutex);
716  multi_threading_params.pending_requests.emplace(std::move(request));
717  }
718  multi_threading_params.pending_requests_condition.notify_all();
719 }

+ Here is the caller graph for this function:

void foreign_storage::dispatch_metadata_scan_requests ( const size_t &  buffer_size,
const std::string &  file_path,
FileReader &  file_reader,
const import_export::CopyParams &  copy_params,
MetadataScanMultiThreadingParams &  multi_threading_params,
size_t &  first_row_index_in_buffer,
size_t &  current_file_offset,
const TextFileBufferParser &  parser 
)

Reads from a text file iteratively and dispatches metadata scan requests that are processed by worker threads.

Definition at line 739 of file AbstractTextFileDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, dispatch_metadata_scan_request(), foreign_storage::TextFileBufferParser::findRowEndPosition(), get_request_from_pool(), foreign_storage::FileReader::getCurrentFilePath(), foreign_storage::FileReader::isScanFinished(), import_export::CopyParams::line_delim, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, foreign_storage::FileReader::read(), and resize_buffer_if_needed().

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata().

747  {
748  auto alloc_size = buffer_size;
749  auto residual_buffer = std::make_unique<char[]>(alloc_size);
750  size_t residual_buffer_size = 0;
751  size_t residual_buffer_alloc_size = alloc_size;
752 
753  while (!file_reader.isScanFinished()) {
754  {
755  std::lock_guard<std::mutex> pending_requests_lock(
756  multi_threading_params.pending_requests_mutex);
757  if (!multi_threading_params.continue_processing) {
758  break;
759  }
760  }
761  auto request = get_request_from_pool(multi_threading_params);
762  request.full_path = file_reader.getCurrentFilePath();
763  resize_buffer_if_needed(request.buffer, request.buffer_alloc_size, alloc_size);
764 
765  if (residual_buffer_size > 0) {
766  memcpy(request.buffer.get(), residual_buffer.get(), residual_buffer_size);
767  }
768  size_t size = residual_buffer_size;
769  size += file_reader.read(request.buffer.get() + residual_buffer_size,
770  alloc_size - residual_buffer_size);
771 
772  if (size == 0) {
773  // In some cases at the end of a file we will read 0 bytes even when
774  // file_reader.isScanFinished() is false. Also add request back to the pool to be
775  // picked up again in the next iteration.
776  add_request_to_pool(multi_threading_params, request);
777  continue;
778  } else if (size == 1 && request.buffer[0] == copy_params.line_delim) {
779  // In some cases files with newlines at the end will be encoded with a second
780  // newline that can end up being the only thing in the buffer. Also add request
781  // back to the pool to be picked up again in the next iteration.
782  current_file_offset++;
783  add_request_to_pool(multi_threading_params, request);
784  continue;
785  }
786  unsigned int num_rows_in_buffer = 0;
787  request.end_pos = parser.findRowEndPosition(alloc_size,
788  request.buffer,
789  size,
790  copy_params,
791  first_row_index_in_buffer,
792  num_rows_in_buffer,
793  &file_reader);
794  request.buffer_size = size;
795  request.buffer_alloc_size = alloc_size;
796  request.first_row_index = first_row_index_in_buffer;
797  request.file_offset = current_file_offset;
798  request.buffer_row_count = num_rows_in_buffer;
799 
800  residual_buffer_size = size - request.end_pos;
801  if (residual_buffer_size > 0) {
802  resize_buffer_if_needed(residual_buffer, residual_buffer_alloc_size, alloc_size);
803  memcpy(residual_buffer.get(),
804  request.buffer.get() + request.end_pos,
805  residual_buffer_size);
806  }
807 
808  current_file_offset += request.end_pos;
809  first_row_index_in_buffer += num_rows_in_buffer;
810 
811  if (num_rows_in_buffer > 0) {
812  dispatch_metadata_scan_request(multi_threading_params, request);
813  } else {
814  add_request_to_pool(multi_threading_params, request);
815  }
816  }
817 
818  std::unique_lock<std::mutex> pending_requests_queue_lock(
819  multi_threading_params.pending_requests_mutex);
820  multi_threading_params.pending_requests_condition.wait(
821  pending_requests_queue_lock, [&multi_threading_params] {
822  return multi_threading_params.pending_requests.empty() ||
823  (multi_threading_params.continue_processing == false);
824  });
825  multi_threading_params.continue_processing = false;
826  pending_requests_queue_lock.unlock();
827  multi_threading_params.pending_requests_condition.notify_all();
828 }
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
ParseBufferRequest get_request_from_pool(MetadataScanMultiThreadingParams &multi_threading_params)
void resize_buffer_if_needed(std::unique_ptr< char[]> &buffer, size_t &buffer_size, const size_t alloc_size)
void dispatch_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
ArrayDatum foreign_storage::encode_as_array_datum ( const std::vector< T > &  data)
inline

Definition at line 33 of file GeospatialEncoder.h.

References heavydb.dtypes::T.

Referenced by foreign_storage::GeospatialEncoder::processGeoElement(), and foreign_storage::GeospatialEncoder::processNullGeoElement().

33  {
34  const size_t num_bytes = data.size() * sizeof(T);
35  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
36  memcpy(buffer.get(), data.data(), num_bytes);
37  return ArrayDatum(num_bytes, buffer, false);
38 }
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:208

+ Here is the caller graph for this function:

bool foreign_storage::fragment_maps_to_leaf ( const ChunkKey key)

Definition at line 126 of file FsiChunkUtils.cpp.

References CHECK, g_distributed_leaf_idx, g_distributed_num_leaves, get_fragment(), dist::is_aggregator(), and dist::is_distributed().

Referenced by anonymous_namespace{ForeignStorageMgr.cpp}::filter_metadata_by_leaf(), foreign_storage::CachingForeignStorageMgr::getChunkMetadataVecForKeyPrefix(), key_does_not_shard_to_leaf(), and foreign_storage::CachingForeignStorageMgr::refreshTableInCache().

126  {
131 }
int get_fragment(const ChunkKey &key)
Definition: types.h:52
int32_t g_distributed_leaf_idx
Definition: Catalog.cpp:98
bool is_aggregator()
Definition: distributed.cpp:33
int32_t g_distributed_num_leaves
Definition: Catalog.cpp:99
#define CHECK(condition)
Definition: Logger.h:222
bool is_distributed()
Definition: distributed.cpp:21

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const import_export::CopyParams copy_params,
const bool  size_known,
const size_t  file_size 
)

Gets the appropriate buffer size to be used when processing file(s).

Definition at line 241 of file AbstractTextFileDataWrapper.cpp.

References import_export::CopyParams::buffer_size.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), and foreign_storage::AbstractTextFileDataWrapper::populateChunks().

243  {
244  size_t buffer_size = copy_params.buffer_size;
245  if (size_known && file_size < buffer_size) {
246  buffer_size = file_size + 1; // +1 for end of line character, if missing
247  }
248  return buffer_size;
249 }
size_t file_size(const int fd)
Definition: heavyai_fs.cpp:33

+ Here is the caller graph for this function:

size_t foreign_storage::get_buffer_size ( const FileRegions &  file_regions)

Definition at line 251 of file AbstractTextFileDataWrapper.cpp.

References CHECK.

251  {
252  size_t buffer_size = 0;
253  for (const auto& file_region : file_regions) {
254  buffer_size = std::max(buffer_size, file_region.region_size);
255  }
256  CHECK(buffer_size);
257  return buffer_size;
258 }
#define CHECK(condition)
Definition: Logger.h:222
const parquet::ColumnDescriptor * foreign_storage::get_column_descriptor ( const parquet::arrow::FileReader *  reader,
const int  logical_column_index 
)

Definition at line 46 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

48  {
49  return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
50 }

+ Here is the caller graph for this function:

std::set<ChunkKey> foreign_storage::get_column_key_set ( const ChunkKey destination_chunk_key)

Referenced by foreign_storage::CachingForeignStorageMgr::fetchBuffer(), foreign_storage::CachingForeignStorageMgr::getOptionalKeysWithinSizeLimit(), and foreign_storage::CachingForeignStorageMgr::getRequiredBuffersSize().

+ Here is the caller graph for this function:

std::vector<ChunkKey> foreign_storage::get_column_key_vec ( const ChunkKey destination_chunk_key)
std::string foreign_storage::get_db_name ( int32_t  db_id)

Definition at line 31 of file InternalSystemDataWrapper.cpp.

References Catalog_Namespace::DBMetadata::dbName, Catalog_Namespace::SysCatalog::instance(), and kDeletedValueIndicator.

Referenced by foreign_storage::anonymous_namespace{InternalCatalogDataWrapper.cpp}::populate_import_buffers_for_catalog_dashboards(), foreign_storage::anonymous_namespace{InternalCatalogDataWrapper.cpp}::populate_import_buffers_for_catalog_permissions(), foreign_storage::anonymous_namespace{InternalCatalogDataWrapper.cpp}::populate_import_buffers_for_catalog_tables(), foreign_storage::anonymous_namespace{InternalCatalogDataWrapper.cpp}::populate_import_buffers_for_catalog_users(), foreign_storage::anonymous_namespace{InternalMemoryStatsDataWrapper.cpp}::populate_import_buffers_for_memory_details(), and foreign_storage::anonymous_namespace{InternalStorageStatsDataWrapper.cpp}::populate_import_buffers_for_storage_details().

31  {
33  auto& sys_catalog = Catalog_Namespace::SysCatalog::instance();
34  if (sys_catalog.getMetadataForDBById(db_id, db_metadata)) {
35  return db_metadata.dbName;
36  } else {
37  // Database has been deleted.
39  }
40 }
constexpr const char * kDeletedValueIndicator
static SysCatalog & instance()
Definition: SysCatalog.h:337

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const foreign_storage::ForeignTable & foreign_storage::get_foreign_table_for_key ( const ChunkKey key)

Definition at line 99 of file FsiChunkUtils.cpp.

References CHECK, get_table_prefix(), Catalog_Namespace::SysCatalog::getCatalog(), and Catalog_Namespace::SysCatalog::instance().

Referenced by is_append_table_chunk_key(), foreign_storage::anonymous_namespace{CachingForeignStorageMgr.cpp}::is_in_memory_system_table_chunk_key(), is_replicated_table_chunk_key(), and is_system_table_chunk_key().

99  {
100  auto [db_id, tb_id] = get_table_prefix(key);
101  auto catalog = Catalog_Namespace::SysCatalog::instance().getCatalog(db_id);
102  CHECK(catalog);
103  auto table = catalog->getForeignTable(tb_id);
104  CHECK(table);
105  return *table;
106 }
static SysCatalog & instance()
Definition: SysCatalog.h:337
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
std::pair< int, int > get_table_prefix(const ChunkKey &key)
Definition: types.h:62
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_max_chunk_size ( const ChunkKey key)

Referenced by foreign_storage::CachingForeignStorageMgr::getBufferSize().

+ Here is the caller graph for this function:

template<typename D , std::enable_if_t< std::is_integral< D >::value, int > = 0>
std::pair< D, D > foreign_storage::get_min_max_bounds ( )
inline

Definition at line 31 of file SharedMetadataValidator.h.

31  {
32  static_assert(std::is_signed<D>::value,
33  "'get_min_max_bounds' is only valid for signed types");
34  return {get_null_value<D>() + 1, std::numeric_limits<D>::max()};
35 }
std::optional<ParseBufferRequest> foreign_storage::get_next_metadata_scan_request ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets the next metadata scan request object from the pending requests queue. An empty optional is returned if there are no further requests to be processed.

Definition at line 455 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::MetadataScanMultiThreadingParams::continue_processing, foreign_storage::MetadataScanMultiThreadingParams::pending_requests, foreign_storage::MetadataScanMultiThreadingParams::pending_requests_condition, and foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex.

Referenced by scan_metadata().

456  {
457  std::unique_lock<std::mutex> pending_requests_lock(
458  multi_threading_params.pending_requests_mutex);
459  multi_threading_params.pending_requests_condition.wait(
460  pending_requests_lock, [&multi_threading_params] {
461  return !multi_threading_params.pending_requests.empty() ||
462  !multi_threading_params.continue_processing;
463  });
464  if (multi_threading_params.pending_requests.empty()) {
465  return {};
466  }
467  auto request = std::move(multi_threading_params.pending_requests.front());
468  multi_threading_params.pending_requests.pop();
469  pending_requests_lock.unlock();
470  multi_threading_params.pending_requests_condition.notify_all();
471  return std::move(request);
472 }

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_integral< V >::value, int > = 0>
V foreign_storage::get_null_value ( )
inline

Definition at line 21 of file SharedMetadataValidator.h.

21  {
22  return inline_int_null_value<V>();
23 }
std::pair< int, int > foreign_storage::get_parquet_table_size ( const ReaderPtr &  reader)

Definition at line 39 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), foreign_storage::LazyParquetChunkLoader::loadRowGroups(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

39  {
40  auto file_metadata = reader->parquet_reader()->metadata();
41  const auto num_row_groups = file_metadata->num_row_groups();
42  const auto num_columns = file_metadata->num_columns();
43  return std::make_pair(num_row_groups, num_columns);
44 }

+ Here is the caller graph for this function:

parquet::Type::type foreign_storage::get_physical_type ( ReaderPtr &  reader,
const int  logical_column_index 
)

Definition at line 52 of file ParquetShared.cpp.

Referenced by anonymous_namespace{ArrowResultSetConverter.cpp}::get_arrow_type(), and ArrowResultSetConverter::initializeColumnBuilder().

52  {
53  return reader->parquet_reader()
54  ->metadata()
55  ->schema()
56  ->Column(logical_column_index)
57  ->physical_type();
58 }

+ Here is the caller graph for this function:

std::shared_ptr< ChunkMetadata > foreign_storage::get_placeholder_metadata ( const SQLTypeInfo type,
size_t  num_elements 
)

Definition at line 75 of file FsiChunkUtils.cpp.

References SQLTypeInfo::get_elem_type(), SQLTypeInfo::get_size(), Data_Namespace::AbstractBuffer::getEncoder(), Encoder::getMetadata(), Data_Namespace::AbstractBuffer::initEncoder(), SQLTypeInfo::is_array(), and SQLTypeInfo::is_varlen_indeed().

Referenced by foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::add_placeholder_metadata(), foreign_storage::InternalSystemDataWrapper::populateChunkMetadata(), and foreign_storage::AbstractTextFileDataWrapper::updateRolledOffChunks().

76  {
77  ForeignStorageBuffer empty_buffer;
78  // Use default encoder metadata as in parquet wrapper
79  empty_buffer.initEncoder(type);
80 
81  auto chunk_metadata = empty_buffer.getEncoder()->getMetadata(type);
82  chunk_metadata->numElements = num_elements;
83 
84  if (!type.is_varlen_indeed()) {
85  chunk_metadata->numBytes = type.get_size() * num_elements;
86  }
87  // min/max not set by default for arrays, so get from elem type encoder
88  if (type.is_array()) {
89  ForeignStorageBuffer scalar_buffer;
90  scalar_buffer.initEncoder(type.get_elem_type());
91  auto scalar_metadata = scalar_buffer.getEncoder()->getMetadata(type.get_elem_type());
92  chunk_metadata->chunkStats.min = scalar_metadata->chunkStats.min;
93  chunk_metadata->chunkStats.max = scalar_metadata->chunkStats.max;
94  }
95 
96  return chunk_metadata;
97 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:339
void initEncoder(const SQLTypeInfo &tmp_sql_type)
virtual void getMetadata(const std::shared_ptr< ChunkMetadata > &chunkMetadata)
Definition: Encoder.cpp:227
bool is_varlen_indeed() const
Definition: sqltypes.h:542
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:865
bool is_array() const
Definition: sqltypes.h:518

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ParseBufferRequest foreign_storage::get_request_from_pool ( MetadataScanMultiThreadingParams &  multi_threading_params)

Gets a request from the metadata scan request pool.

Definition at line 692 of file AbstractTextFileDataWrapper.cpp.

References CHECK, foreign_storage::MetadataScanMultiThreadingParams::request_pool, foreign_storage::MetadataScanMultiThreadingParams::request_pool_condition, and foreign_storage::MetadataScanMultiThreadingParams::request_pool_mutex.

Referenced by dispatch_metadata_scan_requests().

693  {
694  std::unique_lock<std::mutex> request_pool_lock(
695  multi_threading_params.request_pool_mutex);
696  multi_threading_params.request_pool_condition.wait(
697  request_pool_lock,
698  [&multi_threading_params] { return !multi_threading_params.request_pool.empty(); });
699  auto request = std::move(multi_threading_params.request_pool.front());
700  multi_threading_params.request_pool.pop();
701  request_pool_lock.unlock();
702  CHECK(request.buffer);
703  return request;
704 }
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the caller graph for this function:

std::unique_ptr< ColumnDescriptor > foreign_storage::get_sub_type_column_descriptor ( const ColumnDescriptor column)

Definition at line 76 of file ParquetShared.cpp.

References ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, SQLTypeInfo::get_elem_type(), SQLTypeInfo::set_size(), and ColumnDescriptor::tableId.

Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_array_encoder(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_array_mapping().

77  {
78  auto column_type = column->columnType.get_elem_type();
79  if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
80  column_type.set_size(4); // override default size of -1
81  }
82  return std::make_unique<ColumnDescriptor>(
83  column->tableId, column->columnId, column->columnName, column_type);
84 }
void set_size(int s)
Definition: sqltypes.h:437
SQLTypeInfo columnType
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:865
std::string columnName

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::string foreign_storage::get_table_name ( int32_t  db_id,
int32_t  table_id 
)

Definition at line 42 of file InternalSystemDataWrapper.cpp.

References CHECK, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), and kDeletedValueIndicator.

Referenced by anonymous_namespace{Execute.cpp}::checkWorkUnitWatchdog(), foreign_storage::anonymous_namespace{InternalMemoryStatsDataWrapper.cpp}::populate_import_buffers_for_memory_details(), and foreign_storage::anonymous_namespace{InternalStorageStatsDataWrapper.cpp}::populate_import_buffers_for_storage_details().

42  {
44  CHECK(catalog);
45  auto table_name = catalog->getTableName(table_id);
46  if (table_name.has_value()) {
47  return table_name.value();
48  } else {
49  // It is possible for the table to be concurrently deleted while querying the system
50  // table.
52  }
53 }
constexpr const char * kDeletedValueIndicator
static SysCatalog & instance()
Definition: SysCatalog.h:337
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const bool  size_known,
const size_t  file_size,
const size_t  buffer_size 
)

Gets the appropriate number of threads to be used for concurrent processing within the data wrapper.

Definition at line 264 of file AbstractTextFileDataWrapper.cpp.

References CHECK_GT, and import_export::CopyParams::threads.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata(), and foreign_storage::AbstractTextFileDataWrapper::populateChunks().

267  {
268  size_t thread_count = copy_params.threads;
269  if (thread_count == 0) {
270  thread_count = std::thread::hardware_concurrency();
271  }
272  if (size_known && file_size > 0) {
273  size_t num_buffers_in_file = (file_size + buffer_size - 1) / buffer_size;
274  if (num_buffers_in_file < thread_count) {
275  thread_count = num_buffers_in_file;
276  }
277  }
278  CHECK_GT(thread_count, static_cast<size_t>(0));
279  return thread_count;
280 }
#define CHECK_GT(x, y)
Definition: Logger.h:234
size_t file_size(const int fd)
Definition: heavyai_fs.cpp:33

+ Here is the caller graph for this function:

size_t foreign_storage::get_thread_count ( const import_export::CopyParams copy_params,
const FileRegions &  file_regions 
)

Definition at line 282 of file AbstractTextFileDataWrapper.cpp.

References CHECK_GT, and import_export::CopyParams::threads.

283  {
284  size_t thread_count = copy_params.threads;
285  if (thread_count == 0) {
286  thread_count =
287  std::min<size_t>(std::thread::hardware_concurrency(), file_regions.size());
288  }
289  CHECK_GT(thread_count, static_cast<size_t>(0));
290  return thread_count;
291 }
#define CHECK_GT(x, y)
Definition: Logger.h:234
void foreign_storage::get_value ( const rapidjson::Value &  json_val,
FileRegion &  file_region 
)

Definition at line 44 of file CsvShared.cpp.

References CHECK, foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::json_utils::get_value_from_object(), foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

44  {
45  CHECK(json_val.IsObject());
47  json_val, file_region.first_row_file_offset, "first_row_file_offset");
49  json_val, file_region.first_row_index, "first_row_index");
50  json_utils::get_value_from_object(json_val, file_region.region_size, "region_size");
51  json_utils::get_value_from_object(json_val, file_region.row_count, "row_count");
52  if (json_val.HasMember("filename")) {
53  json_utils::get_value_from_object(json_val, file_region.filename, "filename");
54  }
55 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:164
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

void foreign_storage::get_value ( const rapidjson::Value &  json_val,
RowGroupInterval &  value 
)

Definition at line 669 of file ParquetDataWrapper.cpp.

References CHECK, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, foreign_storage::json_utils::get_value_from_object(), and foreign_storage::RowGroupInterval::start_index.

669  {
670  CHECK(json_val.IsObject());
671  json_utils::get_value_from_object(json_val, value.file_path, "file_path");
672  json_utils::get_value_from_object(json_val, value.start_index, "start_index");
673  json_utils::get_value_from_object(json_val, value.end_index, "end_index");
674 }
void get_value_from_object(const rapidjson::Value &object, T &value, const std::string &name)
Definition: FsiJsonUtils.h:164
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the call graph for this function:

void foreign_storage::init_chunk_for_column ( const ChunkKey chunk_key,
const std::map< ChunkKey, std::shared_ptr< ChunkMetadata >> &  chunk_metadata_map,
const std::map< ChunkKey, AbstractBuffer * > &  buffers,
Chunk_NS::Chunk chunk 
)

Definition at line 21 of file FsiChunkUtils.cpp.

References CHECK, CHECK_EQ, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_DB_IDX, CHUNK_KEY_TABLE_IDX, Catalog_Namespace::SysCatalog::getCatalog(), Chunk_NS::Chunk::initEncoder(), Catalog_Namespace::SysCatalog::instance(), Data_Namespace::AbstractBuffer::reserve(), Chunk_NS::Chunk::setBuffer(), Chunk_NS::Chunk::setColumnDesc(), Chunk_NS::Chunk::setIndexBuffer(), Chunk_NS::Chunk::setPinnable(), Data_Namespace::AbstractBuffer::size(), and UNREACHABLE.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMapForColumns().

25  {
26  auto catalog =
28  CHECK(catalog);
29 
30  ChunkKey data_chunk_key = chunk_key;
31  AbstractBuffer* data_buffer = nullptr;
32  AbstractBuffer* index_buffer = nullptr;
33  const auto column = catalog->getMetadataForColumn(chunk_key[CHUNK_KEY_TABLE_IDX],
34  chunk_key[CHUNK_KEY_COLUMN_IDX]);
35 
36  if (column->columnType.is_varlen_indeed()) {
37  data_chunk_key.push_back(1);
38  ChunkKey index_chunk_key = chunk_key;
39  index_chunk_key.push_back(2);
40 
41  CHECK(buffers.find(data_chunk_key) != buffers.end());
42  CHECK(buffers.find(index_chunk_key) != buffers.end());
43 
44  data_buffer = buffers.find(data_chunk_key)->second;
45  index_buffer = buffers.find(index_chunk_key)->second;
46  CHECK_EQ(data_buffer->size(), static_cast<size_t>(0));
47  CHECK_EQ(index_buffer->size(), static_cast<size_t>(0));
48 
49  size_t index_offset_size{0};
50  if (column->columnType.is_string() || column->columnType.is_geometry()) {
51  index_offset_size = sizeof(StringOffsetT);
52  } else if (column->columnType.is_array()) {
53  index_offset_size = sizeof(ArrayOffsetT);
54  } else {
55  UNREACHABLE();
56  }
57  CHECK(chunk_metadata_map.find(data_chunk_key) != chunk_metadata_map.end());
58  index_buffer->reserve(index_offset_size *
59  (chunk_metadata_map.at(data_chunk_key)->numElements + 1));
60  } else {
61  data_chunk_key = chunk_key;
62  CHECK(buffers.find(data_chunk_key) != buffers.end());
63  data_buffer = buffers.find(data_chunk_key)->second;
64  }
65  CHECK(chunk_metadata_map.find(data_chunk_key) != chunk_metadata_map.end());
66  data_buffer->reserve(chunk_metadata_map.at(data_chunk_key)->numBytes);
67 
68  chunk.setPinnable(false);
69  chunk.setColumnDesc(column);
70  chunk.setBuffer(data_buffer);
71  chunk.setIndexBuffer(index_buffer);
72  chunk.initEncoder();
73 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
void setPinnable(bool pinnable)
Definition: Chunk.h:63
std::vector< int > ChunkKey
Definition: types.h:36
void setIndexBuffer(AbstractBuffer *ib)
Definition: Chunk.h:152
#define CHUNK_KEY_DB_IDX
Definition: types.h:38
#define UNREACHABLE()
Definition: Logger.h:266
void setBuffer(AbstractBuffer *b)
Definition: Chunk.h:150
int32_t StringOffsetT
Definition: sqltypes.h:1113
static SysCatalog & instance()
Definition: SysCatalog.h:337
#define CHUNK_KEY_TABLE_IDX
Definition: types.h:39
An AbstractBuffer is a unit of data management for a data manager.
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
int32_t ArrayOffsetT
Definition: sqltypes.h:1114
void initEncoder()
Definition: Chunk.cpp:284
#define CHECK(condition)
Definition: Logger.h:222
void setColumnDesc(const ColumnDescriptor *cd)
Definition: Chunk.h:67
#define CHUNK_KEY_COLUMN_IDX
Definition: types.h:40
virtual void reserve(size_t num_bytes)=0

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_append_table_chunk_key ( const ChunkKey chunk_key)

Definition at line 116 of file FsiChunkUtils.cpp.

References get_foreign_table_for_key(), and foreign_storage::ForeignTable::isAppendMode().

Referenced by foreign_storage::CachingForeignStorageMgr::refreshTableInCache().

116  {
117  return get_foreign_table_for_key(chunk_key).isAppendMode();
118 }
bool isAppendMode() const
Checks if the table is in append mode.
const foreign_storage::ForeignTable & get_foreign_table_for_key(const ChunkKey &key)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_replicated_table_chunk_key ( const ChunkKey chunk_key)

Definition at line 112 of file FsiChunkUtils.cpp.

References get_foreign_table_for_key(), and table_is_replicated().

Referenced by is_shardable_key().

112  {
114 }
const foreign_storage::ForeignTable & get_foreign_table_for_key(const ChunkKey &key)
bool table_is_replicated(const TableDescriptor *td)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_s3_uri ( const std::string &  file_path)

Definition at line 36 of file ForeignDataWrapperFactory.cpp.

Referenced by foreign_storage::ForeignDataWrapperFactory::createForeignServerProxy(), foreign_storage::ForeignDataWrapperFactory::createForeignTableProxy(), import_export::ForeignDataImporter::import(), and import_export::ForeignDataImporter::importGeneralS3().

36  {
37  const std::string s3_prefix = "s3://";
38  return file_path.find(s3_prefix) != std::string::npos;
39 }

+ Here is the caller graph for this function:

bool foreign_storage::is_shardable_key ( const ChunkKey key)

Definition at line 120 of file FsiChunkUtils.cpp.

References dist::is_aggregator(), dist::is_distributed(), is_replicated_table_chunk_key(), and is_system_table_chunk_key().

Referenced by anonymous_namespace{ForeignStorageMgr.cpp}::filter_metadata_by_leaf(), foreign_storage::CachingForeignStorageMgr::getChunkMetadataVecForKeyPrefix(), key_does_not_shard_to_leaf(), and foreign_storage::CachingForeignStorageMgr::refreshTableInCache().

120  {
121  return (dist::is_distributed() && !dist::is_aggregator() &&
123 }
bool is_system_table_chunk_key(const ChunkKey &chunk_key)
bool is_replicated_table_chunk_key(const ChunkKey &chunk_key)
bool is_aggregator()
Definition: distributed.cpp:33
bool is_distributed()
Definition: distributed.cpp:21

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_system_table_chunk_key ( const ChunkKey chunk_key)

Definition at line 108 of file FsiChunkUtils.cpp.

References get_foreign_table_for_key(), and TableDescriptor::is_system_table.

Referenced by is_shardable_key().

108  {
109  return get_foreign_table_for_key(chunk_key).is_system_table;
110 }
const foreign_storage::ForeignTable & get_foreign_table_for_key(const ChunkKey &key)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::is_table_enabled_on_node ( const ChunkKey key)

Referenced by foreign_storage::CachingForeignStorageMgr::getChunkMetadataVecFromDataWrapper().

+ Here is the caller graph for this function:

bool foreign_storage::is_valid_source_type ( const import_export::CopyParams copy_params)

Verifies whether source_type is valid.

Definition at line 106 of file ForeignDataWrapperFactory.cpp.

References import_export::kDelimitedFile, import_export::kParquetFile, import_export::kRegexParsedFile, and import_export::CopyParams::source_type.

Referenced by foreign_storage::ForeignDataWrapperFactory::createForeignServerProxy(), foreign_storage::ForeignDataWrapperFactory::createForeignTableProxy(), and import_export::ForeignDataImporter::importGeneral().

+ Here is the caller graph for this function:

bool foreign_storage::key_does_not_shard_to_leaf ( const ChunkKey key)

Definition at line 133 of file FsiChunkUtils.cpp.

References fragment_maps_to_leaf(), and is_shardable_key().

Referenced by cache_blocks(), populate_string_dictionary(), and anonymous_namespace{RelAlgExecutor.cpp}::set_parallelism_hints().

133  {
134  return (is_shardable_key(key) && !fragment_maps_to_leaf(key));
135 }
bool fragment_maps_to_leaf(const ChunkKey &key)
bool is_shardable_key(const ChunkKey &key)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

UniqueReaderPtr foreign_storage::open_parquet_table ( const std::string &  file_path,
std::shared_ptr< arrow::fs::FileSystem > &  file_system 
)

Definition at line 26 of file ParquetShared.cpp.

Referenced by foreign_storage::FileReaderMap::getOrInsert(), foreign_storage::FileReaderMap::insert(), and foreign_storage::LazyParquetChunkLoader::loadRowGroups().

27  {
28  UniqueReaderPtr reader;
29  auto file_result = file_system->OpenInputFile(file_path);
30  if (!file_result.ok()) {
31  throw std::runtime_error{"Unable to access " + file_system->type_name() + " file: " +
32  file_path + ". " + file_result.status().message()};
33  }
34  auto infile = file_result.ValueOrDie();
35  PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
36  return reader;
37 }
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
Definition: ParquetShared.h:32

+ Here is the caller graph for this function:

ParseFileRegionResult foreign_storage::parse_file_regions ( const FileRegions &  file_regions,
const size_t  start_index,
const size_t  end_index,
FileReader &  file_reader,
ParseBufferRequest &  parse_file_request,
const std::map< int, Chunk_NS::Chunk > &  column_id_to_chunk_map,
const TextFileBufferParser &  parser 
)

Parses a set of file regions, given a handle to the file and a range of indexes for the file regions to be parsed.

Definition at line 194 of file AbstractTextFileDataWrapper.cpp.

References foreign_storage::ParseBufferRequest::begin_pos, foreign_storage::ParseBufferRequest::buffer, foreign_storage::ParseBufferRequest::buffer_size, CHECK, CHECK_EQ, foreign_storage::ParseBufferResult::column_id_to_data_blocks_map, DEBUG_TIMER, foreign_storage::ParseBufferRequest::end_pos, foreign_storage::ParseBufferRequest::file_offset, foreign_storage::ParseFileRegionResult::file_offset, foreign_storage::ParseBufferRequest::first_row_index, foreign_storage::ParseBufferRequest::getTableName(), foreign_storage::TextFileBufferParser::parseBuffer(), foreign_storage::ParseBufferRequest::process_row_count, foreign_storage::FileReader::readRegion(), foreign_storage::ParseBufferResult::rejected_rows, run_benchmark_import::result, foreign_storage::ParseBufferResult::row_count, and throw_unexpected_number_of_items().

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunks().

201  {
202  auto timer = DEBUG_TIMER(__func__);
203  ParseFileRegionResult load_file_region_result{};
204  load_file_region_result.file_offset = file_regions[start_index].first_row_file_offset;
205  load_file_region_result.row_count = 0;
206 
207  ParseBufferResult result;
208  for (size_t i = start_index; i <= end_index; i++) {
209  CHECK(file_regions[i].region_size <= parse_file_request.buffer_size);
210  auto read_size = file_reader.readRegion(parse_file_request.buffer.get(),
211  file_regions[i].first_row_file_offset,
212  file_regions[i].region_size);
213  if (file_regions[i].region_size != read_size) {
214  throw_unexpected_number_of_items(file_regions[i].region_size,
215  read_size,
216  "bytes",
217  parse_file_request.getTableName());
218  }
219  parse_file_request.begin_pos = 0;
220  parse_file_request.end_pos = file_regions[i].region_size;
221  parse_file_request.first_row_index = file_regions[i].first_row_index;
222  parse_file_request.file_offset = file_regions[i].first_row_file_offset;
223  parse_file_request.process_row_count = file_regions[i].row_count;
224 
225  result = parser.parseBuffer(parse_file_request, i == end_index);
226  CHECK_EQ(file_regions[i].row_count, result.row_count);
227  for (const auto& rejected_row_index : result.rejected_rows) {
228  load_file_region_result.rejected_row_indices.insert(
229  load_file_region_result.row_count + rejected_row_index);
230  }
231  load_file_region_result.row_count += result.row_count;
232  }
233  load_file_region_result.column_id_to_data_blocks_map =
234  result.column_id_to_data_blocks_map;
235  return load_file_region_result;
236 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
void throw_unexpected_number_of_items(const size_t num_expected, const size_t num_loaded, const std::string &item_type, const std::string &foreign_table_name)
#define CHECK(condition)
Definition: Logger.h:222
#define DEBUG_TIMER(name)
Definition: Logger.h:371

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::vector<size_t> foreign_storage::partition_by_fragment ( const size_t  start_row_index,
const size_t  max_fragment_size,
const size_t  buffer_row_count 
)

Given a start row index, maximum fragment size, and number of rows in a buffer, this function returns a vector indicating how the rows in the buffer should be partitioned in order to fill up available fragment slots while staying within the capacity of fragments.

Definition at line 403 of file AbstractTextFileDataWrapper.cpp.

References CHECK.

Referenced by scan_metadata().

405  {
406  CHECK(buffer_row_count > 0);
407  std::vector<size_t> partitions{};
408  size_t remaining_rows_in_last_fragment;
409  if (start_row_index % max_fragment_size == 0) {
410  remaining_rows_in_last_fragment = 0;
411  } else {
412  remaining_rows_in_last_fragment =
413  max_fragment_size - (start_row_index % max_fragment_size);
414  }
415  if (buffer_row_count <= remaining_rows_in_last_fragment) {
416  partitions.emplace_back(buffer_row_count);
417  } else {
418  if (remaining_rows_in_last_fragment > 0) {
419  partitions.emplace_back(remaining_rows_in_last_fragment);
420  }
421  size_t remaining_buffer_row_count =
422  buffer_row_count - remaining_rows_in_last_fragment;
423  while (remaining_buffer_row_count > 0) {
424  partitions.emplace_back(
425  std::min<size_t>(remaining_buffer_row_count, max_fragment_size));
426  remaining_buffer_row_count -= partitions.back();
427  }
428  }
429  return partitions;
430 }
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the caller graph for this function:

template<typename T >
auto foreign_storage::partition_for_threads ( const std::set< T > &  items,
size_t  max_threads 
)

Definition at line 41 of file FsiChunkUtils.h.

Referenced by create_futures_for_workers(), and foreign_storage::LazyParquetChunkLoader::metadataScan().

41  {
42  const size_t items_per_thread = (items.size() + (max_threads - 1)) / max_threads;
43  std::list<std::set<T>> items_by_thread;
44  auto i = 0U;
45  for (auto item : items) {
46  if (i++ % items_per_thread == 0) {
47  items_by_thread.emplace_back(std::set<T>{});
48  }
49  items_by_thread.back().emplace(item);
50  }
51  return items_by_thread;
52 }

+ Here is the caller graph for this function:

template<typename T >
auto foreign_storage::partition_for_threads ( const std::vector< T > &  items,
size_t  max_threads 
)

Definition at line 60 of file FsiChunkUtils.h.

60  {
61  const size_t items_per_thread = (items.size() + (max_threads - 1)) / max_threads;
62  std::list<std::vector<T>> items_by_thread;
63  auto i = 0U;
64  for (auto item : items) {
65  if (i++ % items_per_thread == 0) {
66  items_by_thread.emplace_back(std::vector<T>{});
67  }
68  items_by_thread.back().emplace_back(item);
69  }
70  return items_by_thread;
71 }
void foreign_storage::populate_string_dictionary ( const int32_t  table_id,
const int32_t  col_id,
const Catalog_Namespace::Catalog catalog 
)

Definition at line 205 of file Execute.cpp.

References CHECK, Data_Namespace::CPU_LEVEL, Chunk_NS::Chunk::getChunk(), Catalog_Namespace::Catalog::getDatabaseId(), Catalog_Namespace::Catalog::getDataMgr(), Catalog_Namespace::Catalog::getMetadataForColumn(), Catalog_Namespace::Catalog::getMetadataForTable(), anonymous_namespace{Execute.cpp}::is_empty_table(), and key_does_not_shard_to_leaf().

Referenced by anonymous_namespace{RelAlgExecutor.cpp}::prepare_string_dictionaries().

207  {
208  if (const auto foreign_table = dynamic_cast<const ForeignTable*>(
209  catalog.getMetadataForTable(table_id, false))) {
210  const auto col_desc = catalog.getMetadataForColumn(table_id, col_id);
211  if (col_desc->columnType.is_dict_encoded_type()) {
212  auto& fragmenter = foreign_table->fragmenter;
213  CHECK(fragmenter != nullptr);
214  if (is_empty_table(fragmenter.get())) {
215  return;
216  }
217  for (const auto& frag : fragmenter->getFragmentsForQuery().fragments) {
218  ChunkKey chunk_key = {catalog.getDatabaseId(), table_id, col_id, frag.fragmentId};
219  // If the key is sharded across leaves, only populate fragments that are sharded
220  // to this leaf.
221  if (key_does_not_shard_to_leaf(chunk_key)) {
222  continue;
223  }
224 
225  const ChunkMetadataMap& metadata_map = frag.getChunkMetadataMap();
226  CHECK(metadata_map.find(col_id) != metadata_map.end());
227  if (auto& meta = metadata_map.at(col_id); meta->isPlaceholder()) {
228  // When this goes out of scope it will stay in CPU cache but become
229  // evictable
230  auto chunk = Chunk_NS::Chunk::getChunk(col_desc,
231  &(catalog.getDataMgr()),
232  chunk_key,
234  0,
235  0,
236  0);
237  }
238  }
239  }
240  }
241 }
std::vector< int > ChunkKey
Definition: types.h:36
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:243
bool is_empty_table(Fragmenter_Namespace::AbstractFragmenter *fragmenter)
Definition: Execute.cpp:195
std::map< int, std::shared_ptr< ChunkMetadata >> ChunkMetadataMap
bool key_does_not_shard_to_leaf(const ChunkKey &key)
const ColumnDescriptor * getMetadataForColumn(int tableId, const std::string &colName) const
int getDatabaseId() const
Definition: Catalog.h:298
#define CHECK(condition)
Definition: Logger.h:222
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
static std::shared_ptr< Chunk > getChunk(const ColumnDescriptor *cd, DataMgr *data_mgr, const ChunkKey &key, const MemoryLevel mem_level, const int deviceId, const size_t num_bytes, const size_t num_elems, const bool pinnable=true)
Definition: Chunk.cpp:31

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::process_data_blocks ( MetadataScanMultiThreadingParams &  multi_threading_params,
int  fragment_id,
const ParseBufferRequest &  request,
ParseBufferResult &  result,
std::map< int, const ColumnDescriptor * > &  column_by_id,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map 
)

Updates metadata encapsulated in encoders for all table columns, given new data blocks obtained from parsing a new set of rows in a file buffer. If the cache is available, also appends the data blocks to chunks in the cache.

Definition at line 571 of file AbstractTextFileDataWrapper.cpp.

References add_file_region(), cache_blocks(), foreign_storage::MetadataScanMultiThreadingParams::cached_chunks, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers, foreign_storage::MetadataScanMultiThreadingParams::chunk_encoder_buffers_mutex, foreign_storage::ParseBufferResult::column_id_to_data_blocks_map, foreign_storage::ParseBufferRequest::db_id, foreign_storage::MetadataScanMultiThreadingParams::disable_cache, foreign_storage::ParseBufferRequest::first_row_index, foreign_storage::ParseBufferRequest::getFilePath(), foreign_storage::ParseBufferRequest::getMaxFragRows(), foreign_storage::ParseBufferRequest::getTableId(), foreign_storage::ParseBufferResult::row_count, and update_stats().

Referenced by scan_metadata().

576  {
577  std::lock_guard<std::mutex> lock(multi_threading_params.chunk_encoder_buffers_mutex);
578  // File regions should be added in same order as appendData
579  add_file_region(fragment_id_to_file_regions_map,
580  fragment_id,
581  request.first_row_index,
582  result,
583  request.getFilePath());
584 
585  for (auto& [column_id, data_block] : result.column_id_to_data_blocks_map) {
586  ChunkKey chunk_key{request.db_id, request.getTableId(), column_id, fragment_id};
587  const auto column = column_by_id[column_id];
588  if (column->columnType.is_varlen_indeed()) {
589  chunk_key.emplace_back(1);
590  }
591  if (multi_threading_params.chunk_encoder_buffers.find(chunk_key) ==
592  multi_threading_params.chunk_encoder_buffers.end()) {
593  multi_threading_params.chunk_encoder_buffers[chunk_key] =
594  std::make_unique<ForeignStorageBuffer>();
595  multi_threading_params.chunk_encoder_buffers[chunk_key]->initEncoder(
596  column->columnType);
597  }
598  update_stats(multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder(),
599  column->columnType,
600  data_block,
601  result.row_count);
602  size_t num_elements = multi_threading_params.chunk_encoder_buffers[chunk_key]
603  ->getEncoder()
604  ->getNumElems() +
605  result.row_count;
606  multi_threading_params.chunk_encoder_buffers[chunk_key]->getEncoder()->setNumElems(
607  num_elements);
608  cache_blocks(
609  multi_threading_params.cached_chunks,
610  data_block,
611  result.row_count,
612  chunk_key,
613  column,
614  (num_elements - result.row_count) == 0, // Is the first block added to this chunk
615  num_elements == request.getMaxFragRows(), // Is the last block for this chunk
616  multi_threading_params.disable_cache);
617  }
618 }
void cache_blocks(std::map< ChunkKey, Chunk_NS::Chunk > &cached_chunks, DataBlockPtr data_block, size_t row_count, ChunkKey &chunk_key, const ColumnDescriptor *column, bool is_first_block, bool is_last_block, bool disable_cache)
std::vector< int > ChunkKey
Definition: types.h:36
void update_stats(Encoder *encoder, const SQLTypeInfo &column_type, DataBlockPtr data_block, const size_t row_count)
void add_file_region(std::map< int, FileRegions > &fragment_id_to_file_regions_map, int fragment_id, size_t first_row_index, const ParseBufferResult &result, const std::string &file_path)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table ( Catalog_Namespace::Catalog catalog,
const std::string &  table_name,
const bool  evict_cached_entries 
)

Definition at line 90 of file ForeignTableRefresh.cpp.

References CHECK, shared::contains(), StorageType::FOREIGN_TABLE, dist::is_leaf_node(), Catalog_Namespace::kAggregatorOnlySystemTables, and refresh_foreign_table_unlocked().

Referenced by RefreshForeignTablesCommand::execute(), and foreign_storage::ForeignTableRefreshScheduler::start().

92  {
93  if (dist::is_leaf_node() &&
95  // Skip aggregator only system tables on leaf nodes.
96  return;
97  }
98  auto table_lock =
99  std::make_unique<lockmgr::TableSchemaLockContainer<lockmgr::WriteLock>>(
101  catalog, table_name, false));
102 
103  const TableDescriptor* td = (*table_lock)();
104  if (td->storageType != StorageType::FOREIGN_TABLE) {
105  throw std::runtime_error{
106  table_name +
107  " is not a foreign table. Refreshes are applicable to only foreign tables."};
108  }
109 
110  auto foreign_table = dynamic_cast<const ForeignTable*>(td);
111  CHECK(foreign_table);
112  refresh_foreign_table_unlocked(catalog, *foreign_table, evict_cached_entries);
113 }
bool contains(const T &container, const U &element)
Definition: misc.h:195
bool is_leaf_node()
Definition: distributed.cpp:29
#define CHECK(condition)
Definition: Logger.h:222
static const std::array< std::string, 4 > kAggregatorOnlySystemTables
Definition: Catalog.h:120
void refresh_foreign_table_unlocked(Catalog_Namespace::Catalog &catalog, const ForeignTable &td, const bool evict_cached_entries)
static constexpr char const * FOREIGN_TABLE

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::refresh_foreign_table_unlocked ( Catalog_Namespace::Catalog catalog,
const ForeignTable &  td,
const bool  evict_cached_entries 
)

Definition at line 34 of file ForeignTableRefresh.cpp.

References CHUNK_KEY_FRAGMENT_IDX, foreign_storage::anonymous_namespace{ForeignTableRefresh.cpp}::clear_cpu_and_gpu_cache(), Catalog_Namespace::DBMetadata::dbId, Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Catalog_Namespace::Catalog::getForeignTable(), PostEvictionRefreshException::getOriginalException(), CacheInvalidator< CACHE_HOLDING_TYPES >::invalidateCachesByTable(), foreign_storage::ForeignTable::isAppendMode(), Catalog_Namespace::Catalog::removeFragmenterForTable(), TableDescriptor::tableId, and Catalog_Namespace::Catalog::updateForeignTableRefreshTimes().

Referenced by refresh_foreign_table().

36  {
37  auto& data_mgr = catalog.getDataMgr();
38  ChunkKey table_key{catalog.getCurrentDB().dbId, td.tableId};
39  ResultSetCacheInvalidator::invalidateCachesByTable(boost::hash_value(table_key));
40 
41  catalog.removeFragmenterForTable(td.tableId);
42 
43  std::map<ChunkKey, std::shared_ptr<ChunkMetadata>> old_chunk_metadata_by_chunk_key;
44  if (catalog.getForeignTable(td.tableId)->isAppendMode() && !evict_cached_entries) {
45  ChunkMetadataVector metadata_vec;
46  data_mgr.getChunkMetadataVecForKeyPrefix(metadata_vec, table_key);
47  int last_fragment_id = 0;
48  for (const auto& [key, metadata] : metadata_vec) {
49  if (key[CHUNK_KEY_FRAGMENT_IDX] > last_fragment_id) {
50  last_fragment_id = key[CHUNK_KEY_FRAGMENT_IDX];
51  }
52  old_chunk_metadata_by_chunk_key[key] = metadata;
53  }
54  for (const auto& [key, metadata] : metadata_vec) {
55  if (key[CHUNK_KEY_FRAGMENT_IDX] == last_fragment_id) {
56  clear_cpu_and_gpu_cache(data_mgr, key);
57  }
58  }
59  } else {
60  clear_cpu_and_gpu_cache(data_mgr, table_key);
61  }
62 
63  try {
64  data_mgr.getPersistentStorageMgr()->getForeignStorageMgr()->refreshTable(
65  table_key, evict_cached_entries);
66  catalog.updateForeignTableRefreshTimes(td.tableId);
67  } catch (PostEvictionRefreshException& e) {
68  catalog.updateForeignTableRefreshTimes(td.tableId);
69  clear_cpu_and_gpu_cache(data_mgr, table_key);
70  throw e.getOriginalException();
71  } catch (...) {
72  clear_cpu_and_gpu_cache(data_mgr, table_key);
73  throw;
74  }
75 
76  // Delete cached rolled off/updated chunks.
77  if (!old_chunk_metadata_by_chunk_key.empty()) {
78  ChunkMetadataVector new_metadata_vec;
79  data_mgr.getChunkMetadataVecForKeyPrefix(new_metadata_vec, table_key);
80  for (const auto& [key, metadata] : new_metadata_vec) {
81  auto it = old_chunk_metadata_by_chunk_key.find(key);
82  if (it != old_chunk_metadata_by_chunk_key.end() &&
83  it->second->numElements != metadata->numElements) {
84  clear_cpu_and_gpu_cache(data_mgr, key);
85  }
86  }
87  }
88 }
static void invalidateCachesByTable(size_t table_key)
const foreign_storage::ForeignTable * getForeignTable(const std::string &tableName) const
Definition: Catalog.cpp:1883
std::vector< int > ChunkKey
Definition: types.h:36
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:243
#define CHUNK_KEY_FRAGMENT_IDX
Definition: types.h:41
std::runtime_error getOriginalException()
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:242
std::vector< std::pair< ChunkKey, std::shared_ptr< ChunkMetadata >>> ChunkMetadataVector
bool isAppendMode() const
Checks if the table is in append mode.
void removeFragmenterForTable(const int table_id) const
Definition: Catalog.cpp:3965
void clear_cpu_and_gpu_cache(Data_Namespace::DataMgr &data_mgr, const ChunkKey &key_prefix)
void updateForeignTableRefreshTimes(const int32_t table_id)
Definition: Catalog.cpp:5369

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::resize_buffer_if_needed ( std::unique_ptr< char[]> &  buffer,
size_t &  buffer_size,
const size_t  alloc_size 
)

Reallocates the given buffer, and updates buffer_size to match, if the current buffer size is less than the requested allocation size (alloc_size).

Definition at line 725 of file AbstractTextFileDataWrapper.cpp.

References CHECK_LE.

Referenced by dispatch_metadata_scan_requests().

727  {
728  CHECK_LE(buffer_size, alloc_size);
729  if (buffer_size < alloc_size) {
730  buffer = std::make_unique<char[]>(alloc_size);
731  buffer_size = alloc_size;
732  }
733 }
#define CHECK_LE(x, y)
Definition: Logger.h:233

+ Here is the caller graph for this function:

std::vector< Aws::S3::Model::Object > foreign_storage::s3_objects_filter_sort_files ( const std::vector< Aws::S3::Model::Object > &  file_paths,
const shared::FilePathOptions options 
)

Definition at line 22 of file S3FilePathUtil.cpp.

References shared::FilePathOptions::filter_regex, shared::PATHNAME_ORDER_TYPE, foreign_storage::anonymous_namespace{S3FilePathUtil.cpp}::s3_objects_regex_file_filter(), and shared::FilePathOptions::sort_by.

24  {
25  auto result_files =
26  options.filter_regex.has_value()
27  ? s3_objects_regex_file_filter(options.filter_regex.value(), file_paths)
28  : file_paths;
29  // initial lexicographical order ensures a deterministic ordering for files not matching
30  // sort_regex
31  shared::FilePathOptions temp_options;
32  temp_options.sort_by = shared::PATHNAME_ORDER_TYPE;
33  auto initial_file_order = FileOrderS3(temp_options);
34  auto lexi_comp = initial_file_order.getFileComparator();
35  std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
36 
37  auto file_order = FileOrderS3(options);
38  auto comp = file_order.getFileComparator();
39  std::stable_sort(result_files.begin(), result_files.end(), comp);
40  return result_files;
41 }
std::optional< std::string > filter_regex
std::vector< Aws::S3::Model::Object > s3_objects_regex_file_filter(const std::string &pattern, const std::vector< Aws::S3::Model::Object > &objects_list)
const std::string PATHNAME_ORDER_TYPE
std::optional< std::string > sort_by

+ Here is the call graph for this function:

void foreign_storage::scan_metadata ( MetadataScanMultiThreadingParams &  multi_threading_params,
std::map< int, FileRegions > &  fragment_id_to_file_regions_map,
const TextFileBufferParser &  parser 
)

Consumes and processes metadata scan requests from a pending requests queue and updates existing metadata objects based on newly scanned metadata.

Definition at line 637 of file AbstractTextFileDataWrapper.cpp.

References add_request_to_pool(), foreign_storage::MetadataScanMultiThreadingParams::continue_processing, get_next_metadata_scan_request(), foreign_storage::TextFileBufferParser::parseBuffer(), partition_by_fragment(), foreign_storage::MetadataScanMultiThreadingParams::pending_requests_mutex, process_data_blocks(), and run_benchmark_import::result.

Referenced by foreign_storage::AbstractTextFileDataWrapper::populateChunkMetadata().

639  {
640  std::map<int, const ColumnDescriptor*> column_by_id{};
641  while (true) {
642  auto request_opt = get_next_metadata_scan_request(multi_threading_params);
643  if (!request_opt.has_value()) {
644  break;
645  }
646  auto& request = request_opt.value();
647  try {
648  if (column_by_id.empty()) {
649  for (const auto column : request.getColumns()) {
650  column_by_id[column->columnId] = column;
651  }
652  }
653  auto partitions = partition_by_fragment(
654  request.first_row_index, request.getMaxFragRows(), request.buffer_row_count);
655  request.begin_pos = 0;
656  size_t row_index = request.first_row_index;
657  for (const auto partition : partitions) {
658  request.process_row_count = partition;
659  for (const auto& import_buffer : request.import_buffers) {
660  if (import_buffer != nullptr) {
661  import_buffer->clear();
662  }
663  }
664  auto result = parser.parseBuffer(request, true);
665  int fragment_id = row_index / request.getMaxFragRows();
666  process_data_blocks(multi_threading_params,
667  fragment_id,
668  request,
669  result,
670  column_by_id,
671  fragment_id_to_file_regions_map);
672  row_index += result.row_count;
673  request.begin_pos = result.row_offsets.back() - request.file_offset;
674  }
675  } catch (...) {
676  // Re-add request to pool so we don't block any other threads
677  {
678  std::lock_guard<std::mutex> pending_requests_lock(
679  multi_threading_params.pending_requests_mutex);
680  multi_threading_params.continue_processing = false;
681  }
682  add_request_to_pool(multi_threading_params, request);
683  throw;
684  }
685  add_request_to_pool(multi_threading_params, request);
686  }
687 }
std::vector< size_t > partition_by_fragment(const size_t start_row_index, const size_t max_fragment_size, const size_t buffer_row_count)
void add_request_to_pool(MetadataScanMultiThreadingParams &multi_threading_params, ParseBufferRequest &request)
std::optional< ParseBufferRequest > get_next_metadata_scan_request(MetadataScanMultiThreadingParams &multi_threading_params)
void process_data_blocks(MetadataScanMultiThreadingParams &multi_threading_params, int fragment_id, const ParseBufferRequest &request, ParseBufferResult &result, std::map< int, const ColumnDescriptor * > &column_by_id, std::map< int, FileRegions > &fragment_id_to_file_regions_map)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool foreign_storage::set_comp ( const ChunkKey left,
const ChunkKey right 
)
void foreign_storage::set_node_name ( std::map< std::string, import_export::TypedImportBuffer * > &  import_buffers)

Definition at line 55 of file InternalSystemDataWrapper.cpp.

References g_distributed_leaf_idx, dist::is_leaf_node(), and to_string().

Referenced by foreign_storage::anonymous_namespace{InternalMemoryStatsDataWrapper.cpp}::populate_import_buffers_for_memory_details(), foreign_storage::anonymous_namespace{InternalMemoryStatsDataWrapper.cpp}::populate_import_buffers_for_memory_summary(), and foreign_storage::anonymous_namespace{InternalStorageStatsDataWrapper.cpp}::populate_import_buffers_for_storage_details().

56  {
57  if (import_buffers.find("node") != import_buffers.end()) {
58  if (dist::is_leaf_node()) {
59  std::string leaf_string{"Leaf " + to_string(g_distributed_leaf_idx)};
60  import_buffers["node"]->addString(leaf_string);
61  } else {
62  import_buffers["node"]->addString("Server");
63  }
64  }
65 }
std::string to_string(char const *&&v)
bool is_leaf_node()
Definition: distributed.cpp:29
int32_t g_distributed_leaf_idx
Definition: Catalog.cpp:98

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const FileRegion &  file_region,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 26 of file CsvShared.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::FileRegion::first_row_file_offset, foreign_storage::FileRegion::first_row_index, foreign_storage::FileRegion::region_size, and foreign_storage::FileRegion::row_count.

28  {
29  json_val.SetObject();
31  json_val, file_region.first_row_file_offset, "first_row_file_offset", allocator);
33  json_val, file_region.first_row_index, "first_row_index", allocator);
35  json_val, file_region.region_size, "region_size", allocator);
37  json_val, file_region.row_count, "row_count", allocator);
38  if (file_region.filename.size()) {
40  json_val, file_region.filename, "filename", allocator);
41  }
42 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:149

+ Here is the call graph for this function:

void foreign_storage::set_value ( rapidjson::Value &  json_val,
const RowGroupInterval &  value,
rapidjson::Document::AllocatorType &  allocator 
)

Definition at line 660 of file ParquetDataWrapper.cpp.

References foreign_storage::json_utils::add_value_to_object(), foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, and foreign_storage::RowGroupInterval::start_index.

662  {
663  json_val.SetObject();
664  json_utils::add_value_to_object(json_val, value.file_path, "file_path", allocator);
665  json_utils::add_value_to_object(json_val, value.start_index, "start_index", allocator);
666  json_utils::add_value_to_object(json_val, value.end_index, "end_index", allocator);
667 }
void add_value_to_object(rapidjson::Value &object, const T &value, const std::string &name, rapidjson::Document::AllocatorType &allocator)
Definition: FsiJsonUtils.h:149

+ Here is the call graph for this function:

void foreign_storage::throw_file_access_error ( const std::string &  file_path,
const std::string &  message 
)
inline

Definition at line 83 of file ForeignStorageException.h.

Referenced by foreign_storage::ParquetImporter::getAllFilePaths().

84  {
85  std::string error_message{"Unable to access file \"" + file_path + "\". " + message};
86  throw ForeignStorageException{error_message};
87 }

+ Here is the caller graph for this function:

void foreign_storage::throw_file_not_found_error ( const std::string &  file_path)
inline

Definition at line 89 of file ForeignStorageException.h.

Referenced by foreign_storage::ParquetImporter::getAllFilePaths().

89  {
90  throw ForeignStorageException{"File or directory \"" + file_path +
91  "\" does not exist."};
92 }

+ Here is the caller graph for this function:

void foreign_storage::throw_number_of_columns_mismatch_error ( size_t  num_table_cols,
size_t  num_file_cols,
const std::string &  file_path 
)
inline

Definition at line 74 of file ForeignStorageException.h.

References to_string().

Referenced by foreign_storage::RegexFileBufferParser::regexMatchColumns(), foreign_storage::anonymous_namespace{CsvFileBufferParser.cpp}::validate_expected_column_count(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().

76  {
77  throw ForeignStorageException{"Mismatched number of logical columns: (expected " +
78  std::to_string(num_table_cols) + " columns, has " +
79  std::to_string(num_file_cols) + "): in file '" +
80  file_path + "'"};
81 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::throw_parquet_metadata_out_of_bounds_error ( const std::string &  min_value,
const std::string &  max_value,
const std::string &  encountered_value 
)
inline

Definition at line 46 of file ParquetMetadataValidator.h.

Referenced by foreign_storage::TimestampBoundsValidator< T >::validateValue(), foreign_storage::IntegralFixedLengthBoundsValidator< T >::validateValue(), foreign_storage::BaseDateBoundsValidator< T, is_in_seconds >::validateValue(), and foreign_storage::FloatPointValidator< T >::validateValue().

49  {
50  std::stringstream error_message;
51  error_message << "Parquet column contains values that are outside the range of the "
52  "HeavyDB column "
53  "type. Consider using a wider column type. Min allowed value: "
54  << min_value << ". Max allowed value: " << max_value
55  << ". Encountered value: " << encountered_value << ".";
56  throw std::runtime_error(error_message.str());
57 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_file_error ( const std::string &  file_path)
inline

Definition at line 67 of file ForeignStorageException.h.

Referenced by foreign_storage::LocalMultiFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

67  {
68  throw ForeignStorageException{
69  "Refresh of foreign table created with \"APPEND\" update type failed as "
70  "file \"" +
71  file_path + "\" was removed."};
72 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_in_file_error ( const std::string &  file_path)
inline

Definition at line 60 of file ForeignStorageException.h.

Referenced by foreign_storage::SingleTextFileReader::checkForMoreRows(), and foreign_storage::ParquetDataWrapper::fetchChunkMetadata().

60  {
61  throw ForeignStorageException{
62  "Refresh of foreign table created with \"APPEND\" update type failed as file "
63  "reduced in size: \"" +
64  file_path + "\""};
65 }

+ Here is the caller graph for this function:

void foreign_storage::throw_removed_row_in_result_set_error ( const std::string &  select_statement)
inline

Definition at line 52 of file ForeignStorageException.h.

52  {
53  throw ForeignStorageException{
54  "Refresh of foreign table created with \"APPEND\" update type failed as result set "
55  "of select statement "
56  "reduced in size: \"" +
57  select_statement + "\""};
58 }
void foreign_storage::throw_s3_compressed_extension ( const std::string &  file_path,
const std::string &  ext_type 
)
inline

Definition at line 101 of file ForeignStorageException.h.

102  {
103  throw ForeignStorageException{
104  "File \"" + file_path + "\" has extension type \"" + ext_type +
105  "\", compressed file formats are not supported by S3 Foreign Tables."};
106 }
void foreign_storage::throw_s3_compressed_mime_type ( const std::string &  file_path,
const std::string &  mime_type 
)
inline

Definition at line 94 of file ForeignStorageException.h.

95  {
96  throw ForeignStorageException{
97  "File \"" + file_path + "\" has mime type \"" + mime_type +
98  "\", compressed file formats are not supported by S3 Foreign Tables."};
99 }
void foreign_storage::throw_unexpected_number_of_items ( const size_t &  num_expected,
const size_t &  num_loaded,
const std::string &  item_type 
)
inline

Definition at line 40 of file ForeignStorageException.h.

References to_string().

Referenced by parse_file_regions(), and foreign_storage::anonymous_namespace{AbstractTextFileDataWrapper.cpp}::throw_unexpected_number_of_items().

42  {
43  throw ForeignStorageException(
44  "Unexpected number of " + item_type +
45  " while loading from foreign data source: expected " +
46  std::to_string(num_expected) + " , obtained " + std::to_string(num_loaded) + " " +
47  item_type +
48  ". Please use the \"REFRESH FOREIGN TABLES\" command on the foreign table "
49  "if data source has been updated.");
50 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void foreign_storage::update_stats ( Encoder *  encoder,
const SQLTypeInfo column_type,
DataBlockPtr  data_block,
const size_t  row_count 
)

Updates the statistics metadata encapsulated in the encoder given new data in a data block.

Definition at line 496 of file AbstractTextFileDataWrapper.cpp.

References DataBlockPtr::arraysPtr, SQLTypeInfo::is_array(), SQLTypeInfo::is_varlen(), DataBlockPtr::numbersPtr, DataBlockPtr::stringsPtr, and Encoder::updateStats().

Referenced by process_data_blocks(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumn(), Fragmenter_Namespace::InsertOrderFragmenter::updateColumnMetadata(), and StorageIOFacility::yieldUpdateCallback().

499  {
500  if (column_type.is_array()) {
501  encoder->updateStats(data_block.arraysPtr, 0, row_count);
502  } else if (!column_type.is_varlen()) {
503  encoder->updateStats(data_block.numbersPtr, row_count);
504  } else {
505  encoder->updateStats(data_block.stringsPtr, 0, row_count);
506  }
507 }
std::vector< std::string > * stringsPtr
Definition: sqltypes.h:227
std::vector< ArrayDatum > * arraysPtr
Definition: sqltypes.h:228
bool is_varlen() const
Definition: sqltypes.h:536
int8_t * numbersPtr
Definition: sqltypes.h:226
virtual void updateStats(const int64_t val, const bool is_null)=0
bool is_array() const
Definition: sqltypes.h:518

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::shared_ptr< parquet::Statistics > foreign_storage::validate_and_get_column_metadata_statistics ( const parquet::ColumnChunkMetaData *  column_metadata)

Definition at line 86 of file ParquetShared.cpp.

References CHECK.

Referenced by foreign_storage::ParquetEncoder::getRowGroupMetadata(), foreign_storage::TypedParquetInPlaceEncoder< V, V >::getRowGroupMetadata(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_definition_levels().

87  {
88  CHECK(column_metadata->is_stats_set());
89  std::shared_ptr<parquet::Statistics> stats = column_metadata->statistics();
90  return stats;
91 }
#define CHECK(condition)
Definition: Logger.h:222

+ Here is the caller graph for this function:

void foreign_storage::validate_equal_column_descriptor ( const parquet::ColumnDescriptor *  reference_descriptor,
const parquet::ColumnDescriptor *  new_descriptor,
const std::string &  reference_file_path,
const std::string &  new_file_path 
)

Definition at line 60 of file ParquetShared.cpp.

Referenced by foreign_storage::LazyParquetChunkLoader::appendRowGroups(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().

64  {
65  if (!reference_descriptor->Equals(*new_descriptor)) {
66  throw std::runtime_error{"Parquet file \"" + new_file_path +
67  "\" has a different schema. Please ensure that all Parquet "
68  "files use the same schema. Reference Parquet file: " +
69  reference_file_path +
70  ", column name: " + reference_descriptor->name() +
71  ". New Parquet file: " + new_file_path +
72  ", column name: " + new_descriptor->name() + "."};
73  }
74 }

+ Here is the caller graph for this function:

void foreign_storage::validate_non_foreign_table_write ( const TableDescriptor *  table_descriptor)
inline

Definition at line 22 of file FsiUtils.h.

References StorageType::FOREIGN_TABLE, and TableDescriptor::storageType.

Referenced by Parser::InsertStmt::analyze(), Parser::TruncateTableStmt::execute(), Parser::InsertValuesStmt::execute(), Parser::InsertIntoTableAsSelectStmt::populateData(), and RelModify::RelModify().

22  {
23  if (table_descriptor && table_descriptor->storageType == StorageType::FOREIGN_TABLE) {
24  throw std::runtime_error{
25  "DELETE, INSERT, TRUNCATE, OR UPDATE commands are not supported for foreign "
26  "tables."};
27  }
28 }
std::string storageType
static constexpr char const * FOREIGN_TABLE

+ Here is the caller graph for this function:

void foreign_storage::validate_regex_parser_options ( const import_export::CopyParams copy_params)

Definition at line 100 of file ForeignDataWrapperFactory.cpp.

References import_export::CopyParams::line_regex.

Referenced by anonymous_namespace{ForeignDataImporter.cpp}::validate_copy_params().

100  {
101  if (copy_params.line_regex.empty()) {
102  throw std::runtime_error{"Regex parser options must contain a line regex."};
103  }
104 }

+ Here is the caller graph for this function:

Variable Documentation