#include <Importer.h>

Inheritance diagram for import_export::DataStreamSink:

Collaboration diagram for import_export::DataStreamSink:

Public Member Functions
	DataStreamSink ()

	DataStreamSink (const CopyParams &copy_params, const std::string file_path)

virtual	~DataStreamSink ()

virtual ImportStatus	importDelimited (const std::string &file_path, const bool decompressed, const Catalog_Namespace::SessionInfo *session_info)=0

const CopyParams &	get_copy_params () const

void	import_compressed (std::vector< std::string > &file_paths, const Catalog_Namespace::SessionInfo *session_info)

Protected Member Functions
ImportStatus	archivePlumber (const Catalog_Namespace::SessionInfo *session_info)

Protected Attributes
CopyParams	copy_params

const std::string	file_path

FILE *	p_file = nullptr

ImportStatus	import_status_

heavyai::shared_mutex	import_mutex_

size_t	total_file_size {0}

std::vector< size_t >	file_offsets

std::mutex	file_offsets_mutex

Detailed Description

Definition at line 693 of file Importer.h.

Constructor & Destructor Documentation

import_export::DataStreamSink::DataStreamSink ( )

inline

Definition at line 695 of file Importer.h.

695 {}

import_export::DataStreamSink::DataStreamSink	(	const CopyParams &	copy_params,
		const std::string	file_path
	)

inline

Definition at line 696 of file Importer.h.

697 : copy_params(copy_params), file_path(file_path) {}

import_export::DataStreamSink::copy_params

CopyParams copy_params

Definition: Importer.h:719

import_export::DataStreamSink::file_path

const std::string file_path

Definition: Importer.h:720

virtual import_export::DataStreamSink::~DataStreamSink ( )

inline virtual

Definition at line 698 of file Importer.h.

698 {}

Member Function Documentation

ImportStatus import_export::DataStreamSink::archivePlumber ( const Catalog_Namespace::SessionInfo * session_info )

protected

Definition at line 3560 of file Importer.cpp.

References copy_params, file_path, import_export::CopyParams::file_sort_order_by, import_export::CopyParams::file_sort_regex, get_filesize(), import_compressed(), import_status_, import_export::kParquetFile, shared::local_glob_filter_sort_files(), import_export::CopyParams::regex_path_filter, import_export::CopyParams::source_type, total_file_size, and shared::validate_sort_options().

Referenced by import_export::Importer::import(), and import_export::Detector::read_file().

                                                       {
   // in generalized importing scheme, reaching here file_path may
   // contain a file path, a url or a wildcard of file paths.
   // see CopyTableStmt::execute.
 
   std::vector<std::string> file_paths;
   try {
     const shared::FilePathOptions options{copy_params.regex_path_filter,
                                           copy_params.file_sort_order_by,
                                           copy_params.file_sort_regex};
     shared::validate_sort_options(options);
     file_paths = shared::local_glob_filter_sort_files(file_path, options);
   } catch (const shared::FileNotFoundException& e) {
     // After finding no matching files locally, file_path may still be an s3 url
     file_paths.push_back(file_path);
   }
 
   // sum up sizes of all local files -- only for local files. if
   // file_path is a s3 url, sizes will be obtained via S3Archive.
   for (const auto& file_path : file_paths) {
     total_file_size += get_filesize(file_path);
   }
 
   // s3 parquet goes different route because the files do not use libarchive
   // but parquet api, and they need to landed like .7z files.
   //
   // note: parquet must be explicitly specified by a WITH parameter
   // "source_type='parquet_file'", because for example spark sql users may specify a
   // output url w/o file extension like this:
   //                df.write
   //                  .mode("overwrite")
   //                  .parquet("s3://bucket/folder/parquet/mydata")
   // without the parameter, it means plain or compressed csv files.
   // note: .ORC and AVRO files should follow a similar path to Parquet?
   if (copy_params.source_type == import_export::SourceType::kParquetFile) {
 #ifdef ENABLE_IMPORT_PARQUET
     import_parquet(file_paths, session_info);
 #else
     throw std::runtime_error("Parquet not supported!");
 #endif
   } else {
     import_compressed(file_paths, session_info);
   }
 
   return import_status_;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

const CopyParams& import_export::DataStreamSink::get_copy_params ( ) const

inline

Definition at line 710 of file Importer.h.

References copy_params.

Referenced by DBHandler::detect_column_types().

                                             {
     return copy_params;
   }

Here is the caller graph for this function:

void import_export::DataStreamSink::import_compressed	(	std::vector< std::string > &	file_paths,
		const Catalog_Namespace::SessionInfo *	session_info
	)

Definition at line 4086 of file Importer.cpp.

References heavyai::close(), file_offsets, file_offsets_mutex, import_mutex_, import_status_, importDelimited(), import_export::kImportRowLimit, import_export::kNoHeader, import_export::ImportStatus::load_failed, LOG, p_file, Archive::parse_url(), import_export::ImportStatus::rows_completed, total_file_size, logger::WARNING, and File_Namespace::write().

Referenced by archivePlumber().

                                                       {
   // a new requirement is to have one single input stream into
   // Importer::importDelimited, so need to move pipe related
   // stuff to the outmost block.
   int fd[2];
 #ifdef _WIN32
   // For some reason when folly is used to create the pipe, reader can
   // read nothing.
   auto pipe_res =
       _pipe(fd, static_cast<unsigned int>(copy_params.buffer_size), _O_BINARY);
 #else
   auto pipe_res = pipe(fd);
 #endif
   if (pipe_res < 0) {
     throw std::runtime_error(std::string("failed to create a pipe: ") + strerror(errno));
   }
 #ifndef _WIN32
   signal(SIGPIPE, SIG_IGN);
 #endif
 
   std::exception_ptr teptr;
   // create a thread to read uncompressed byte stream out of pipe and
   // feed into importDelimited()
   ImportStatus ret1;
   auto th_pipe_reader = std::thread([&]() {
     try {
       // importDelimited will read from FILE* p_file
       if (0 == (p_file = fdopen(fd[0], "r"))) {
         throw std::runtime_error(std::string("failed to open a pipe: ") +
                                  strerror(errno));
       }
 
       // in future, depending on data types of this uncompressed stream
       // it can be feed into other function such like importParquet, etc
       ret1 = importDelimited(file_path, true, session_info);
 
     } catch (...) {
       if (!teptr) {  // no replace
         teptr = std::current_exception();
       }
     }
 
     if (p_file) {
       fclose(p_file);
     }
     p_file = 0;
   });
 
   // create a thread to iterate all files (in all archives) and
   // forward the uncompressed byte stream to fd[1] which is
   // then feed into importDelimited, importParquet, and etc.
   auto th_pipe_writer = std::thread([&]() {
     std::unique_ptr<S3Archive> us3arch;
     bool stop = false;
     for (size_t fi = 0; !stop && fi < file_paths.size(); fi++) {
       try {
         auto file_path = file_paths[fi];
         std::unique_ptr<Archive> uarch;
         std::map<int, std::string> url_parts;
         Archive::parse_url(file_path, url_parts);
         const std::string S3_objkey_url_scheme = "s3ok";
         if ("file" == url_parts[2] || "" == url_parts[2]) {
           uarch.reset(new PosixFileArchive(file_path, copy_params.plain_text));
         } else if ("s3" == url_parts[2]) {
 #ifdef HAVE_AWS_S3
           // new a S3Archive with a shared s3client.
           // should be safe b/c no wildcard with s3 url
           us3arch.reset(new S3Archive(file_path,
                                       copy_params.s3_access_key,
                                       copy_params.s3_secret_key,
                                       copy_params.s3_session_token,
                                       copy_params.s3_region,
                                       copy_params.s3_endpoint,
                                       copy_params.plain_text,
                                       copy_params.regex_path_filter,
                                       copy_params.file_sort_order_by,
                                       copy_params.file_sort_regex));
           us3arch->init_for_read();
           total_file_size += us3arch->get_total_file_size();
           // not land all files here but one by one in following iterations
           for (const auto& objkey : us3arch->get_objkeys()) {
             file_paths.emplace_back(std::string(S3_objkey_url_scheme) + "://" + objkey);
           }
           continue;
 #else
           throw std::runtime_error("AWS S3 support not available");
 #endif  // HAVE_AWS_S3
         } else if (S3_objkey_url_scheme == url_parts[2]) {
 #ifdef HAVE_AWS_S3
           auto objkey = file_path.substr(3 + S3_objkey_url_scheme.size());
           auto file_path =
               us3arch->land(objkey, teptr, nullptr != dynamic_cast<Detector*>(this));
           if (0 == file_path.size()) {
             throw std::runtime_error(std::string("failed to land s3 object: ") + objkey);
           }
           uarch.reset(new PosixFileArchive(file_path, copy_params.plain_text));
           // file not removed until file closed
           us3arch->vacuum(objkey);
 #else
           throw std::runtime_error("AWS S3 support not available");
 #endif  // HAVE_AWS_S3
         }
 #if 0  // TODO(ppan): implement and enable any other archive class
         else
         if ("hdfs" == url_parts[2])
           uarch.reset(new HdfsArchive(file_path));
 #endif
         else {
           throw std::runtime_error(std::string("unsupported archive url: ") + file_path);
         }
 
         // init the archive for read
         auto& arch = *uarch;
 
         // coming here, the archive of url should be ready to be read, unarchived
         // and uncompressed by libarchive into a byte stream (in csv) for the pipe
         const void* buf;
         size_t size;
         bool just_saw_archive_header;
         bool is_detecting = nullptr != dynamic_cast<Detector*>(this);
         bool first_text_header_skipped = false;
         // start reading uncompressed bytes of this archive from libarchive
         // note! this archive may contain more than one files!
         file_offsets.push_back(0);
         size_t num_block_read = 0;
         while (!stop && !!(just_saw_archive_header = arch.read_next_header())) {
           bool insert_line_delim_after_this_file = false;
           while (!stop) {
             int64_t offset{-1};
             auto ok = arch.read_data_block(&buf, &size, &offset);
             // can't use (uncompressed) size, so track (max) file offset.
             // also we want to capture offset even on e.o.f.
             if (offset > 0) {
               std::unique_lock<std::mutex> lock(file_offsets_mutex);
               file_offsets.back() = offset;
             }
             if (!ok) {
               break;
             }
             // one subtle point here is now we concatenate all files
             // to a single FILE stream with which we call importDelimited
             // only once. this would make it misunderstand that only one
             // header line is with this 'single' stream, while actually
             // we may have one header line for each of the files.
             // so we need to skip header lines here instead in importDelimited.
             const char* buf2 = (const char*)buf;
             int size2 = size;
             if (copy_params.has_header != import_export::ImportHeaderRow::kNoHeader &&
                 just_saw_archive_header && (first_text_header_skipped || !is_detecting)) {
               while (size2-- > 0) {
                 if (*buf2++ == copy_params.line_delim) {
                   break;
                 }
               }
               if (size2 <= 0) {
                 LOG(WARNING) << "No line delimiter in block." << std::endl;
               } else {
                 just_saw_archive_header = false;
                 first_text_header_skipped = true;
               }
             }
             // In very rare occasions the write pipe somehow operates in a mode similar
             // to non-blocking while pipe(fds) should behave like pipe2(fds, 0) which
             // means blocking mode. On such a unreliable blocking mode, a possible fix
             // is to loop reading till no bytes left, otherwise the annoying `failed to
             // write pipe: Success`...
             if (size2 > 0) {
               int nremaining = size2;
               while (nremaining > 0) {
                 // try to write the entire remainder of the buffer to the pipe
                 int nwritten = write(fd[1], buf2, nremaining);
                 // how did we do?
                 if (nwritten < 0) {
                   // something bad happened
                   if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) {
                     // ignore these, assume nothing written, try again
                     nwritten = 0;
                   } else if (errno == EPIPE &&
                              import_status_.rows_completed >= kImportRowLimit) {
                     // the reader thread has shut down the pipe from the read end
                     stop = true;
                     break;
                   } else {
                     // a real error
                     throw std::runtime_error(
                         std::string("failed or interrupted write to pipe: ") +
                         strerror(errno));
                   }
                 } else if (nwritten == nremaining) {
                   // we wrote everything; we're done
                   break;
                 }
                 // only wrote some (or nothing), try again
                 nremaining -= nwritten;
                 buf2 += nwritten;
                 // no exception when too many rejected
                 heavyai::shared_lock<heavyai::shared_mutex> read_lock(import_mutex_);
                 if (import_status_.load_failed) {
                   stop = true;
                   break;
                 }
               }
               // check that this file (buf for size) ended with a line delim
               if (size > 0) {
                 const char* plast = static_cast<const char*>(buf) + (size - 1);
                 insert_line_delim_after_this_file = (*plast != copy_params.line_delim);
               }
             }
             ++num_block_read;
           }
 
           // if that file didn't end with a line delim, we insert one here to terminate
           // that file's stream use a loop for the same reason as above
           if (insert_line_delim_after_this_file) {
             while (true) {
               // write the delim char to the pipe
               int nwritten = write(fd[1], &copy_params.line_delim, 1);
               // how did we do?
               if (nwritten < 0) {
                 // something bad happened
                 if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) {
                   // ignore these, assume nothing written, try again
                   nwritten = 0;
                 } else if (errno == EPIPE &&
                            import_status_.rows_completed >= kImportRowLimit) {
                   // the reader thread has shut down the pipe from the read end
                   stop = true;
                   break;
                 } else {
                   // a real error
                   throw std::runtime_error(
                       std::string("failed or interrupted write to pipe: ") +
                       strerror(errno));
                 }
               } else if (nwritten == 1) {
                 // we wrote it; we're done
                 break;
               }
             }
           }
         }
       } catch (...) {
         // when import is aborted because too many data errors or because end of a
         // detection, any exception thrown by s3 sdk or libarchive is okay and should be
         // suppressed.
         heavyai::shared_lock<heavyai::shared_mutex> read_lock(import_mutex_);
         if (import_status_.load_failed) {
           break;
         }
         if (import_status_.rows_completed > 0) {
           if (nullptr != dynamic_cast<Detector*>(this)) {
             break;
           }
         }
         if (!teptr) {  // no replace
           teptr = std::current_exception();
         }
         break;
       }
     }
     // close writer end
     close(fd[1]);
   });
 
   th_pipe_reader.join();
   th_pipe_writer.join();
 
   // rethrow any exception happened herebefore
   if (teptr) {
     std::rethrow_exception(teptr);
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

virtual ImportStatus import_export::DataStreamSink::importDelimited	(	const std::string &	file_path,
		const bool	decompressed,
		const Catalog_Namespace::SessionInfo *	session_info
	)