OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
file_path_util.h
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #pragma once
26 #include <array>
27 #include <optional>
28 #include <stdexcept>
29 #include <string>
30 #include <vector>
31 
32 #ifdef HAVE_AWS_S3
33 #include <arrow/filesystem/filesystem.h>
34 #endif // HAVE_AWS_S3
35 #include <boost/filesystem.hpp>
36 
37 #include "Shared/DateTimeParser.h"
38 #include "Shared/StringTransform.h"
40 
41 namespace shared {
42 
43 using LocalFileComparator = std::function<bool(const std::string&, const std::string&)>;
44 #ifdef HAVE_AWS_S3
45 using ArrowFsComparator =
46  std::function<bool(const arrow::fs::FileInfo&, const arrow::fs::FileInfo&)>;
47 #endif // HAVE_AWS_S3
48 
// Keys under which the sort order and the sort regex options are named.
inline const std::string FILE_SORT_ORDER_BY_KEY{"FILE_SORT_ORDER_BY"};
inline const std::string FILE_SORT_REGEX_KEY{"FILE_SORT_REGEX"};

// Names of the individual sort order types. PATHNAME_ORDER_TYPE is the one
// FileOrderBase::getSortBy() falls back to when no sort order is given.
inline const std::string PATHNAME_ORDER_TYPE{"PATHNAME"};
inline const std::string DATE_MODIFIED_ORDER_TYPE{"DATE_MODIFIED"};
inline const std::string REGEX_ORDER_TYPE{"REGEX"};
inline const std::string REGEX_DATE_ORDER_TYPE{"REGEX_DATE"};
inline const std::string REGEX_NUMBER_ORDER_TYPE{"REGEX_NUMBER"};
57 
58 inline const std::array<std::string, 5> supported_file_sort_order_types{
64 
65 inline const std::array<std::string, 2> non_regex_sort_order_types{
68 
69 inline const std::array<std::string, 3> regex_sort_order_types{REGEX_ORDER_TYPE,
72 
/**
 * Exception type used to report a missing file or directory (it is what
 * throw_file_not_found() throws). Derives from std::runtime_error, so existing
 * handlers for std::runtime_error / std::exception also catch it.
 */
class FileNotFoundException : public std::runtime_error {
 public:
  /**
   * @param error_message Text returned by what().
   *
   * Marked explicit so that a plain std::string can never be converted into an
   * exception object implicitly.
   */
  explicit FileNotFoundException(const std::string& error_message)
      : std::runtime_error(error_message) {}
};
78 
79 inline void throw_file_not_found(const std::string& file_path) {
80  throw FileNotFoundException{"File or directory \"" + file_path + "\" does not exist."};
81 }
82 
/**
 * Exception type used to report that a regex filter matched no files (it is
 * what throw_no_filter_match() throws). Derives from std::runtime_error, so
 * existing handlers for std::runtime_error / std::exception also catch it.
 */
class NoRegexFilterMatchException : public std::runtime_error {
 public:
  /**
   * @param error_message Text returned by what().
   *
   * Marked explicit so that a plain std::string can never be converted into an
   * exception object implicitly.
   */
  explicit NoRegexFilterMatchException(const std::string& error_message)
      : std::runtime_error(error_message) {}
};
88 
89 inline void throw_no_filter_match(const std::string& pattern) {
90  throw NoRegexFilterMatchException{"No files matched the regex file path \"" + pattern +
91  "\"."};
92 }
93 
94 void validate_sort_options(const std::optional<std::string>& sort_by,
95  const std::optional<std::string>& sort_regex);
96 
97 std::vector<std::string> local_glob_filter_sort_files(
98  const std::string& file_path,
99  const std::optional<std::string>& filter_regex,
100  const std::optional<std::string>& sort_by,
101  const std::optional<std::string>& sort_regex,
102  const bool recurse = true);
103 
104 #ifdef HAVE_AWS_S3
105 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
106  const std::vector<arrow::fs::FileInfo>& file_paths,
107  const std::optional<std::string>& filter_regex,
108  const std::optional<std::string>& sort_by,
109  const std::optional<std::string>& sort_regex);
110 #endif // HAVE_AWS_S3
111 
112 const std::function<bool(const std::string&, const std::string&)>
113  common_regex_date_comp_ = [](const std::string& lhs, const std::string& rhs) -> bool {
114  int64_t lhs_t;
115  int64_t rhs_t;
116  try {
117  lhs_t = dateTimeParse<kDATE>(lhs, 0);
118  } catch (const std::exception& e) {
119  lhs_t = 0;
120  }
121  try {
122  rhs_t = dateTimeParse<kDATE>(rhs, 0);
123  } catch (const std::exception& e) {
124  rhs_t = 0;
125  }
126  return lhs_t < rhs_t;
127 };
/**
 * Strict "less than" comparator over two strings interpreted as integers.
 *
 * Each string is converted with std::stoll (base 10). If the conversion throws
 * any std::exception (std::stoll reports non-numeric input and out-of-range
 * values that way), the string is treated as the value 0. The converted values
 * are then compared with operator<.
 *
 * Declared `inline` so that every translation unit including this header
 * shares a single object. Without it the const object has internal linkage,
 * and the in-class lambdas that call it would refer to a different object in
 * each translation unit.
 */
inline const std::function<bool(const std::string&, const std::string&)>
    common_regex_number_comp_ =
        [](const std::string& lhs, const std::string& rhs) -> bool {
  // Shared conversion step: unparsable input falls back to 0 instead of propagating.
  const auto parse_or_zero = [](const std::string& text) -> int64_t {
    try {
      return std::stoll(text);
    } catch (const std::exception&) {
      return 0;
    }
  };
  // Named locals keep the original evaluation order (lhs first, then rhs).
  const int64_t lhs_i = parse_or_zero(lhs);
  const int64_t rhs_i = parse_or_zero(rhs);
  return lhs_i < rhs_i;
};
145 
146 template <class T>
148  public:
149  inline FileOrderBase(const std::optional<std::string>& sort_regex,
150  const std::optional<std::string>& sort_by)
151  : sort_regex_(sort_regex), sort_by_(sort_by) {}
152 
153  virtual inline std::string concatCaptureGroups(const std::string& file_name) const {
154  CHECK(sort_regex_.has_value());
155  boost::match_results<std::string::const_iterator> capture_groups;
156  boost::regex regex_pattern(sort_regex_.value());
157 
158  if (boost::regex_search(file_name, capture_groups, regex_pattern)) {
159  std::stringstream ss;
160  for (size_t i = 1; i < capture_groups.size(); i++) {
161  ss << capture_groups[i];
162  }
163  return ss.str();
164  }
165  return ""; // Empty strings sorted to beginning
166  }
167 
168  virtual inline std::string getSortBy() {
169  return to_upper(sort_by_.value_or(PATHNAME_ORDER_TYPE));
170  }
171 
172  virtual T getFileComparator() = 0;
173 
174  protected:
175  std::optional<std::string> sort_regex_;
176  std::optional<std::string> sort_by_;
177 };
178 
179 class FileOrderLocal : public FileOrderBase<LocalFileComparator> {
180  public:
181  FileOrderLocal(const std::optional<std::string>& sort_regex,
182  const std::optional<std::string>& sort_by)
183  : FileOrderBase<LocalFileComparator>(sort_regex, sort_by) {}
184 
186  auto comparator_pair = comparator_map_.find(getSortBy());
187  CHECK(comparator_pair != comparator_map_.end());
188  return comparator_pair->second;
189  }
190 
191  protected:
192  const std::map<std::string, LocalFileComparator> comparator_map_{
194  [](const std::string& lhs, const std::string& rhs) -> bool { return lhs < rhs; }},
196  [](const std::string& lhs, const std::string& rhs) -> bool {
197  return boost::filesystem::last_write_time(lhs) <
198  boost::filesystem::last_write_time(rhs);
199  }},
201  [this](const std::string& lhs, const std::string& rhs) -> bool {
202  return this->concatCaptureGroups(lhs) < this->concatCaptureGroups(rhs);
203  }},
205  [this](const std::string& lhs, const std::string& rhs) -> bool {
207  this->concatCaptureGroups(rhs));
208  }},
210  [this](const std::string& lhs, const std::string& rhs) -> bool {
212  this->concatCaptureGroups(rhs));
213  }}};
214 };
215 
216 #ifdef HAVE_AWS_S3
217 
218 class FileOrderArrow : public FileOrderBase<ArrowFsComparator> {
219  public:
220  FileOrderArrow(const std::optional<std::string>& sort_regex,
221  const std::optional<std::string>& sort_by)
222  : FileOrderBase<ArrowFsComparator>(sort_regex, sort_by) {}
223 
224  inline ArrowFsComparator getFileComparator() override {
225  auto comparator_pair = comparator_map_.find(getSortBy());
226  CHECK(comparator_pair != comparator_map_.end());
227  return comparator_pair->second;
228  }
229 
230  protected:
231  const std::map<std::string, ArrowFsComparator> comparator_map_{
233  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
234  return lhs.path() < rhs.path();
235  }},
237  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
238  return lhs.mtime() < rhs.mtime();
239  }},
241  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
242  auto lhs_name = lhs.path();
243  auto rhs_name = rhs.path();
244  return this->concatCaptureGroups(lhs_name) < this->concatCaptureGroups(rhs_name);
245  }},
247  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
248  return common_regex_date_comp_(this->concatCaptureGroups(lhs.path()),
249  this->concatCaptureGroups(rhs.path()));
250  }},
252  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
253  return common_regex_number_comp_(this->concatCaptureGroups(lhs.path()),
254  this->concatCaptureGroups(rhs.path()));
255  }}};
256 };
257 
258 #endif // HAVE_AWS_S3
259 
260 bool file_or_glob_path_exists(const std::string& path);
261 
262 } // namespace shared
const std::array< std::string, 2 > non_regex_sort_order_types
std::function< bool(const std::string &, const std::string &)> LocalFileComparator
const std::string REGEX_NUMBER_ORDER_TYPE
LocalFileComparator getFileComparator() override
void throw_no_filter_match(const std::string &pattern)
const std::string REGEX_ORDER_TYPE
const std::string FILE_SORT_REGEX_KEY
void validate_sort_options(const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
virtual std::string getSortBy()
NoRegexFilterMatchException(const std::string &error_message)
const std::string REGEX_DATE_ORDER_TYPE
void throw_file_not_found(const std::string &file_path)
FileNotFoundException(const std::string &error_message)
const std::string PATHNAME_ORDER_TYPE
FileOrderBase(const std::optional< std::string > &sort_regex, const std::optional< std::string > &sort_by)
const std::string FILE_SORT_ORDER_BY_KEY
const std::map< std::string, LocalFileComparator > comparator_map_
std::string to_upper(const std::string &str)
bool file_or_glob_path_exists(const std::string &path)
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const std::optional< std::string > &filter_regex, const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex, const bool recurse)
std::optional< std::string > sort_regex_
const std::array< std::string, 5 > supported_file_sort_order_types
const std::string DATE_MODIFIED_ORDER_TYPE
virtual T getFileComparator()=0
virtual std::string concatCaptureGroups(const std::string &file_name) const
#define CHECK(condition)
Definition: Logger.h:223
const std::array< std::string, 3 > regex_sort_order_types
FileOrderLocal(const std::optional< std::string > &sort_regex, const std::optional< std::string > &sort_by)
std::optional< std::string > sort_by_
const std::function< bool(const std::string &, const std::string &)> common_regex_number_comp_
const std::function< bool(const std::string &, const std::string &)> common_regex_date_comp_