OmniSciDB  d2f719934e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
file_path_util.h
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #pragma once
26 #include <array>
27 #include <optional>
28 #include <stdexcept>
29 #include <string>
30 #include <vector>
31 
32 #ifdef HAVE_AWS_S3
33 #include <arrow/filesystem/filesystem.h>
34 #endif // HAVE_AWS_S3
35 #include <boost/filesystem.hpp>
36 #include <boost/regex.hpp>
37 
38 #include "Shared/DateTimeParser.h"
39 #include "Shared/StringTransform.h"
40 
41 namespace shared {
42 
/// Strict-weak-ordering style predicate over two local file paths given as strings.
using LocalFileComparator =
    std::function<bool(const std::string&, const std::string&)>;
#ifdef HAVE_AWS_S3
/// Same kind of predicate, but over two Arrow filesystem `FileInfo` entries.
/// Only available when the build defines HAVE_AWS_S3.
using ArrowFsComparator =
    std::function<bool(const arrow::fs::FileInfo&, const arrow::fs::FileInfo&)>;
#endif  // HAVE_AWS_S3
48 
// Names of the sort-related options.
// NOTE(review): presumably these are looked up as option keys by callers — confirm.
inline const std::string FILE_SORT_ORDER_BY_KEY{"FILE_SORT_ORDER_BY"};
inline const std::string FILE_SORT_REGEX_KEY{"FILE_SORT_REGEX"};

// Recognised sort-order type names. FileOrderBase::getSortBy() upper-cases the
// requested value and falls back to PATHNAME_ORDER_TYPE when none was given.
inline const std::string PATHNAME_ORDER_TYPE{"PATHNAME"};
inline const std::string DATE_MODIFIED_ORDER_TYPE{"DATE_MODIFIED"};
inline const std::string REGEX_ORDER_TYPE{"REGEX"};
inline const std::string REGEX_DATE_ORDER_TYPE{"REGEX_DATE"};
inline const std::string REGEX_NUMBER_ORDER_TYPE{"REGEX_NUMBER"};
57 
58 inline const std::array<std::string, 5> supported_file_sort_order_types{
64 
65 inline const std::array<std::string, 2> non_regex_sort_order_types{
68 
69 inline const std::array<std::string, 3> regex_sort_order_types{REGEX_ORDER_TYPE,
72 
/// Runtime error type used to report that a file or directory does not exist.
/// The message passed in is forwarded unchanged to std::runtime_error.
class FileNotFoundException : public std::runtime_error {
 public:
  FileNotFoundException(const std::string& error_message)
      : std::runtime_error{error_message} {}
};
78 
79 inline void throw_file_not_found(const std::string& file_path) {
80  throw FileNotFoundException{"File or directory \"" + file_path + "\" does not exist."};
81 }
82 
/// Runtime error type used to report that a regex file-path filter matched no files.
/// The message passed in is forwarded unchanged to std::runtime_error.
class NoRegexFilterMatchException : public std::runtime_error {
 public:
  NoRegexFilterMatchException(const std::string& error_message)
      : std::runtime_error{error_message} {}
};
88 
89 inline void throw_no_filter_match(const std::string& pattern) {
90  throw NoRegexFilterMatchException{"No files matched the regex file path \"" + pattern +
91  "\"."};
92 }
93 
/**
 * Declaration only; the definition is not visible in this header.
 * NOTE(review): judging by the name, this checks that the sort-by and sort-regex
 * options are a valid combination — confirm against the implementation.
 *
 * @param sort_by    optional sort-order type name.
 * @param sort_regex optional regex used by the regex-based sort orders.
 */
void validate_sort_options(const std::optional<std::string>& sort_by,
                           const std::optional<std::string>& sort_regex);
96 
/**
 * Declaration only; the definition is not visible in this header.
 * NOTE(review): the name suggests glob expansion of `file_path` followed by
 * regex filtering and sorting — confirm against the implementation.
 *
 * @param file_path    local path (or pattern) to resolve.
 * @param filter_regex optional regex used for filtering.
 * @param sort_by      optional sort-order type name.
 * @param sort_regex   optional regex used by the regex-based sort orders.
 * @return list of file paths as strings.
 */
std::vector<std::string> local_glob_filter_sort_files(
    const std::string& file_path,
    const std::optional<std::string>& filter_regex,
    const std::optional<std::string>& sort_by,
    const std::optional<std::string>& sort_regex);
102 
103 #ifdef HAVE_AWS_S3
104 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
105  const std::vector<arrow::fs::FileInfo>& file_paths,
106  const std::optional<std::string>& filter_regex,
107  const std::optional<std::string>& sort_by,
108  const std::optional<std::string>& sort_regex);
109 #endif // HAVE_AWS_S3
110 
111 const std::function<bool(const std::string&, const std::string&)>
112  common_regex_date_comp_ = [](const std::string& lhs, const std::string& rhs) -> bool {
113  int64_t lhs_t;
114  int64_t rhs_t;
115  try {
116  lhs_t = dateTimeParse<kDATE>(lhs, 0);
117  } catch (const std::exception& e) {
118  lhs_t = 0;
119  }
120  try {
121  rhs_t = dateTimeParse<kDATE>(rhs, 0);
122  } catch (const std::exception& e) {
123  rhs_t = 0;
124  }
125  return lhs_t < rhs_t;
126 };
/**
 * Less-than comparator for two strings that are parsed as signed integers.
 *
 * Each argument is converted with std::stoll (base 10). If the conversion throws
 * (for example no leading digits, or the value is out of range) the value 0 is
 * used instead, so unparsable strings compare equal to each other and to "0".
 *
 * Declared `inline` (like the other globals in this header) so that all
 * translation units share one object instead of each getting a private copy.
 *
 * @return true when the parsed value of `lhs` is strictly less than that of `rhs`.
 */
inline const std::function<bool(const std::string&, const std::string&)>
    common_regex_number_comp_ =
        [](const std::string& lhs, const std::string& rhs) -> bool {
  // Shared fallback logic: any conversion failure maps to 0.
  const auto to_int_or_zero = [](const std::string& text) -> int64_t {
    try {
      return std::stoll(text);
    } catch (const std::exception&) {
      return 0;
    }
  };
  const int64_t lhs_i = to_int_or_zero(lhs);
  const int64_t rhs_i = to_int_or_zero(rhs);
  return lhs_i < rhs_i;
};
144 
145 template <class T>
147  public:
148  inline FileOrderBase(const std::optional<std::string>& sort_regex,
149  const std::optional<std::string>& sort_by)
150  : sort_regex_(sort_regex), sort_by_(sort_by) {}
151 
152  virtual inline std::string concatCaptureGroups(const std::string& file_name) const {
153  CHECK(sort_regex_.has_value());
154  boost::match_results<std::string::const_iterator> capture_groups;
155  boost::regex regex_pattern(sort_regex_.value());
156 
157  if (boost::regex_search(file_name, capture_groups, regex_pattern)) {
158  std::stringstream ss;
159  for (size_t i = 1; i < capture_groups.size(); i++) {
160  ss << capture_groups[i];
161  }
162  return ss.str();
163  }
164  return ""; // Empty strings sorted to beginning
165  }
166 
167  virtual inline std::string getSortBy() {
168  return to_upper(sort_by_.value_or(PATHNAME_ORDER_TYPE));
169  }
170 
171  virtual T getFileComparator() = 0;
172 
173  protected:
174  std::optional<std::string> sort_regex_;
175  std::optional<std::string> sort_by_;
176 };
177 
178 class FileOrderLocal : public FileOrderBase<LocalFileComparator> {
179  public:
180  FileOrderLocal(const std::optional<std::string>& sort_regex,
181  const std::optional<std::string>& sort_by)
182  : FileOrderBase<LocalFileComparator>(sort_regex, sort_by) {}
183 
185  auto comparator_pair = comparator_map_.find(getSortBy());
186  CHECK(comparator_pair != comparator_map_.end());
187  return comparator_pair->second;
188  }
189 
190  protected:
191  const std::map<std::string, LocalFileComparator> comparator_map_{
193  [](const std::string& lhs, const std::string& rhs) -> bool { return lhs < rhs; }},
195  [](const std::string& lhs, const std::string& rhs) -> bool {
196  return boost::filesystem::last_write_time(lhs) <
197  boost::filesystem::last_write_time(rhs);
198  }},
200  [this](const std::string& lhs, const std::string& rhs) -> bool {
201  return this->concatCaptureGroups(lhs) < this->concatCaptureGroups(rhs);
202  }},
204  [this](const std::string& lhs, const std::string& rhs) -> bool {
206  this->concatCaptureGroups(rhs));
207  }},
209  [this](const std::string& lhs, const std::string& rhs) -> bool {
211  this->concatCaptureGroups(rhs));
212  }}};
213 };
214 
215 #ifdef HAVE_AWS_S3
216 
217 class FileOrderArrow : public FileOrderBase<ArrowFsComparator> {
218  public:
219  FileOrderArrow(const std::optional<std::string>& sort_regex,
220  const std::optional<std::string>& sort_by)
221  : FileOrderBase<ArrowFsComparator>(sort_regex, sort_by) {}
222 
223  inline ArrowFsComparator getFileComparator() override {
224  auto comparator_pair = comparator_map_.find(getSortBy());
225  CHECK(comparator_pair != comparator_map_.end());
226  return comparator_pair->second;
227  }
228 
229  protected:
230  const std::map<std::string, ArrowFsComparator> comparator_map_{
232  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
233  return lhs.path() < rhs.path();
234  }},
236  [](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
237  return lhs.mtime() < rhs.mtime();
238  }},
240  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
241  auto lhs_name = lhs.path();
242  auto rhs_name = rhs.path();
243  return this->concatCaptureGroups(lhs_name) < this->concatCaptureGroups(rhs_name);
244  }},
246  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
247  return common_regex_date_comp_(this->concatCaptureGroups(lhs.path()),
248  this->concatCaptureGroups(rhs.path()));
249  }},
251  [this](const arrow::fs::FileInfo& lhs, const arrow::fs::FileInfo& rhs) -> bool {
252  return common_regex_number_comp_(this->concatCaptureGroups(lhs.path()),
253  this->concatCaptureGroups(rhs.path()));
254  }}};
255 };
256 
257 #endif // HAVE_AWS_S3
258 
259 } // namespace shared
const std::array< std::string, 2 > non_regex_sort_order_types
std::function< bool(const std::string &, const std::string &)> LocalFileComparator
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const std::optional< std::string > &filter_regex, const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
const std::string REGEX_NUMBER_ORDER_TYPE
LocalFileComparator getFileComparator() override
void throw_no_filter_match(const std::string &pattern)
const std::string REGEX_ORDER_TYPE
const std::string FILE_SORT_REGEX_KEY
void validate_sort_options(const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
virtual std::string getSortBy()
NoRegexFilterMatchException(const std::string &error_message)
const std::string REGEX_DATE_ORDER_TYPE
void throw_file_not_found(const std::string &file_path)
FileNotFoundException(const std::string &error_message)
const std::string PATHNAME_ORDER_TYPE
FileOrderBase(const std::optional< std::string > &sort_regex, const std::optional< std::string > &sort_by)
const std::string FILE_SORT_ORDER_BY_KEY
const std::map< std::string, LocalFileComparator > comparator_map_
std::string to_upper(const std::string &str)
std::optional< std::string > sort_regex_
const std::array< std::string, 5 > supported_file_sort_order_types
const std::string DATE_MODIFIED_ORDER_TYPE
virtual T getFileComparator()=0
virtual std::string concatCaptureGroups(const std::string &file_name) const
#define CHECK(condition)
Definition: Logger.h:211
const std::array< std::string, 3 > regex_sort_order_types
FileOrderLocal(const std::optional< std::string > &sort_regex, const std::optional< std::string > &sort_by)
std::optional< std::string > sort_by_
const std::function< bool(const std::string &, const std::string &)> common_regex_number_comp_
const std::function< bool(const std::string &, const std::string &)> common_regex_date_comp_