OmniSciDB  d2f719934e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
file_path_util.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
24 #include "Shared/file_path_util.h"
25 
26 #include "Logger/Logger.h"
28 #include "Shared/misc.h"
29 
30 namespace shared {
31 
32 void validate_sort_options(const std::optional<std::string>& sort_by,
33  const std::optional<std::string>& sort_regex) {
34  const auto sort_by_str = to_upper(sort_by.value_or(PATHNAME_ORDER_TYPE));
35 
37  throw std::runtime_error{FILE_SORT_ORDER_BY_KEY +
38  " must be one of the following options: " +
40  }
41 
42  if (shared::contains(non_regex_sort_order_types, sort_by_str) &&
43  sort_regex.has_value()) {
44  throw std::runtime_error{"Option \"" + FILE_SORT_REGEX_KEY +
45  "\" must not be set for selected option \"" +
46  FILE_SORT_ORDER_BY_KEY + "='" + sort_by_str + "'\"."};
47  }
48 
49  if (shared::contains(regex_sort_order_types, sort_by_str) && !sort_regex.has_value()) {
50  throw std::runtime_error{"Option \"" + FILE_SORT_REGEX_KEY +
51  "\" must be set for selected option \"" +
52  FILE_SORT_ORDER_BY_KEY + "='" + sort_by_str + "'\"."};
53  }
54 }
55 
56 namespace {
57 
58 std::vector<std::string> glob_local_recursive_files(const std::string& file_path) {
59  std::vector<std::string> file_paths;
60 
61  if (boost::filesystem::is_regular_file(file_path)) {
62  file_paths.emplace_back(file_path);
63  } else if (boost::filesystem::is_directory(file_path)) {
64  for (boost::filesystem::recursive_directory_iterator
65  it(file_path, boost::filesystem::symlink_option::recurse),
66  eit;
67  it != eit;
68  ++it) {
69  if (!boost::filesystem::is_directory(it->path())) {
70  file_paths.emplace_back(it->path().string());
71  }
72  }
73  // empty directories will not throw an error
74  } else {
75  auto glob_results = omnisci::glob(file_path);
76  for (const auto& path : glob_results) {
77  if (boost::filesystem::is_directory(path)) {
78  auto expanded_paths = glob_local_recursive_files(path);
79  file_paths.insert(file_paths.end(), expanded_paths.begin(), expanded_paths.end());
80  } else {
81  file_paths.emplace_back(path);
82  }
83  }
84  if (file_paths.empty()) {
85  throw_file_not_found(file_path);
86  }
87  }
88  return file_paths;
89 }
90 
91 std::vector<std::string> regex_file_filter(const std::string& pattern,
92  const std::vector<std::string>& file_paths) {
93  boost::regex regex_pattern(pattern);
94  std::vector<std::string> matched_file_paths;
95  for (const auto& path : file_paths) {
96  if (boost::regex_match(path, regex_pattern)) {
97  matched_file_paths.emplace_back(path);
98  }
99  }
100  if (matched_file_paths.empty()) {
101  throw_no_filter_match(pattern);
102  }
103  return matched_file_paths;
104 }
105 
106 } // namespace
107 
108 std::vector<std::string> local_glob_filter_sort_files(
109  const std::string& file_path,
110  const std::optional<std::string>& filter_regex,
111  const std::optional<std::string>& sort_by,
112  const std::optional<std::string>& sort_regex) {
113  auto result_files = glob_local_recursive_files(file_path);
114  if (filter_regex.has_value()) {
115  result_files = regex_file_filter(filter_regex.value(), result_files);
116  }
117  // initial lexicographical order ensures a determinisitc ordering for files not matching
118  // sort_regex
119  auto initial_file_order = FileOrderLocal(std::nullopt, PATHNAME_ORDER_TYPE);
120  auto lexi_comp = initial_file_order.getFileComparator();
121  std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
122 
123  auto file_order = FileOrderLocal(sort_regex, sort_by);
124  auto comp = file_order.getFileComparator();
125  std::stable_sort(result_files.begin(), result_files.end(), comp);
126  return result_files;
127 }
128 
129 #ifdef HAVE_AWS_S3
130 namespace {
131 
132 std::vector<arrow::fs::FileInfo> arrow_fs_regex_file_filter(
133  const std::string& pattern,
134  const std::vector<arrow::fs::FileInfo>& file_info_list) {
135  boost::regex regex_pattern(pattern);
136  std::vector<arrow::fs::FileInfo> matched_file_info_list;
137  for (const auto& file_info : file_info_list) {
138  if (boost::regex_match(file_info.path(), regex_pattern)) {
139  matched_file_info_list.emplace_back(file_info);
140  }
141  }
142  if (matched_file_info_list.empty()) {
143  throw_no_filter_match(pattern);
144  }
145  return matched_file_info_list;
146 }
147 
148 } // namespace
149 
150 std::vector<arrow::fs::FileInfo> arrow_fs_filter_sort_files(
151  const std::vector<arrow::fs::FileInfo>& file_paths,
152  const std::optional<std::string>& filter_regex,
153  const std::optional<std::string>& sort_by,
154  const std::optional<std::string>& sort_regex) {
155  auto result_files = filter_regex.has_value()
156  ? arrow_fs_regex_file_filter(filter_regex.value(), file_paths)
157  : file_paths;
158  // initial lexicographical order ensures a determinisitc ordering for files not matching
159  // sort_regex
160  auto initial_file_order = FileOrderArrow(std::nullopt, PATHNAME_ORDER_TYPE);
161  auto lexi_comp = initial_file_order.getFileComparator();
162  std::stable_sort(result_files.begin(), result_files.end(), lexi_comp);
163 
164  auto file_order = FileOrderArrow(sort_regex, sort_by);
165  auto comp = file_order.getFileComparator();
166  std::stable_sort(result_files.begin(), result_files.end(), comp);
167  return result_files;
168 }
169 
170 #endif // HAVE_AWS_S3
171 
172 } // namespace shared
bool contains(const T &container, const U &element)
Definition: misc.h:187
const std::array< std::string, 2 > non_regex_sort_order_types
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const std::optional< std::string > &filter_regex, const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
void throw_no_filter_match(const std::string &pattern)
const std::string FILE_SORT_REGEX_KEY
Shared utility for globbing files; paths can be specified as a single file, a directory, or a wildcard pattern.
void validate_sort_options(const std::optional< std::string > &sort_by, const std::optional< std::string > &sort_regex)
std::string join(T const &container, std::string const &delim)
void throw_file_not_found(const std::string &file_path)
std::vector< std::string > glob(const std::string &pattern)
const std::string PATHNAME_ORDER_TYPE
const std::string FILE_SORT_ORDER_BY_KEY
std::string to_upper(const std::string &str)
std::vector< std::string > glob_local_recursive_files(const std::string &file_path)
const std::array< std::string, 5 > supported_file_sort_order_types
const std::array< std::string, 3 > regex_sort_order_types
std::vector< std::string > regex_file_filter(const std::string &pattern, const std::vector< std::string > &file_paths)