OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Archive.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ARCHIVE_ARCHIVE_H_
18 #define ARCHIVE_ARCHIVE_H_
19 
#include <archive.h>
#include <archive_entry.h>
#include <cstddef>
#include <cstdint>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
25 
26 // archive_entry.h includes windows.h
28 
29 // this is the base class from which all archives that represent files sources
30 // hosted on native/netwrok filesystems, AWS S3, HDFS, HTTP URL, FTP URL, ...
31 // etc are derived.
32 class Archive {
33  public:
34  Archive(const std::string url, const bool plain_text)
35  : url(url), plain_text(plain_text) {
36  parse_url(url, url_parts);
37 
38  if (0 == (ar = archive_read_new())) {
39  throw std::runtime_error(std::string("archive_read_new failed!"));
40  }
41 
46 //#define LIBARCHIVE_ENABLE_ALL
47 #ifdef LIBARCHIVE_ENABLE_ALL
48  // this increases ~800kb code size
49  archive_read_support_format_all(ar);
50  archive_read_support_filter_all(ar);
51  archive_read_support_format_raw(ar);
52 #else
53  // list supported formats to bypass the mtree exception
54  archive_read_support_format_ar(ar);
55  archive_read_support_format_cpio(ar);
56  archive_read_support_format_empty(ar);
57  archive_read_support_format_lha(ar);
58  archive_read_support_format_tar(ar);
59  archive_read_support_format_xar(ar);
60  archive_read_support_format_7zip(ar);
61  archive_read_support_format_cab(ar);
62  archive_read_support_format_rar(ar);
63  archive_read_support_format_iso9660(ar);
64  archive_read_support_format_zip(ar);
65 
66  archive_read_support_filter_bzip2(ar);
67  archive_read_support_filter_compress(ar);
68  archive_read_support_filter_gzip(ar);
69  archive_read_support_filter_lzip(ar);
70  archive_read_support_filter_lzma(ar);
71  archive_read_support_filter_xz(ar);
72  archive_read_support_filter_uu(ar);
73  archive_read_support_filter_rpm(ar);
74  archive_read_support_filter_lrzip(ar);
75  archive_read_support_filter_lzop(ar);
76  archive_read_support_filter_grzip(ar);
77 #endif
78  // libarchive assumes archive formats, so without this bzip2 and gzip won't work!
79  // see related issue at https://github.com/libarchive/libarchive/issues/586
80  archive_read_support_format_raw(ar);
81  }
82 
83  virtual ~Archive() {
84  if (ar) {
85  archive_read_close(ar);
86  }
87  if (ar) {
88  archive_read_free(ar);
89  }
90  ar = 0;
91  }
92 
93  virtual std::string archive_error(int err) {
94  auto cstr = archive_error_string(ar);
95  return std::string("libarchive error: ") +
96  (cstr ? std::string(cstr) : std::to_string(err));
97  }
98 
99  virtual bool read_next_header() {
100  int rc;
101  switch (rc = archive_read_next_header(ar, &entry)) {
102  case ARCHIVE_EOF:
103  return false; // signal caller end of stream
104  case ARCHIVE_OK:
105  return true;
106  }
107  throw std::runtime_error(archive_error(rc));
108  }
109 
110  virtual bool read_data_block(const void** buff, size_t* size, int64_t* offset) {
111  int rc;
112  switch (rc = archive_read_data_block(ar, buff, size, offset)) {
113  case ARCHIVE_EOF:
114  return false; // signal caller end of stream
115  case ARCHIVE_OK:
116  return true;
117  }
118  throw std::runtime_error(archive_error(rc));
119  }
120 
121  virtual int64_t get_position_compressed() const {
122  return archive_filter_bytes(ar, -1);
123  }
124 
125  /* !!!
126  7z files can't work with streaming model. Only local 7z files work.
127  That is, use archive_read_open_filename for 7z and any posix file.
128 
129  Any non-7z data source in generic url format has two options:
130  1) stream data to a local named pipe file
131  2) a) specify url in COPY FROM
132  b) define url-specific Archive class
133  c) customize init_for_read() which uses archive_read_open
134 
135  */
136  virtual int open() {
137  return ARCHIVE_OK;
138  } // nop
139  virtual int close() {
140  return ARCHIVE_OK;
141  } // nop
142  virtual ptrdiff_t read(const void** buff) {
143  return 0;
144  } // nop
145 
146  virtual void init_for_read() {
147  // set libarchive callbacks
148  archive_read_open(ar, this, Archive::open, Archive::read, Archive::close);
149  }
150 
151  // these methods are callback for libarchive
152  static ptrdiff_t read(struct archive* a, void* client_data, const void** buff) {
153  return ((Archive*)client_data)->read(buff);
154  }
155 
156  static int open(struct archive* a, void* client_data) {
157  return ((Archive*)client_data)->open();
158  }
159 
160  static int close(struct archive* a, void* client_data) {
161  return ((Archive*)client_data)->close();
162  }
163 
164  static void parse_url(const std::string url, std::map<int, std::string>& url_parts) {
165  /*
166  input example: http://localhost.com/path\?hue\=br\#cool
167  output should be:
168  0: http://localhost.com/path?hue=br#cool
169  1: http:
170  2: http
171  3: //localhost.com
172  4: localhost.com
173  5: /path
174  6: ?hue=br
175  7: hue=br
176  8: #cool
177  9: cool
178  */
179  std::smatch sm;
180  std::regex url_regex(R"(^(([^:\/?#]+):)?(//([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?)",
181  std::regex::extended);
182  if (!std::regex_match(url, sm, url_regex)) {
183  throw std::runtime_error(std::string("malformed url: ") + url);
184  }
185 
186  // sm is only an iterator over local 'url'
187  // so have to copy out matched parts
188  for (size_t i = 0; i < sm.size(); i++) {
189  url_parts[i] = sm[i].str();
190  }
191  }
192 
193  const std::string url_part(const int i) {
194  return url_parts[i];
195  }
196 
197  std::string entryName() {
198  return std::string(archive_entry_pathname(entry));
199  }
200 
201  protected:
202  std::string url;
203  std::map<int, std::string> url_parts;
204  archive* ar = 0;
205  archive_entry* entry;
207 };
208 
209 #endif /* ARCHIVE_ARCHIVE_H_ */
virtual bool read_data_block(const void **buff, size_t *size, int64_t *offset)
Definition: Archive.h:110
static ptrdiff_t read(struct archive *a, void *client_data, const void **buff)
Definition: Archive.h:152
static int open(struct archive *a, void *client_data)
Definition: Archive.h:156
std::string entryName()
Definition: Archive.h:197
virtual ~Archive()
Definition: Archive.h:83
std::string to_string(char const *&&v)
constexpr double a
Definition: Utm.h:32
archive * ar
Definition: Archive.h:204
virtual ptrdiff_t read(const void **buff)
Definition: Archive.h:142
std::map< int, std::string > url_parts
Definition: Archive.h:203
archive_entry * entry
Definition: Archive.h:205
virtual int64_t get_position_compressed() const
Definition: Archive.h:121
virtual int close()
Definition: Archive.h:139
const std::string url_part(const int i)
Definition: Archive.h:193
virtual void init_for_read()
Definition: Archive.h:146
virtual int open()
Definition: Archive.h:136
std::string url
Definition: Archive.h:202
virtual bool read_next_header()
Definition: Archive.h:99
virtual std::string archive_error(int err)
Definition: Archive.h:93
bool plain_text
Definition: Archive.h:206
static int close(struct archive *a, void *client_data)
Definition: Archive.h:160
static void parse_url(const std::string url, std::map< int, std::string > &url_parts)
Definition: Archive.h:164
Archive(const std::string url, const bool plain_text)
Definition: Archive.h:34