OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Archive.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ARCHIVE_ARCHIVE_H_
18 #define ARCHIVE_ARCHIVE_H_
19 
20 #include <archive.h>
21 #include <archive_entry.h>
22 #include <map>
23 #include <regex>
24 #include <string>
25 
26 // archive_entry.h includes windows.h
28 
29 // This is the base class from which all archives that represent file sources
30 // hosted on native/network filesystems, AWS S3, HDFS, HTTP URL, FTP URL, ...
31 // etc. are derived.
32 class Archive {
33  public:
34  Archive(const std::string url, const bool plain_text)
35  : url(url), plain_text(plain_text) {
36  parse_url(url, url_parts);
37 
38  if (0 == (ar = archive_read_new())) {
39  throw std::runtime_error(std::string("archive_read_new failed!"));
40  }
41 
46 //#define LIBARCHIVE_ENABLE_ALL
47 #ifdef LIBARCHIVE_ENABLE_ALL
48  // this increases ~800kb code size
49  archive_read_support_format_all(ar);
50  archive_read_support_filter_all(ar);
51  archive_read_support_format_raw(ar);
52 #else
53  // list supported formats to bypass the mtree exception
54  archive_read_support_format_ar(ar);
55  archive_read_support_format_cpio(ar);
56  archive_read_support_format_empty(ar);
57  archive_read_support_format_lha(ar);
58  archive_read_support_format_tar(ar);
59  archive_read_support_format_xar(ar);
60  archive_read_support_format_7zip(ar);
61  archive_read_support_format_cab(ar);
62  archive_read_support_format_rar(ar);
63  archive_read_support_format_iso9660(ar);
64  archive_read_support_format_zip(ar);
65 
66  archive_read_support_filter_bzip2(ar);
67  archive_read_support_filter_compress(ar);
68  archive_read_support_filter_gzip(ar);
69  archive_read_support_filter_lzip(ar);
70  archive_read_support_filter_lzma(ar);
71  archive_read_support_filter_xz(ar);
72  archive_read_support_filter_uu(ar);
73  archive_read_support_filter_rpm(ar);
74  archive_read_support_filter_lrzip(ar);
75  archive_read_support_filter_lzop(ar);
76  archive_read_support_filter_grzip(ar);
77 #endif
78  // libarchive assumes archive formats, so without this bzip2 and gzip won't work!
79  // see related issue at https://github.com/libarchive/libarchive/issues/586
80  archive_read_support_format_raw(ar);
81  }
82 
83  virtual ~Archive() {
84  if (ar) {
85  archive_read_close(ar);
86  }
87  if (ar) {
88  archive_read_free(ar);
89  }
90  ar = 0;
91  }
92 
93  virtual std::string archive_error(int err) {
94  auto cstr = archive_error_string(ar);
95  return std::string("libarchive error: ") +
96  (cstr ? std::string(cstr) : std::to_string(err));
97  }
98 
99  virtual bool read_next_header() {
100  int rc;
101  switch (rc = archive_read_next_header(ar, &entry)) {
102  case ARCHIVE_EOF:
103  return false; // signal caller end of stream
104  case ARCHIVE_OK:
105  return true;
106  }
107  throw std::runtime_error(archive_error(rc));
108  }
109 
110  virtual bool read_data_block(const void** buff, size_t* size, int64_t* offset) {
111  int rc;
112  switch (rc = archive_read_data_block(ar, buff, size, offset)) {
113  case ARCHIVE_EOF:
114  return false; // signal caller end of stream
115  case ARCHIVE_OK:
116  return true;
117  }
118  throw std::runtime_error(archive_error(rc));
119  }
120 
121  virtual int64_t get_position_compressed() const { return archive_filter_bytes(ar, -1); }
122 
123  /* !!!
124  7z files can't work with streaming model. Only local 7z files work.
125  That is, use archive_read_open_filename for 7z and any posix file.
126 
127  Any non-7z data source in generic url format has two options:
128  1) stream data to a local named pipe file
129  2) a) specify url in COPY FROM
130  b) define url-specific Archive class
131  c) customize init_for_read() which uses archive_read_open
132 
133  */
134  virtual int open() { return ARCHIVE_OK; } // nop
135  virtual int close() { return ARCHIVE_OK; } // nop
136  virtual ptrdiff_t read(const void** buff) { return 0; } // nop
137 
138  virtual void init_for_read() {
139  // set libarchive callbacks
140  archive_read_open(ar, this, Archive::open, Archive::read, Archive::close);
141  }
142 
143  // these methods are callback for libarchive
144  static ptrdiff_t read(struct archive* a, void* client_data, const void** buff) {
145  return ((Archive*)client_data)->read(buff);
146  }
147 
148  static int open(struct archive* a, void* client_data) {
149  return ((Archive*)client_data)->open();
150  }
151 
152  static int close(struct archive* a, void* client_data) {
153  return ((Archive*)client_data)->close();
154  }
155 
// Splits `url` into its generic URI components and stores them in `url_parts`,
// keyed by capture-group index.
//
// Example, for http://localhost.com/path?hue=br#cool the map receives:
//   0: http://localhost.com/path?hue=br#cool
//   1: http:
//   2: http
//   3: //localhost.com
//   4: localhost.com
//   5: /path
//   6: ?hue=br
//   7: hue=br
//   8: #cool
//   9: cool
// Components that are absent from `url` are stored as empty strings. Entries
// already present under the same keys are overwritten.
//
// Throws std::runtime_error("malformed url: ...") when `url` does not match.
static void parse_url(const std::string url, std::map<int, std::string>& url_parts) {
  // Compiled once and shared by all calls: building a std::regex is far more
  // expensive than matching with it, and the pattern never changes.
  static const std::regex url_regex(
      R"(^(([^:\/?#]+):)?(//([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?)",
      std::regex::extended);

  std::smatch sm;
  if (!std::regex_match(url, sm, url_regex)) {
    throw std::runtime_error(std::string("malformed url: ") + url);
  }

  // The sub-matches only reference the local `url`, so each matched part has
  // to be copied out before returning.
  for (size_t i = 0; i < sm.size(); ++i) {
    url_parts[static_cast<int>(i)] = sm[i].str();
  }
}
184 
185  const std::string url_part(const int i) { return url_parts[i]; }
186 
187  std::string entryName() { return std::string(archive_entry_pathname(entry)); }
188 
189  protected:
190  std::string url;
191  std::map<int, std::string> url_parts;
192  archive* ar = 0;
193  archive_entry* entry;
195 };
196 
197 #endif /* ARCHIVE_ARCHIVE_H_ */
virtual bool read_data_block(const void **buff, size_t *size, int64_t *offset)
Definition: Archive.h:110
static ptrdiff_t read(struct archive *a, void *client_data, const void **buff)
Definition: Archive.h:144
static int open(struct archive *a, void *client_data)
Definition: Archive.h:148
std::string entryName()
Definition: Archive.h:187
virtual ~Archive()
Definition: Archive.h:83
std::string to_string(char const *&&v)
constexpr double a
Definition: Utm.h:32
archive * ar
Definition: Archive.h:192
virtual ptrdiff_t read(const void **buff)
Definition: Archive.h:136
std::map< int, std::string > url_parts
Definition: Archive.h:191
archive_entry * entry
Definition: Archive.h:193
virtual int64_t get_position_compressed() const
Definition: Archive.h:121
virtual int close()
Definition: Archive.h:135
const std::string url_part(const int i)
Definition: Archive.h:185
virtual void init_for_read()
Definition: Archive.h:138
virtual int open()
Definition: Archive.h:134
std::string url
Definition: Archive.h:190
virtual bool read_next_header()
Definition: Archive.h:99
virtual std::string archive_error(int err)
Definition: Archive.h:93
bool plain_text
Definition: Archive.h:194
static int close(struct archive *a, void *client_data)
Definition: Archive.h:152
static void parse_url(const std::string url, std::map< int, std::string > &url_parts)
Definition: Archive.h:156
Archive(const std::string url, const bool plain_text)
Definition: Archive.h:34