OmniSciDB  04ee39c94c
Archive.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ARCHIVE_ARCHIVE_H_
18 #define ARCHIVE_ARCHIVE_H_
19 
20 #include <regex>
21 #include <string>
22 
23 #include <archive.h>
24 #include <archive_entry.h>
25 
26 // this is the base class from which all archives that represent files sources
27 // hosted on native/netwrok filesystems, AWS S3, HDFS, HTTP URL, FTP URL, ...
28 // etc are derived.
29 class Archive {
30  public:
31  Archive(const std::string url, const bool plain_text)
32  : url(url), plain_text(plain_text) {
33  parse_url(url, url_parts);
34 
35  if (0 == (ar = archive_read_new())) {
36  throw std::runtime_error(std::string("archive_read_new failed!"));
37  }
38 
43 //#define LIBARCHIVE_ENABLE_ALL
44 #ifdef LIBARCHIVE_ENABLE_ALL
45  // this increases ~800kb code size
46  archive_read_support_format_all(ar);
47  archive_read_support_filter_all(ar);
48  archive_read_support_format_raw(ar);
49 #else
50  // list supported formats to bypass the mtree exception
51  archive_read_support_format_ar(ar);
52  archive_read_support_format_cpio(ar);
53  archive_read_support_format_empty(ar);
54  archive_read_support_format_lha(ar);
55  archive_read_support_format_tar(ar);
56  archive_read_support_format_xar(ar);
57  archive_read_support_format_7zip(ar);
58  archive_read_support_format_cab(ar);
59  archive_read_support_format_rar(ar);
60  archive_read_support_format_iso9660(ar);
61  archive_read_support_format_zip(ar);
62 
63  archive_read_support_filter_bzip2(ar);
64  archive_read_support_filter_compress(ar);
65  archive_read_support_filter_gzip(ar);
66  archive_read_support_filter_lzip(ar);
67  archive_read_support_filter_lzma(ar);
68  archive_read_support_filter_xz(ar);
69  archive_read_support_filter_uu(ar);
70  archive_read_support_filter_rpm(ar);
71  archive_read_support_filter_lrzip(ar);
72  archive_read_support_filter_lzop(ar);
73  archive_read_support_filter_grzip(ar);
74 #endif
75  // libarchive assumes archive formats, so without this bzip2 and gzip won't work!
76  // see related issue at https://github.com/libarchive/libarchive/issues/586
77  archive_read_support_format_raw(ar);
78  }
79 
80  virtual ~Archive() {
81  if (ar) {
82  archive_read_close(ar);
83  }
84  if (ar) {
85  archive_read_free(ar);
86  }
87  ar = 0;
88  }
89 
90  virtual std::string archive_error(int err) {
91  auto cstr = archive_error_string(ar);
92  return std::string("libarchive error: ") +
93  (cstr ? std::string(cstr) : std::to_string(err));
94  }
95 
96  virtual bool read_next_header() {
97  int rc;
98  archive_entry* entry;
99  switch (rc = archive_read_next_header(ar, &entry)) {
100  case ARCHIVE_EOF:
101  return false; // signal caller end of stream
102  case ARCHIVE_OK:
103  return true;
104  }
105  throw std::runtime_error(archive_error(rc));
106  }
107 
108  virtual bool read_data_block(const void** buff, size_t* size, int64_t* offset) {
109  int rc;
110  switch (rc = archive_read_data_block(ar, buff, size, offset)) {
111  case ARCHIVE_EOF:
112  return false; // signal caller end of stream
113  case ARCHIVE_OK:
114  return true;
115  }
116  throw std::runtime_error(archive_error(rc));
117  }
118 
119  virtual int64_t get_position_compressed() const { return archive_filter_bytes(ar, -1); }
120 
121  /* !!!
122  7z files can't work with streaming model. Only local 7z files work.
123  That is, use archive_read_open_filename for 7z and any posix file.
124 
125  Any non-7z data source in generic url format has two options:
126  1) stream data to a local named pipe file
127  2) a) specify url in COPY FROM
128  b) define url-specific Archive class
129  c) customize init_for_read() which uses archive_read_open
130 
131  */
132  virtual int open() { return ARCHIVE_OK; } // nop
133  virtual int close() { return ARCHIVE_OK; } // nop
134  virtual ssize_t read(const void** buff) { return 0; } // nop
135 
136  virtual void init_for_read() {
137  // set libarchive callbacks
138  archive_read_open(ar, this, Archive::open, Archive::read, Archive::close);
139  }
140 
141  // these methods are callback for libarchive
142  static ssize_t read(struct archive* a, void* client_data, const void** buff) {
143  return ((Archive*)client_data)->read(buff);
144  }
145 
146  static int open(struct archive* a, void* client_data) {
147  return ((Archive*)client_data)->open();
148  }
149 
150  static int close(struct archive* a, void* client_data) {
151  return ((Archive*)client_data)->close();
152  }
153 
154  static void parse_url(const std::string url, std::map<int, std::string>& url_parts) {
155  /*
156  input example: http://localhost.com/path\?hue\=br\#cool
157  output should be:
158  0: http://localhost.com/path?hue=br#cool
159  1: http:
160  2: http
161  3: //localhost.com
162  4: localhost.com
163  5: /path
164  6: ?hue=br
165  7: hue=br
166  8: #cool
167  9: cool
168  */
169  std::smatch sm;
170  std::regex url_regex(R"(^(([^:\/?#]+):)?(//([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?)",
171  std::regex::extended);
172  if (!std::regex_match(url, sm, url_regex)) {
173  throw std::runtime_error(std::string("malformed url: ") + url);
174  }
175 
176  // sm is only an iterator over local 'url'
177  // so have to copy out matched parts
178  for (size_t i = 0; i < sm.size(); i++) {
179  url_parts[i] = sm[i].str();
180  }
181  }
182 
183  const std::string url_part(const int i) { return url_parts[i]; }
184 
185  protected:
186  std::string url;
187  std::map<int, std::string> url_parts;
188  archive* ar = 0;
190 };
191 
192 #endif /* ARCHIVE_ARCHIVE_H_ */
virtual ssize_t read(const void **buff)
Definition: Archive.h:134
virtual bool read_data_block(const void **buff, size_t *size, int64_t *offset)
Definition: Archive.h:108
static int open(struct archive *a, void *client_data)
Definition: Archive.h:146
virtual int64_t get_position_compressed() const
Definition: Archive.h:119
virtual ~Archive()
Definition: Archive.h:80
std::string to_string(char const *&&v)
archive * ar
Definition: Archive.h:188
std::map< int, std::string > url_parts
Definition: Archive.h:187
virtual int close()
Definition: Archive.h:133
const std::string url_part(const int i)
Definition: Archive.h:183
virtual void init_for_read()
Definition: Archive.h:136
virtual int open()
Definition: Archive.h:132
std::string url
Definition: Archive.h:186
virtual bool read_next_header()
Definition: Archive.h:96
virtual std::string archive_error(int err)
Definition: Archive.h:90
bool plain_text
Definition: Archive.h:189
static int close(struct archive *a, void *client_data)
Definition: Archive.h:150
static void parse_url(const std::string url, std::map< int, std::string > &url_parts)
Definition: Archive.h:154
static ssize_t read(struct archive *a, void *client_data, const void **buff)
Definition: Archive.h:142
Archive(const std::string url, const bool plain_text)
Definition: Archive.h:31