OmniSciDB  471d68cefb
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Archive.h
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ARCHIVE_ARCHIVE_H_
18 #define ARCHIVE_ARCHIVE_H_
19 
20 #include <regex>
21 #include <string>
22 
23 #if defined(_WIN32) && !defined(WIN32_LEAN_AND_MEAN)
24 // One of these archive includes on win32 includes Windows.h
25 // and we need to clean up macros such as ERROR and GetObject
26 // For some compilation paths the lean and mean must
27 // also be set, or conflicts can arise with ws2def.h
28 // (DelimitedParserUtils.cpp has this issue when building initdb)
29 #define WIN32_LEAN_AND_MEAN
30 #endif
31 
32 #include <archive.h>
33 #include <archive_entry.h>
34 
35 #if defined(_WIN32) && defined(WIN32_LEAN_AND_MEAN)
36 #undef WIN32_LEAN_AND_MEAN
38 #endif
39 
40 // this is the base class from which all archives that represent files sources
41 // hosted on native/netwrok filesystems, AWS S3, HDFS, HTTP URL, FTP URL, ...
42 // etc are derived.
43 class Archive {
44  public:
45  Archive(const std::string url, const bool plain_text)
46  : url(url), plain_text(plain_text) {
47  parse_url(url, url_parts);
48 
49  if (0 == (ar = archive_read_new())) {
50  throw std::runtime_error(std::string("archive_read_new failed!"));
51  }
52 
57 //#define LIBARCHIVE_ENABLE_ALL
58 #ifdef LIBARCHIVE_ENABLE_ALL
59  // this increases ~800kb code size
60  archive_read_support_format_all(ar);
61  archive_read_support_filter_all(ar);
62  archive_read_support_format_raw(ar);
63 #else
64  // list supported formats to bypass the mtree exception
65  archive_read_support_format_ar(ar);
66  archive_read_support_format_cpio(ar);
67  archive_read_support_format_empty(ar);
68  archive_read_support_format_lha(ar);
69  archive_read_support_format_tar(ar);
70  archive_read_support_format_xar(ar);
71  archive_read_support_format_7zip(ar);
72  archive_read_support_format_cab(ar);
73  archive_read_support_format_rar(ar);
74  archive_read_support_format_iso9660(ar);
75  archive_read_support_format_zip(ar);
76 
77  archive_read_support_filter_bzip2(ar);
78  archive_read_support_filter_compress(ar);
79  archive_read_support_filter_gzip(ar);
80  archive_read_support_filter_lzip(ar);
81  archive_read_support_filter_lzma(ar);
82  archive_read_support_filter_xz(ar);
83  archive_read_support_filter_uu(ar);
84  archive_read_support_filter_rpm(ar);
85  archive_read_support_filter_lrzip(ar);
86  archive_read_support_filter_lzop(ar);
87  archive_read_support_filter_grzip(ar);
88 #endif
89  // libarchive assumes archive formats, so without this bzip2 and gzip won't work!
90  // see related issue at https://github.com/libarchive/libarchive/issues/586
91  archive_read_support_format_raw(ar);
92  }
93 
94  virtual ~Archive() {
95  if (ar) {
96  archive_read_close(ar);
97  }
98  if (ar) {
99  archive_read_free(ar);
100  }
101  ar = 0;
102  }
103 
104  virtual std::string archive_error(int err) {
105  auto cstr = archive_error_string(ar);
106  return std::string("libarchive error: ") +
107  (cstr ? std::string(cstr) : std::to_string(err));
108  }
109 
110  virtual bool read_next_header() {
111  int rc;
112  switch (rc = archive_read_next_header(ar, &entry)) {
113  case ARCHIVE_EOF:
114  return false; // signal caller end of stream
115  case ARCHIVE_OK:
116  return true;
117  }
118  throw std::runtime_error(archive_error(rc));
119  }
120 
121  virtual bool read_data_block(const void** buff, size_t* size, int64_t* offset) {
122  int rc;
123  switch (rc = archive_read_data_block(ar, buff, size, offset)) {
124  case ARCHIVE_EOF:
125  return false; // signal caller end of stream
126  case ARCHIVE_OK:
127  return true;
128  }
129  throw std::runtime_error(archive_error(rc));
130  }
131 
132  virtual int64_t get_position_compressed() const { return archive_filter_bytes(ar, -1); }
133 
134  /* !!!
135  7z files can't work with streaming model. Only local 7z files work.
136  That is, use archive_read_open_filename for 7z and any posix file.
137 
138  Any non-7z data source in generic url format has two options:
139  1) stream data to a local named pipe file
140  2) a) specify url in COPY FROM
141  b) define url-specific Archive class
142  c) customize init_for_read() which uses archive_read_open
143 
144  */
145  virtual int open() { return ARCHIVE_OK; } // nop
146  virtual int close() { return ARCHIVE_OK; } // nop
147  virtual ptrdiff_t read(const void** buff) { return 0; } // nop
148 
149  virtual void init_for_read() {
150  // set libarchive callbacks
151  archive_read_open(ar, this, Archive::open, Archive::read, Archive::close);
152  }
153 
154  // these methods are callback for libarchive
155  static ptrdiff_t read(struct archive* a, void* client_data, const void** buff) {
156  return ((Archive*)client_data)->read(buff);
157  }
158 
159  static int open(struct archive* a, void* client_data) {
160  return ((Archive*)client_data)->open();
161  }
162 
163  static int close(struct archive* a, void* client_data) {
164  return ((Archive*)client_data)->close();
165  }
166 
167  static void parse_url(const std::string url, std::map<int, std::string>& url_parts) {
168  /*
169  input example: http://localhost.com/path\?hue\=br\#cool
170  output should be:
171  0: http://localhost.com/path?hue=br#cool
172  1: http:
173  2: http
174  3: //localhost.com
175  4: localhost.com
176  5: /path
177  6: ?hue=br
178  7: hue=br
179  8: #cool
180  9: cool
181  */
182  std::smatch sm;
183  std::regex url_regex(R"(^(([^:\/?#]+):)?(//([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?)",
184  std::regex::extended);
185  if (!std::regex_match(url, sm, url_regex)) {
186  throw std::runtime_error(std::string("malformed url: ") + url);
187  }
188 
189  // sm is only an iterator over local 'url'
190  // so have to copy out matched parts
191  for (size_t i = 0; i < sm.size(); i++) {
192  url_parts[i] = sm[i].str();
193  }
194  }
195 
196  const std::string url_part(const int i) { return url_parts[i]; }
197 
198  std::string entryName() { return std::string(archive_entry_pathname(entry)); }
199 
200  protected:
201  std::string url;
202  std::map<int, std::string> url_parts;
203  archive* ar = 0;
204  archive_entry* entry;
206 };
207 
208 #endif /* ARCHIVE_ARCHIVE_H_ */
virtual bool read_data_block(const void **buff, size_t *size, int64_t *offset)
Definition: Archive.h:121
static ptrdiff_t read(struct archive *a, void *client_data, const void **buff)
Definition: Archive.h:155
static int open(struct archive *a, void *client_data)
Definition: Archive.h:159
std::string entryName()
Definition: Archive.h:198
virtual ~Archive()
Definition: Archive.h:94
std::string to_string(char const *&&v)
constexpr double a
Definition: Utm.h:38
archive * ar
Definition: Archive.h:203
virtual ptrdiff_t read(const void **buff)
Definition: Archive.h:147
std::map< int, std::string > url_parts
Definition: Archive.h:202
archive_entry * entry
Definition: Archive.h:204
virtual int64_t get_position_compressed() const
Definition: Archive.h:132
virtual int close()
Definition: Archive.h:146
const std::string url_part(const int i)
Definition: Archive.h:196
virtual void init_for_read()
Definition: Archive.h:149
virtual int open()
Definition: Archive.h:145
std::string url
Definition: Archive.h:201
virtual bool read_next_header()
Definition: Archive.h:110
virtual std::string archive_error(int err)
Definition: Archive.h:104
bool plain_text
Definition: Archive.h:205
static int close(struct archive *a, void *client_data)
Definition: Archive.h:163
static void parse_url(const std::string url, std::map< int, std::string > &url_parts)
Definition: Archive.h:167
Archive(const std::string url, const bool plain_text)
Definition: Archive.h:45