OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GeospatialEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "DataMgr/Chunk/Chunk.h"
20 #include "Geospatial/Compression.h"
21 #include "Geospatial/Types.h"
22 
27 #include "ImportExport/Importer.h"
29 
30 namespace foreign_storage {
31 
32 template <typename T>
33 inline ArrayDatum encode_as_array_datum(const std::vector<T>& data) {
34  const size_t num_bytes = data.size() * sizeof(T);
35  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
36  memcpy(buffer.get(), data.data(), num_bytes);
37  return ArrayDatum(num_bytes, buffer, false);
38 }
39 
41  public:
42  virtual ~GeospatialEncoder() = default;
43 
44  GeospatialEncoder(const RenderGroupAnalyzerMap* render_group_analyzer_map)
45  : render_group_analyzer_map_{render_group_analyzer_map} {}
46 
47  GeospatialEncoder(std::list<Chunk_NS::Chunk>& chunks,
48  const RenderGroupAnalyzerMap* render_group_analyzer_map)
49  : geo_column_descriptor_(chunks.begin()->getColumnDesc())
50  , base_column_encoder_(nullptr)
51  , coords_column_encoder_(nullptr)
52  , bounds_column_encoder_(nullptr)
56  , base_column_metadata_(nullptr)
57  , coords_column_metadata_(nullptr)
58  , bounds_column_metadata_(nullptr)
62  , render_group_analyzer_map_{render_group_analyzer_map} {
64  validateChunksSizing(chunks);
65  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
66 
67  // initialize coords column
68  coords_column_descriptor_ = getColumnDescriptor(chunks, geo_column_type, COORDS);
69 
70  // initialize bounds column
71  if (hasBoundsColumn()) {
72  bounds_column_descriptor_ = getColumnDescriptor(chunks, geo_column_type, BOUNDS);
73  }
74 
75  // initialize ring sizes column & render group column
78  getColumnDescriptor(chunks, geo_column_type, RING_OR_LINE_SIZES);
79  }
80  if (hasRenderGroupColumn()) {
82  getColumnDescriptor(chunks, geo_column_type, RENDER_GROUP);
83  }
84 
85  // initialize poly rings column
86  if (hasPolyRingsColumn()) {
88  getColumnDescriptor(chunks, geo_column_type, POLY_RINGS);
89  }
90  }
91 
92  GeospatialEncoder(std::list<Chunk_NS::Chunk>& chunks,
93  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata,
94  const RenderGroupAnalyzerMap* render_group_analyzer_map)
95  : geo_column_descriptor_(chunks.begin()->getColumnDesc())
96  , base_column_encoder_(nullptr)
97  , coords_column_encoder_(nullptr)
98  , bounds_column_encoder_(nullptr)
100  , poly_rings_column_encoder_(nullptr)
102  , base_column_metadata_(nullptr)
103  , coords_column_metadata_(nullptr)
104  , bounds_column_metadata_(nullptr)
106  , poly_rings_column_metadata_(nullptr)
108  , render_group_analyzer_map_{render_group_analyzer_map} {
110 
111  validateChunksSizing(chunks);
112  validateMetadataSizing(chunk_metadata);
113 
114  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
115 
116  // initialize base column encoder
117  auto base_chunk = chunks.begin();
118  base_chunk->initEncoder();
120  dynamic_cast<StringNoneEncoder*>(base_chunk->getBuffer()->getEncoder());
121  base_column_metadata_ = chunk_metadata.begin()->get();
123 
124  // initialize coords column
127  chunks, chunk_metadata, geo_column_type, COORDS);
128 
129  // initialize bounds column
130  if (hasBoundsColumn()) {
131  std::tie(
134  chunks, chunk_metadata, geo_column_type, BOUNDS);
135  }
136 
137  // initialize ring sizes column & render group column
138  if (hasRingOrLineSizesColumn()) {
143  chunks, chunk_metadata, geo_column_type, RING_OR_LINE_SIZES);
144  }
145  if (hasRenderGroupColumn()) {
150  chunks, chunk_metadata, geo_column_type, RENDER_GROUP);
151  }
152 
153  // initialize poly rings column
154  if (hasPolyRingsColumn()) {
159  chunks, chunk_metadata, geo_column_type, POLY_RINGS);
160  }
161  }
162 
163  protected:
164  void appendBaseAndRenderGroupDataAndUpdateMetadata(const int64_t row_count) {
165  base_values_.resize(row_count);
167  *base_column_encoder_->appendData(&base_values_, 0, row_count);
168  if (hasRenderGroupColumn()) {
169  CHECK_EQ(render_group_value_buffer_.size(), static_cast<size_t>(row_count))
170  << "Render Group Values not generated correctly!";
171  auto data_ptr = reinterpret_cast<int8_t*>(render_group_value_buffer_.data());
173  data_ptr, row_count, render_group_column_descriptor_->columnType);
174  }
175  }
176 
177  void validateChunksSizing(std::list<Chunk_NS::Chunk>& chunks) const {
178  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
179  if (geo_column_type == kPOINT) {
180  CHECK(chunks.size() == 2);
181  } else if (geo_column_type == kLINESTRING || geo_column_type == kMULTIPOINT) {
182  CHECK(chunks.size() == 3);
183  } else if (geo_column_type == kMULTILINESTRING) {
184  CHECK(chunks.size() == 4);
185  } else if (geo_column_type == kPOLYGON) {
186  CHECK(chunks.size() == 5);
187  } else if (geo_column_type == kMULTIPOLYGON) {
188  CHECK(chunks.size() == 6);
189  }
190  }
191 
193  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata) const {
194  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
195  if (geo_column_type == kPOINT) {
196  CHECK(chunk_metadata.size() == 2);
197  } else if (geo_column_type == kLINESTRING || geo_column_type == kMULTIPOINT) {
198  CHECK(chunk_metadata.size() == 3);
199  } else if (geo_column_type == kMULTILINESTRING) {
200  CHECK(chunk_metadata.size() == 4);
201  } else if (geo_column_type == kPOLYGON) {
202  CHECK(chunk_metadata.size() == 5);
203  } else if (geo_column_type == kMULTIPOLYGON) {
204  CHECK(chunk_metadata.size() == 6);
205  }
206  }
207 
219  }
220 
222  const std::vector<ArrayDatum>& datum_parse_buffer,
223  Encoder* encoder,
224  ChunkMetadata* chunk_metadata) const {
225  if (!encoder) {
226  CHECK(!chunk_metadata);
227  return;
228  }
229  if (auto fixed_len_array_encoder =
230  dynamic_cast<FixedLengthArrayNoneEncoder*>(encoder)) {
231  auto new_chunk_metadata = fixed_len_array_encoder->appendData(
232  &datum_parse_buffer, 0, datum_parse_buffer.size());
233  *chunk_metadata = *new_chunk_metadata;
234  } else if (auto array_encoder = dynamic_cast<ArrayNoneEncoder*>(encoder)) {
235  auto new_chunk_metadata = array_encoder->appendData(
236  &datum_parse_buffer, 0, datum_parse_buffer.size(), false);
237  *chunk_metadata = *new_chunk_metadata;
238  } else {
239  UNREACHABLE();
240  }
241  }
242 
243  void processGeoElement(std::string_view geo_string_view) {
245  if (!Geospatial::GeoTypesFactory::getGeoColumns(std::string(geo_string_view),
246  import_ti,
253  }
254 
255  // validate types
256  if (geo_column_descriptor_->columnType.get_type() != import_ti.get_type()) {
258  !(import_ti.get_type() == SQLTypes::kPOLYGON &&
261  }
262  }
263 
264  // append coords
265  std::vector<uint8_t> compressed_coords = Geospatial::compress_coords(
267  coords_datum_buffer_.emplace_back(encode_as_array_datum(compressed_coords));
268 
269  // append bounds
270  if (hasBoundsColumn()) {
272  }
273 
274  // append ring sizes
275  if (hasRingOrLineSizesColumn()) {
278  }
279 
280  // append poly rings
281  if (hasPolyRingsColumn()) {
282  poly_rings_datum_buffer_.emplace_back(
284  }
285 
286  if (hasRenderGroupColumn()) {
287  if (IS_GEO_POLY(import_ti.get_type()) && render_group_analyzer_map_ &&
288  render_group_analyzer_map_->size()) {
289  CHECK_EQ(bounds_parse_buffer_.size(), 4u);
290  auto const itr =
292  int render_group{0};
293  if (itr != render_group_analyzer_map_->end()) {
294  auto& render_group_analyzer = *itr->second;
295  render_group = render_group_analyzer.insertBoundsAndReturnRenderGroup(
297  }
298  render_group_value_buffer_.emplace_back(render_group);
299  } else {
300  render_group_value_buffer_.emplace_back(0);
301  }
302  }
303  }
304 
313  // POINT columns are represented using fixed length arrays and need
314  // special treatment of nulls
316  std::vector<uint8_t> compressed_coords = Geospatial::compress_coords(
318  coords_datum_buffer_.emplace_back(encode_as_array_datum(compressed_coords));
319  } else {
322  }
323  if (hasBoundsColumn()) {
326  }
327  if (hasRingOrLineSizesColumn()) {
331  }
332  if (hasPolyRingsColumn()) {
333  poly_rings_datum_buffer_.emplace_back(
336  }
337  if (hasRenderGroupColumn()) {
338  static constexpr int32_t kNullRenderGroupValue = -1;
339  render_group_value_buffer_.emplace_back(kNullRenderGroupValue);
340  }
341  }
342 
344  coords_parse_buffer_.clear();
345  bounds_parse_buffer_.clear();
347  poly_rings_parse_buffer_.clear();
348  }
349 
351  coords_datum_buffer_.clear();
352  bounds_datum_buffer_.clear();
354  poly_rings_datum_buffer_.clear();
356  }
357 
359 
360  template <typename T>
361  typename std::list<T>::iterator getIteratorForGeoColumnType(
362  std::list<T>& list,
363  const SQLTypes column_type,
364  const GeoColumnType geo_column) {
365  auto list_iter = list.begin();
366  list_iter++; // skip base column
367  switch (column_type) {
368  case kPOINT: {
369  if (geo_column == COORDS) {
370  return list_iter;
371  }
372  UNREACHABLE();
373  }
374  case kMULTIPOINT:
375  case kLINESTRING: {
376  if (geo_column == COORDS) {
377  return list_iter;
378  }
379  list_iter++;
380  if (geo_column == BOUNDS) {
381  return list_iter;
382  }
383  UNREACHABLE();
384  }
385  case kMULTILINESTRING: {
386  if (geo_column == COORDS) {
387  return list_iter;
388  }
389  list_iter++;
390  if (geo_column == RING_OR_LINE_SIZES) {
391  return list_iter;
392  }
393  list_iter++;
394  if (geo_column == BOUNDS) {
395  return list_iter;
396  }
397  UNREACHABLE();
398  }
399  case kPOLYGON: {
400  if (geo_column == COORDS) {
401  return list_iter;
402  }
403  list_iter++;
404  if (geo_column == RING_OR_LINE_SIZES) {
405  return list_iter;
406  }
407  list_iter++;
408  if (geo_column == BOUNDS) {
409  return list_iter;
410  }
411  list_iter++;
412  if (geo_column == RENDER_GROUP) {
413  return list_iter;
414  }
415  UNREACHABLE();
416  }
417  case kMULTIPOLYGON: {
418  if (geo_column == COORDS) {
419  return list_iter;
420  }
421  list_iter++;
422  if (geo_column == RING_OR_LINE_SIZES) {
423  return list_iter;
424  }
425  list_iter++;
426  if (geo_column == POLY_RINGS) {
427  return list_iter;
428  }
429  list_iter++;
430  if (geo_column == BOUNDS) {
431  return list_iter;
432  }
433  list_iter++;
434  if (geo_column == RENDER_GROUP) {
435  return list_iter;
436  }
437  UNREACHABLE();
438  }
439  default:
440  UNREACHABLE();
441  }
442  return {};
443  }
444 
445  std::tuple<Encoder*, ChunkMetadata*, const ColumnDescriptor*>
447  std::list<Chunk_NS::Chunk>& chunks,
448  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata,
449  const SQLTypes sql_type,
450  GeoColumnType geo_column_type) {
451  auto chunk = getIteratorForGeoColumnType(chunks, sql_type, geo_column_type);
452  chunk->initEncoder();
453  auto encoder = chunk->getBuffer()->getEncoder();
454  auto metadata =
455  getIteratorForGeoColumnType(chunk_metadata, sql_type, geo_column_type)->get();
456  auto column_descriptor = chunk->getColumnDesc();
457  return {encoder, metadata, column_descriptor};
458  }
459 
460  const ColumnDescriptor* getColumnDescriptor(std::list<Chunk_NS::Chunk>& chunks,
461  const SQLTypes sql_type,
462  GeoColumnType geo_column_type) {
463  auto chunk = getIteratorForGeoColumnType(chunks, sql_type, geo_column_type);
464  auto column_descriptor = chunk->getColumnDesc();
465  return column_descriptor;
466  }
467 
468  static void throwMalformedGeoElement(const std::string& omnisci_column_name) {
469  std::string error_message = "Failed to extract valid geometry in HeavyDB column '" +
470  omnisci_column_name + "'.";
471  throw foreign_storage::ForeignStorageException(error_message);
472  }
473 
474  static void throwMismatchedGeoElement(const std::string& omnisci_column_name) {
476  "Imported geometry"
477  " doesn't match the geospatial type of HeavyDB column '" +
478  omnisci_column_name + "'.");
479  }
480 
481  bool hasBoundsColumn() const {
482  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
483  return geo_column_type == kMULTIPOINT || geo_column_type == kLINESTRING ||
484  geo_column_type == kMULTILINESTRING || geo_column_type == kPOLYGON ||
485  geo_column_type == kMULTIPOLYGON;
486  }
487 
489  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
490  return geo_column_type == kPOLYGON || geo_column_type == kMULTIPOLYGON ||
491  geo_column_type == kMULTILINESTRING;
492  }
493 
494  bool hasRenderGroupColumn() const {
495  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
496  return geo_column_type == kPOLYGON || geo_column_type == kMULTIPOLYGON;
497  }
498 
499  bool hasPolyRingsColumn() const {
500  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
501  return geo_column_type == kMULTIPOLYGON;
502  }
503 
505 
506  constexpr static bool PROMOTE_POLYGON_TO_MULTIPOLYGON = true;
507 
514 
521 
527 
528  std::vector<std::string> base_values_;
529 
530  // Used repeatedly in parsing geo types, declared as members to prevent
531  // deallocation/reallocation costs
532  std::vector<double> coords_parse_buffer_;
533  std::vector<double> bounds_parse_buffer_;
535  std::vector<int> poly_rings_parse_buffer_;
536 
537  // Used to buffer array appends in memory for a batch
538  std::vector<ArrayDatum> coords_datum_buffer_;
539  std::vector<ArrayDatum> bounds_datum_buffer_;
540  std::vector<ArrayDatum> ring_or_line_sizes_datum_buffer_;
541  std::vector<ArrayDatum> poly_rings_datum_buffer_;
542  std::vector<int32_t> render_group_value_buffer_;
543 
545 };
546 
547 } // namespace foreign_storage
static void throwMismatchedGeoElement(const std::string &omnisci_column_name)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
static constexpr bool PROMOTE_POLYGON_TO_MULTIPOLYGON
SQLTypes
Definition: sqltypes.h:55
const ColumnDescriptor * poly_rings_column_descriptor_
const ColumnDescriptor * render_group_column_descriptor_
std::vector< ArrayDatum > coords_datum_buffer_
std::vector< std::string > base_values_
static ArrayDatum composeNullArray(const SQLTypeInfo &ti)
Definition: Importer.cpp:394
std::vector< ArrayDatum > ring_or_line_sizes_datum_buffer_
std::vector< int > ring_or_line_sizes_parse_buffer_
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define UNREACHABLE()
Definition: Logger.h:337
void validateMetadataSizing(std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata) const
static void getNullGeoColumns(SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1309
std::vector< ArrayDatum > bounds_datum_buffer_
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:381
void appendToArrayEncoderAndUpdateMetadata(const std::vector< ArrayDatum > &datum_parse_buffer, Encoder *encoder, ChunkMetadata *chunk_metadata) const
static void throwMalformedGeoElement(const std::string &omnisci_column_name)
std::vector< int32_t > render_group_value_buffer_
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:219
std::vector< uint8_t > compress_coords(const std::vector< double > &coords, const SQLTypeInfo &ti)
Definition: Compression.cpp:52
GeospatialEncoder(const RenderGroupAnalyzerMap *render_group_analyzer_map)
GeospatialEncoder(std::list< Chunk_NS::Chunk > &chunks, const RenderGroupAnalyzerMap *render_group_analyzer_map)
void validateChunksSizing(std::list< Chunk_NS::Chunk > &chunks) const
specifies the content in-memory of a row in the column metadata table
void processGeoElement(std::string_view geo_string_view)
const ColumnDescriptor * getColumnDescriptor(std::list< Chunk_NS::Chunk > &chunks, const SQLTypes sql_type, GeoColumnType geo_column_type)
const RenderGroupAnalyzerMap * render_group_analyzer_map_
ChunkMetadata * ring_or_line_sizes_column_metadata_
const ColumnDescriptor * coords_column_descriptor_
static bool getGeoColumns(const std::string &wkt_or_wkb_hex, SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool promote_poly_to_mpoly=false)
Definition: Types.cpp:1079
std::vector< double > bounds_parse_buffer_
std::vector< double > coords_parse_buffer_
unencoded fixed length array encoder
void appendBaseAndRenderGroupDataAndUpdateMetadata(const int64_t row_count)
ArrayDatum encode_as_array_datum(const std::vector< T > &data)
#define CHECK(condition)
Definition: Logger.h:291
bool is_geometry() const
Definition: sqltypes.h:592
For unencoded strings.
const ColumnDescriptor * geo_column_descriptor_
const ColumnDescriptor * ring_or_line_sizes_column_descriptor_
SQLTypeInfo columnType
std::map< int, std::unique_ptr< import_export::RenderGroupAnalyzer >> RenderGroupAnalyzerMap
std::tuple< Encoder *, ChunkMetadata *, const ColumnDescriptor * > initEncoderAndGetEncoderAndMetadata(std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const SQLTypes sql_type, GeoColumnType geo_column_type)
unencoded array encoder
std::vector< ArrayDatum > poly_rings_datum_buffer_
std::string columnName
GeospatialEncoder(std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const RenderGroupAnalyzerMap *render_group_analyzer_map)
std::list< T >::iterator getIteratorForGeoColumnType(std::list< T > &list, const SQLTypes column_type, const GeoColumnType geo_column)
virtual ~GeospatialEncoder()=default
const ColumnDescriptor * bounds_column_descriptor_
virtual std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1)=0
#define IS_GEO_POLY(T)
Definition: sqltypes.h:305