hdfdataset.hh
1 
10 #ifndef UTOPIA_DATAIO_HDFDATASET_HH
11 #define UTOPIA_DATAIO_HDFDATASET_HH
12 
13 #include <numeric>
14 #include <stdexcept>
15 #include <unordered_map>
16 #include <utility>
17 
18 #include <hdf5.h>
19 #include <hdf5_hl.h>
20 
21 #include "../core/type_traits.hh"
22 
23 #include "hdfattribute.hh"
24 #include "hdfbufferfactory.hh"
25 #include "hdfchunking.hh"
26 #include "hdfdataspace.hh"
27 #include "hdfobject.hh"
28 #include "hdftype.hh"
29 #include "hdfutilities.hh"
31 
32 namespace Utopia
33 {
34 namespace DataIO
35 {
52 class HDFDataset final : public HDFObject<HDFCategory::dataset>
53 {
54  private:
64  template <typename Datatype> void __create_dataset__(std::size_t typesize)
65  {
66 
67  this->_log->debug("Creating dataset with typesize {} at path {} ...",
68  typesize, _path);
69  this->_log->trace("refcount before creation: {}", get_refcount());
70 
71  // create link creation property list, which creates (potentially missing) intermediate groups
72  hid_t group_plist = H5Pcreate(H5P_LINK_CREATE);
73  H5Pset_create_intermediate_group(group_plist, 1);
74 
75  _type.close();
76 
77  _type.open<Datatype>("datatype of " + _path, typesize);
78 
79  // this is something different than typesize, which has meaning for
80  // arrays only
81  if (_capacity != _current_extent)
82  {
83  if (_chunksizes.size() != _rank)
84  {
85  this->_log->debug("Computing chunksizes ...");
86  _chunksizes =
87  guess_chunksize(typesize, _current_extent, _capacity);
88  }
89  }
90 
91  hid_t plist = H5Pcreate(H5P_DATASET_CREATE);
92 
93  // distinguish by chunksize; chunked dataset needed for compression
94  if (_chunksizes.size() > 0)
95  {
96  // create creation property list, set chunksize and compress level
97 
98  this->_log->debug("Setting given chunksizes ...");
99  H5Pset_chunk(plist, _rank, _chunksizes.data());
100 
101  if (_compress_level > 0)
102  {
103  H5Pset_deflate(plist, _compress_level);
104  }
105 
106  _filespace.close();
107  // make dataspace
108  _filespace.open(_path + " file dataspace", _rank, _current_extent,
109  _capacity);
110 
111  // create dataset and return
112  this->_log->debug(
113  "Creating actual dataset and binding it to object class ...");
114 
115  bind_to(H5Dcreate(_parent_identifier.get_id(), _path.c_str(),
116  _type.get_C_id(), _filespace.get_C_id(),
117  group_plist, plist, H5P_DEFAULT),
118  &H5Dclose);
119 
120  if (not is_valid())
121  {
122  throw std::runtime_error("Invalid dataset id " + _path + " " +
123  std::to_string(__LINE__));
124  }
125  }
126  else
127  {
128 
129  // make dataspace
130  _filespace.open(_path + " file dataspace", _rank, _current_extent,
131  _capacity);
132 
133  this->_log->debug(
134  "Creating actual dataset and binding it to object class ...");
135  // can create the dataset right away
136  bind_to(H5Dcreate(_parent_identifier.get_id(), _path.c_str(),
137  _type.get_C_id(), _filespace.get_C_id(),
138  group_plist, H5P_DEFAULT, H5P_DEFAULT),
139  &H5Dclose);
140 
141  if (not is_valid())
142  {
143  throw std::runtime_error("Invalid dataset id " + _path + " " +
144  std::to_string(__LINE__));
145  }
146  }
147 
148  this->_log->debug("refcount of dataset after creation {}: {}", _path,
149  get_refcount());
150  }
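 // Note on the two creation paths above: both map onto the HDF5 C API call
 // H5Dcreate2(loc_id, name, dtype_id, space_id, lcpl_id, dcpl_id, dapl_id).
 // The link creation property list (lcpl) creates missing intermediate
 // groups on the fly; only the chunked path passes a non-default dataset
 // creation property list (dcpl), since chunking is a prerequisite both for
 // compression and for H5S_UNLIMITED capacities.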
151 
163  template <typename T> herr_t __write_container__(T &&data)
164  {
165  this->_log->debug("Writing container data to dataset {}...", _path);
166 
167  this->_log->debug("Dataset {}'s refcount at write begin: {}", _path,
168  get_refcount());
169 
170  using value_type_1 = typename Utils::remove_qualifier_t<T>::value_type;
171  using base_type = Utils::remove_qualifier_t<value_type_1>;
172 
173  // we can write directly if we have a plain vector, no nested or
174  // stringtype.
175  if constexpr (std::is_same_v<T, std::vector<value_type_1>> and
176  not Utils::is_container_v<value_type_1> and
177  not Utils::is_string_v<value_type_1>)
178  {
179  this->_log->debug("... of simple vectortype");
180 
181  // check if dataset has been created; if not, create it
182  if (not is_valid())
183  {
184  this->_log->debug("... dataset not yet existing, creating it "
185  "for simple vectortype");
186  __create_dataset__<base_type>(0);
187  }
188  else
189  {
190  this->_log->debug(
191  "... dataset existing, reading out type and writing data");
192  // check if datatypes are compatible
193 
194  HDFType temp_type;
195  temp_type.open<base_type>("testtype", 0);
196 
197  if (temp_type != _type)
198  {
199  throw std::runtime_error(
200  "Error, cannot write container data of a "
201  "different type into dataset " +
202  _path);
203  }
204  }
205  this->_log->debug("Dataset {}'s refcount before write: {}", _path,
206  get_refcount());
207 
208  return H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
209  _filespace.get_C_id(), H5P_DEFAULT, data.data());
210  }
211  // when stringtype or containertype is stored in a container, then
212  // we have to buffer. bufferfactory handles how to do this in detail
213  else
214  {
215  this->_log->debug("... of nontrivial containertype");
216 
217  std::size_t typesize = 0;
218  // check if array, if yes, get typesize, else typesize is 0 and
219  // typefactory creates vlen data or string data
220  if constexpr (Utils::is_container_v<base_type> and
221  Utils::is_array_like_v<base_type>)
222  {
223  // get_size is a metafunction defined in hdfutilities.hh
224  typesize = Utils::get_size<base_type>::value;
225  }
226 
227  if (not is_valid())
228  {
229  this->_log->debug(
230  "... dataset not yet existing, creating it for array type");
231  __create_dataset__<base_type>(typesize);
232  }
233  else
234  {
235  // check if datatypes are compatible
236  this->_log->debug("... dataset existing, reading out type");
237 
238  HDFType temp_type;
239  temp_type.open<base_type>("testtype", typesize);
240 
241  if (temp_type != _type)
242  {
243  throw std::runtime_error(
244  "Error, cannot write fixedsize container data of a "
245  "different type into dataset " +
246  _path);
247  }
248  }
249 
250  this->_log->debug(
251  "... buffering data into vectortype appropriate for writing");
252  // the reference is needed here, because the addresses of the
253  // underlying data arrays are needed.
254  auto buffer = HDFBufferFactory::buffer(
255  std::begin(data), std::end(data),
256  [](auto &value) -> value_type_1 & { return value; });
257 
258  this->_log->debug("Dataset {}'s refcount before write: {}", _path,
259  get_refcount());
260 
261  this->_log->debug("... writing data");
262  return H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
263  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
264  }
265  }
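 // A sketch of what the buffering above produces (values illustrative):
 // a std::vector<std::string> is not handed to H5Dwrite directly; instead
 // the buffer factory yields an array of pointers into the strings, e.g.
 //
 //     std::vector<std::string> data{"foo", "bar"};
 //     // buffer ~ std::vector<const char*>{data[0].c_str(), data[1].c_str()}
 //
 // which matches HDF5's in-memory representation of variable-length strings.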
266 
276  template <typename T> herr_t __write_stringtype__(T data)
277  {
278  this->_log->debug("Writing string data to dataset {}...", _path);
279 
280  // Since std::string cannot be written directly,
281  // (only const char*/char* can), a buffer pointer has been added
282  // to handle writing in a clearer way and with less code
283  std::size_t len = 0;
284  const char *buffer = nullptr;
285 
286  if constexpr (std::is_pointer_v<T>) // const char* or char* -> strlen
287  // needed
288  {
289  this->_log->debug("... stringtype is pointer-valued");
290  len = std::strlen(data);
291  buffer = data;
292  }
293  else // simple for strings
294  {
295  this->_log->debug("... stringtype is not pointer-valued");
296  len = data.size();
297  buffer = data.c_str();
298  }
299 
300  // check if dataset has been created; if not, create it
301  if (not is_valid())
302  {
303  this->_log->debug(
304  "... dataset not yet existing, creating it for stringtypee");
305  // check if datatypes are compatible
306 
307  __create_dataset__<const char *>(len);
308  }
309  else
310  {
311  this->_log->debug("... dataset existing, reading out type");
312  // check if datatypes are compatible
313  HDFType temp_type;
314  temp_type.open<const char *>("testtype", len);
315 
316  if (temp_type != _type)
317  {
318  throw std::runtime_error("Error, cannot write string data of a "
319  "different type into dataset " +
320  _path);
321  }
322  }
323 
324  this->_log->debug(" ... writing data");
325  // use that strings store data in consecutive memory
326  return H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
327  _filespace.get_C_id(), H5P_DEFAULT, buffer);
328  }
329 
341  template <typename T> herr_t __write_pointertype__(T data)
342  {
343  this->_log->debug("Writing pointer data to dataset {}...", _path);
344 
345  // remove_qualifier_t removes pointers, references, and qualifiers
346  using basetype = Utils::remove_qualifier_t<T>;
347 
348  if (not is_valid())
349  {
350  this->_log->debug(
351  "... dataset not yet existing, creating it for pointertype");
352 
353  __create_dataset__<basetype>(0);
354  }
355  else
356  {
357  // check if datatypes are compatible
358  this->_log->debug("... dataset existing, reading out type");
359 
360  HDFType temp_type;
361  temp_type.open<basetype>("testtype", 0);
362 
363  if (temp_type != _type)
364  {
365  throw std::runtime_error(
366  "Error, cannot write pointer data of a "
367  "different type into dataset " +
368  _path);
369  }
370  }
371  this->_log->debug(" ... writing data");
372 
373  return H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
374  _filespace.get_C_id(), H5P_DEFAULT, data);
375  }
376 
387  template <typename T> herr_t __write_scalartype__(T data)
388  {
389  this->_log->debug("Writing scalar data to dataset {}...", _path);
390  // because we just write a scalar, the shape basically says that
391  // the dataset is point-like: 1D and 1 entry.
392  if (not is_valid())
393  {
394  this->_log->debug(
395  "... dataset not yet existing, creating it for pointertype");
396 
397  __create_dataset__<std::decay_t<T>>(0);
398  }
399  else
400  {
401  // check if datatypes are compatible
402  this->_log->debug("... dataset existing, reading out type");
403 
404  HDFType temp_type;
405  temp_type.open<std::decay_t<T>>("testtype", 0);
406 
407  if (temp_type != _type)
408  {
409  throw std::runtime_error("Error, cannot write scalar data of a "
410  "different type into dataset " +
411  _path);
412  }
413  }
414 
415  this->_log->debug(" ... writing data");
416 
417  return H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
418  _filespace.get_C_id(), H5P_DEFAULT, &data);
419  }
420 
422  // One might want to read into a predefined buffer (e.g., for frequent
423  // reads), and thus this and the following functions expect an argument
424  // 'buffer' to store their data in. The function 'read(..)' is then
425  // overloaded to allow for automatic buffer creation or a buffer argument.
426  template <typename Type> herr_t __read_container__(Type &buffer)
427  {
428 
429  this->_log->debug("Reading container data from dataset {}...", _path);
430 
431  using value_type_1 =
432  typename Utils::remove_qualifier_t<Type>::value_type;
433 
434  // when the value_type of Type is a container again, we want nested
435  // arrays basically. Therefore we have to check if the desired type
436  // Type is suitable to hold them, read the nested data into a hvl_t
437  // container, assuming that they are varlen because this is the more
438  // general case, and then turn them into the desired type again...
439  if constexpr (Utils::is_container_v<value_type_1> ||
440  Utils::is_string_v<value_type_1>)
441  {
442  this->_log->debug(
443  "... reading nested container or container of strings ...");
444  using value_type_2 =
445  typename Utils::remove_qualifier_t<value_type_1>::value_type;
446 
447  // if we have nested containers of depth larger than 2, throw a
448  // runtime error because we cannot handle this
449  // TODO extend this to work more generally
450  if constexpr (Utils::is_container_v<value_type_2>)
451  {
452  throw std::runtime_error(
453  "Dataset " + _path +
454  ": Cannot read data into nested containers "
455  "with depth > 2!");
457  }
458  if constexpr (!std::is_same_v<std::vector<value_type_1>, Type>)
459  {
460  throw std::runtime_error("Dataset " + _path +
461  ": Can only read data"
462  " into vector containers!");
463  }
464 
465  // everything is fine.
466 
467  // check if type given in the buffer is std::array.
468  // If it is, the user knew that the data stored there
469  // has always the same length, otherwise she does not
470  // know and thus it is assumed that the data is variable
471  // length.
472  if (_type.type_category() == H5T_ARRAY)
473  {
474  this->_log->debug("... nested type is array-like...");
475  // check if std::array is given as value_type,
476  // if not adjust sizes
477  if constexpr (!Utils::is_array_like_v<value_type_1>)
478  {
479  // if not, throw an exception, because the size of a
480  // fixed-size array cannot be adjusted
481 
482  throw std::invalid_argument(
483  "Dataset " + _path +
484  ": Cannot read into container of non arrays "
485  "when data type in file is fixed array type");
486  }
487 
488  return H5Dread(get_C_id(), _type.get_C_id(),
489  _memspace.get_C_id(), _filespace.get_C_id(),
490  H5P_DEFAULT, buffer.data());
491  }
492  else if (_type.type_category() == H5T_STRING)
493  {
494  this->_log->debug("... nested type is string-like...");
495 
496  if constexpr (!Utils::is_string_v<value_type_1>)
497  {
498  throw std::invalid_argument(
499  "Dataset " + _path +
500  ": Can only read stringdata into string elements");
501  }
502  else
503  {
504  /*
505  * we have two possibilities, which have to be treated
506  * separately:
507  * 1): the dataset contains variable length strings
508  * 2): the dataset contains fixed size strings
510  *
511  * logic:
512  * - check if we have a stringtype
513  * - make a variable length stringtype
514  * - check if the type of the dataset is varlen
515  * string
516  * - yes:
517  * - read into char** buffer,
518  * - then put into container<std::string>
519  * - no:
520  * - get size of type
521  * - make string (=> char array) of size
522  * bufferlen*typesize
523  * - read into it
524  * - split the long string each typesize chars
525  * -> get entries
526  * - put them into final buffer
527  * Mind that the buffer is preallocated to the correct size
528  */
529  HDFType vlentype;
530  vlentype.open<std::string>("vlentype_temporary", 0ul);
531 
532  if (H5Tequal(vlentype.get_C_id(), _type.get_C_id()))
533  {
534  this->_log->debug(
535  "... nested type of variable length type ...");
536 
537  std::vector<char *> temp_buffer(buffer.size());
538  herr_t err =
539  H5Dread(get_C_id(), _type.get_C_id(),
540  _memspace.get_C_id(), _filespace.get_C_id(),
541  H5P_DEFAULT, &temp_buffer[0]);
542 
543  /* README:
544  - hdf5 uses `NULL` as fill value for string entries
545  which are not written per default, and setting another
546  fillvalue did not succeed for variable length data.
547  - The NULL produces a segmentation fault when trying to
548  turn it into an std::string.
549  - Hence, as a workaround, the `NULL`s are treated
550  explicitly when postprocessing the data into their final
551  form, which is what the code below does.
552  */
553  for (auto [b, tb] = std::make_tuple(
554  buffer.begin(), temp_buffer.begin());
555  b != buffer.end(); ++b, ++tb)
556  {
557  if (*tb != NULL)
558  {
559  *b = *tb;
560  }
561  else
562  {
563  *b = "\0";
564  }
565  }
566 
567  for (auto &&c : temp_buffer)
568  {
569  free(c);
570  }
571 
572  return err;
573  }
574  else
575  {
576 
577  this->_log->debug(
578  "... nested type of fixed length type ...");
579 
580  // get size of the type, set up intermediate string
581  // buffer, adjust its size
582  auto s = _type.size() / sizeof(char);
583  std::string temp_buffer;
584 
585  temp_buffer.resize(buffer.size() * s);
586 
587  // actual read
588  herr_t err =
589  H5Dread(get_C_id(), _type.get_C_id(),
590  _memspace.get_C_id(), _filespace.get_C_id(),
591  H5P_DEFAULT, &temp_buffer[0]);
592 
593  // content of dataset is now one consecutive line of
594  // stuff in temp_buffer. Use read size s to cut out the
595  // strings we want. Definitely not elegant or fast, but
596  // strings are ugly to work with in general, and this is
597  // the simplest solution I can currently come up with
598 
599  std::size_t i = 0;
600  std::size_t buffidx = 0;
601  while (i < temp_buffer.size())
602  {
603  buffer[buffidx] = temp_buffer.substr(i, s);
604  i += s;
605  buffidx += 1;
606  }
607 
608  // return
609  return err;
610  }
611  }
612  }
613  // variable length arrays
614  else if (_type.type_category() == H5T_VLEN)
615  {
616  this->_log->debug(
617  "... nested type of variable length array type ... ");
618 
619  std::vector<hvl_t> temp_buffer(buffer.size());
620 
621  herr_t err = H5Dread(
622  get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
623  _filespace.get_C_id(), H5P_DEFAULT, temp_buffer.data());
624 
625  // turn the varlen buffer into the desired type
626  // Cumbersome, but necessary...
627 
628  this->_log->debug("... transforming the read data to the "
629  "actually desired type ... ");
630 
631  for (std::size_t i = 0; i < buffer.size(); ++i)
632  {
633  if constexpr (!Utils::is_array_like_v<value_type_1>)
634  {
635  buffer[i].resize(temp_buffer[i].len);
636  }
637 
638  // I consider this more elegant than using std::for_each
639  // and defining the 'j' index outside of the predicate
640  for (auto [it, j] =
641  std::make_tuple(std::begin(buffer[i]), 0ul);
642  it != std::end(buffer[i]); ++it, ++j)
643  {
644  *it = static_cast<value_type_2 *>(temp_buffer[i].p)[j];
645  }
646  }
647 
648  hid_t tempspace = H5Dget_space(get_C_id());
649 
650  // free stuff allocated by hdf5 within the hvl_t objects
651  #if H5_VERSION_GE(1, 12, 0)
652  herr_t status = H5Treclaim(_type.get_C_id(),
653  tempspace,
654  H5P_DEFAULT,
655  temp_buffer.data());
656  #else
657  herr_t status = H5Dvlen_reclaim(_type.get_C_id(),
658  tempspace,
659  H5P_DEFAULT,
660  temp_buffer.data());
661  #endif
662 
663  H5Sclose(tempspace);
664 
665  if (status < 0)
666  {
667  throw std::runtime_error(
668  "Error when reclaiming memory in " + _path +
669  " for variable_length datatype");
670  }
671 
672  return err;
673  }
674  else
675  {
676  throw std::runtime_error(
677  "Dataset " + _path +
678  ": Unknown kind of datatype in dataset when requesting to "
679  "read into container");
680  }
681  }
682 
683  else // no nested container or container of strings, but one containing
684  // simple types
685  {
686  this->_log->debug("... no nested type to read");
687  return H5Dread(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
688  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
689  }
690  }
691 
693 
696  template <typename Type> auto __read_stringtype__(Type &buffer)
697  {
698  this->_log->debug("Reading string data from dataset {}...", _path);
699 
700  buffer.resize(buffer.size() * _type.size());
701  // read data
702  return H5Dread(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
703  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
704  }
705 
707 
710  template <typename Type> auto __read_pointertype__(Type buffer)
711  {
712  this->_log->debug("Reading pointer data from dataset {}...", _path);
713 
714  return H5Dread(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
715  _filespace.get_C_id(), H5P_DEFAULT, buffer);
716  }
717 
719  template <typename Type> auto __read_scalartype__(Type &buffer)
720  {
721  this->_log->debug("Reading scalar data from dataset {}...", _path);
722 
723  return H5Dread(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
724  _filespace.get_C_id(), H5P_DEFAULT, &buffer);
725  }
726 
728  void __write_attribute_buffer__()
729  {
730  auto log = spdlog::get("data_io");
731 
732  log->debug("Writing attribute buffer of dataset {}...", _path);
733 
734  // do nothing if the buffer is empty;
735  if (_attribute_buffer.size() == 0)
736  {
737  return;
738  }
739 
740  // write out the attributes from the attribute buffer.
741  for (auto &[path, variant] : _attribute_buffer)
742  {
743  log->debug("... currently at attribute {}", path);
744 
745  HDFAttribute attr(static_cast<Base &>(*this), path);
746 
747  // Use visiting syntax on the variant to write the attribute value
748  std::visit(
749  // this is a universal reference and hence perfect
750  // forwarding can be employed via std::forward
751  [&attr](auto &&arg) {
752  attr.write(
753  std::forward<std::remove_reference_t<decltype(arg)>>(
754  arg));
755  },
756  variant);
757  }
758 
759  // free up memory.
760  _attribute_buffer.clear();
761  }
762 
766  HDFIdentifier _parent_identifier;
767 
771  hsize_t _rank;
772 
776  std::vector<hsize_t> _current_extent;
777 
781  std::vector<hsize_t> _capacity;
782 
787  std::vector<hsize_t> _chunksizes;
788 
793  std::vector<hsize_t> _offset;
794 
799  std::vector<hsize_t> _new_extent;
800 
804  std::size_t _compress_level;
805 
814  std::vector<std::pair<std::string, typename HDFType::Variant>>
815  _attribute_buffer;
816 
821  HDFType _type;
822 
827  HDFDataspace _filespace;
828 
833  HDFDataspace _memspace;
834 
835  public:
841 
847  auto get_type() { return _type; }
848 
854  HDFDataspace get_memspace() { return _memspace; }
855 
861  HDFDataspace get_filespace() { return _filespace; }
862 
866  auto get_attribute_buffer() { return _attribute_buffer; }
867 
873  HDFIdentifier get_parent_id() { return _parent_identifier; }
874 
880  std::size_t get_rank() { return _rank; }
881 
887  auto get_current_extent() { return _current_extent; }
888 
894  auto get_offset() { return _offset; }
900  auto get_capacity() { return _capacity; }
901 
907  auto get_chunksizes() { return _chunksizes; }
908 
914  auto get_compresslevel() { return _compress_level; }
915 
921  void set_capacity(std::vector<hsize_t> capacity)
922  {
923  if (is_valid())
924  {
925  throw std::runtime_error(
926  "Dataset " + _path +
927  ": Cannot set capacity after dataset has been created");
928  }
929  else
930  {
931  _rank = capacity.size();
932  _capacity = capacity;
933  }
934  }
935 
941  void set_chunksize(std::vector<hsize_t> chunksizes)
942  {
943  if (is_valid())
944  {
945  throw std::runtime_error(
946  "Dataset " + _path +
947  ": Cannot set chunksize after dataset has been created");
948  }
949 
950  // if chunksizes = {} then it will be automatically determined
951  if (chunksizes.size() != _rank and chunksizes.size() != 0)
952  {
953  throw std::runtime_error(
954  "Dataset " + _path +
955  ": Chunksizes size has to be equal to dataset rank");
956  }
957 
958  _chunksizes = chunksizes;
959  }
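 // Usage sketch for the two setters above (assumes a default-constructed,
 // not yet created dataset; `group` is an illustrative HDFGroup):
 //
 //     HDFDataset dset;
 //     dset.open(group, "data_2d");
 //     dset.set_capacity({100, H5S_UNLIMITED}); // also sets the rank to 2
 //     dset.set_chunksize({10, 128});           // must match that rank
 //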
960 
975  template <typename Attrdata>
976  void add_attribute(std::string attribute_path, Attrdata data)
977  {
978  // Can only write directly, if the dataset is valid
979  if (is_valid())
980  {
981  this->_log->debug("Add attribute {} to valid dataset {}",
982  attribute_path, _path);
983  // make attribute and write
984  HDFAttribute attr(*this, attribute_path);
985  attr.write(data);
986  }
987  else
988  {
989 
990  this->_log->debug("Add attribute {} to attribute buffer of {} "
991  "because it has not yet been created on disk",
992  attribute_path, _path);
993  // The dataset was not opened yet. Need to write to buffer
994 
995  // For non-vector container data, need to convert to vector
996  if constexpr (Utils::is_container_v<Attrdata>)
997  {
998  if constexpr (not std::is_same_v<
999  std::vector<typename Attrdata::value_type>,
1000  Attrdata>)
1001  {
1002  // Make it a vector and write to buffer
1003  _attribute_buffer.push_back(std::make_pair(
1004  attribute_path,
1005  std::vector<typename Attrdata::value_type>(
1006  std::begin(data), std::end(data))));
1007  }
1008  else
1009  {
1010  // Can write directly
1011  _attribute_buffer.push_back(
1012  std::make_pair(attribute_path, data));
1013  }
1014  }
1015  else
1016  {
1017  // Can write directly
1018  _attribute_buffer.push_back(
1019  std::make_pair(attribute_path, data));
1020  }
1021  }
1022  }
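 // Usage sketch (names illustrative): attributes may be added before any
 // data exists; they land in _attribute_buffer and are flushed once the
 // dataset has been created, at the latest when close() is called:
 //
 //     dset.add_attribute("dim_names", std::vector<std::string>{"x", "y"});
 //     dset.write(std::vector<double>{1., 2.}); // creates the dataset
 //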
1023 
1031  void close()
1032  {
1033  auto log = spdlog::get("data_io");
1034 
1035  // write the attribute buffer out
1036  if (is_valid())
1037  {
1038  __write_attribute_buffer__();
1039  }
1040 
1041  // employ the object base class' close function to close the dataset,
1042  // then close the dataspaces and the datatype
1043  Base::close();
1044 
1045  // close dataspaces
1046  _filespace.close();
1047  _memspace.close();
1048  _type.close();
1049  }
1050 
1064  template <HDFCategory cat>
1065  void open(const HDFObject<cat> &parent_object, std::string path,
1066  std::vector<hsize_t> capacity = {},
1067  std::vector<hsize_t> chunksizes = {}, hsize_t compress_level = 0)
1068  {
1069 
1070  this->_log->debug("Opening dataset {} within {}", path,
1071  parent_object.get_path());
1072 
1073  open(parent_object.get_id_object(), path, capacity, chunksizes,
1074  compress_level);
1075  }
1076 
1090  void open(const HDFIdentifier &parent_identifier, std::string path,
1091  std::vector<hsize_t> capacity = {},
1092  std::vector<hsize_t> chunksizes = {}, hsize_t compress_level = 0)
1093  {
1094 
1095  if (not parent_identifier.is_valid())
1096  {
1097  throw std::runtime_error("parent id not valid for dataset " + path);
1098  }
1099 
1100  _parent_identifier = parent_identifier;
1101  _path = path;
1102 
1103  _filespace.close();
1104  _memspace.close();
1105  // open with H5S_ALL
1106  _filespace.open();
1107  _memspace.open();
1108  // Try to find the dataset in the parent_object
1109  // If it is there, open it.
1110  // Else: postpone the dataset creation to the first write
1111  // the attribute buffer has to be written in both cases,
1112  // as its existence is independent from the existence of the
1113  // dataset in the file. We could use a dataset object repeatedly
1114  // to represent different datasets in the file via calling close
1115  // and open over and over, writing attributes to it while
1116  // it is closed. Therefore, the attribute buffer is written
1117  // out at the end of this function
1118  if (path_is_valid(_parent_identifier.get_id(), _path.c_str()))
1119  { // dataset exists
1120  // open it
1121 
1122  this->_log->debug("... binding existing dataset to object");
1123 
1124  bind_to(H5Dopen(_parent_identifier.get_id(), _path.c_str(),
1125  H5P_DEFAULT),
1126  &H5Dclose);
1127 
1128  _type.close();
1129  _type.open(*this);
1130 
1131  // get dataspace and read out rank, extend, capacity
1132  _filespace.open(*this);
1133 
1134  _rank = _filespace.rank();
1135 
1136  _chunksizes.resize(_rank, 0);
1137  // get chunksizes
1138  hid_t creation_plist = H5Dget_create_plist(get_C_id());
1139  hid_t layout = H5Pget_layout(creation_plist);
1140  if (layout == H5D_CHUNKED)
1141  {
1142  herr_t err =
1143  H5Pget_chunk(creation_plist, _rank, _chunksizes.data());
1144  if (err < 0)
1145  {
1146  throw std::runtime_error(
1147  "Dataset " + _path +
1148  ": Error in reading out chunksizes while opening.");
1149  }
1150  }
1151  H5Pclose(creation_plist);
1152 
1153  // temporary workaround for type inconsistency:
1154  // arma::Row is used by the dataspace, std::vector by the dataset
1155  // and the chunksize algorithm
1156  auto [size, capacity] = _filespace.get_properties();
1157 
1158  _current_extent.assign(size.begin(), size.end());
1159  _capacity.assign(capacity.begin(), capacity.end());
1160  _offset = _current_extent;
1161  }
1162  else
1163  {
1164  this->_log->debug("... dataset not yet existing, have to wait 'til "
1165  "data becomes available");
1166 
1167  // it is not expected that the _attribute_buffer will become big
1168  // and reallocate often, hence a reserve is foregone here,
1169  // which one might otherwise consider.
1170  // The size to reserve would be a rather wild guess however.
1171  if (capacity.size() == 0)
1172  {
1173  _rank = 1;
1174  _capacity = std::vector<hsize_t>(_rank, H5S_UNLIMITED);
1175  _offset = std::vector<hsize_t>(_rank, 0);
1176  }
1177  else
1178  {
1179  _capacity = capacity;
1180  _rank = _capacity.size();
1181  _offset = std::vector<hsize_t>(_rank, 0);
1182  }
1183 
1184  // if chunksizes is given, everything is fine, if not, it is empty
1185  // here and we will check in write method if calculation of
1186  // chunksize is needed
1187  _chunksizes = chunksizes;
1188 
1189  _compress_level = compress_level;
1190 
1191  _id.set_id(-1);
1192  }
1193  }
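 // Usage sketch (names illustrative): opening is lazy. If nothing exists
 // at `path` yet, creation is deferred to the first write:
 //
 //     HDFDataset dset;
 //     dset.open(group, "measurements", {H5S_UNLIMITED}, {}, 5);
 //     dset.write(std::vector<int>{1, 2, 3}); // dataset created here
 //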
1194 
1200  void swap(HDFDataset &other)
1201  {
1202  using std::swap;
1203  using Utopia::DataIO::swap;
1204  swap(static_cast<Base &>(*this), static_cast<Base &>(other));
1205  swap(_parent_identifier, other._parent_identifier);
1206  swap(_rank, other._rank);
1207  swap(_current_extent, other._current_extent);
1208  swap(_capacity, other._capacity);
1209  swap(_chunksizes, other._chunksizes);
1210  swap(_offset, other._offset);
1211  swap(_new_extent, other._new_extent);
1212  swap(_compress_level, other._compress_level);
1213  swap(_attribute_buffer, other._attribute_buffer);
1214  swap(_filespace, other._filespace);
1215  swap(_memspace, other._memspace);
1216  swap(_type, other._type);
1217  }
1218 
1226  template <typename T>
1227  void write(T &&data, [[maybe_unused]] std::vector<hsize_t> shape = {})
1228  {
1229  this->_log->debug("Writing data to dataset {}", _path);
1230  this->_log->debug("... current extent {}", Utils::str(_current_extent));
1231  this->_log->debug("... current offset {}", Utils::str(_offset));
1232  this->_log->debug("... capacity {}", Utils::str(_capacity));
1233  this->_log->debug("... refcount {}", get_refcount());
1234 
1235  // dataset does not yet exist
1236  _memspace.close();
1237  _filespace.close();
1238 
1239  _memspace.open();
1240  _filespace.open();
1241 
1242  if (not is_valid())
1243  {
1244  // current limitation, to be removed in the future
1245  if (_rank > 2)
1246  {
1247  throw std::runtime_error("Rank > 2 not supported");
1248  }
1249 
1250  /*
1251  if dataset does not yet exist
1252  Get current extent.
1253  If is container:
1254  if 1d:
1255  current_extent = data.size()
1256  else:
1257  current_extent = {1, data.size()}, i.e one line in
1258  matrix
1259 
1260  if pointer:
1261  current_extent is shape
1262  if string or scalar:
1263  current_extent is 1
1264 
1265  then check if chunking is needed but not known and calculate it
1266  or throw error. this is done within the individual __write_X__
1267  methods because detailed type info is needed.
1268  */
1269  _current_extent.resize(_rank);
1270 
1271  if constexpr (Utils::is_container_v<std::decay_t<T>>)
1272  {
1273  if (_rank == 1)
1274  {
1275  _current_extent[_rank - 1] = data.size();
1276  }
1277  else
1278  {
1279  _current_extent[0] = 1;
1280  _current_extent[1] = data.size();
1281  }
1282  }
1283 
1284  else if constexpr (std::is_pointer_v<std::decay_t<T>> and
1285  !Utils::is_string_v<std::decay_t<T>>)
1286  {
1287  if (shape.size() == 0)
1288  {
1289  throw std::runtime_error(
1290  "Dataset " + _path +
1291  ": shape has to be given explicitly when writing "
1292  "pointer types");
1293  }
1294  _current_extent = shape;
1295  }
1296  else
1297  {
1298  _current_extent[_rank - 1] = 1;
1299  }
1300  }
1301  else
1302  {
1303 
1304  /*
1305  if dataset does exist:
1306  - check if the type of the data given to write is compatible
1307  with the one of the dataset
1308 
1309  - make a _new_extent array equalling current_extent, leaving
1310  current_extent unchanged. If is container: if 1d: _new_extent =
1311  current_extent + data.size(), else: _new_extent = {current_extent[0]+1,
1312  current_extent[1]}, i.e. one new line in matrix
1313 
1314  if pointer:
1315  current_extent += shape
1316  if string or scalar:
1317  current_extent += 1
1318 
1319 
1320  offset = current_extent
1321  but if 2d and current_extent[1]==capacity[1] (end of line):
1322  offset = {current_extent[0]+1, 0};
1323 
1324  count = {1, data.size} if 2d, {data.size()} if 1d
1325 
1326  then extend the dataset,
1327  select the newly added line,
1328  update current_extent,
1329  write
1330  */
1331 
1332  // make a temporary for new extent
1333  std::vector<hsize_t> _new_extent = _current_extent;
1334 
1335  if (_capacity == _current_extent)
1336  {
1337  throw std::runtime_error("Dataset " + _path +
1338  ": Error, dataset cannot be extended "
1339  "because it reached its capacity");
1340  }
1341  else
1342  {
1343  // set offset array
1344  // this is needed because multiple writes one after the other
1345  // could occur without intermediate close and reopen (which
1346  // would set _offset correctly)
1347  _offset = _current_extent;
1348 
1349  if (_rank > 1)
1350  {
1351  if (_current_extent[1] == _capacity[1])
1352  {
1353  _offset[1] = 0;
1354  }
1355  }
1356 
1357  // if data is a container, then we have to add its size to
1358  // extend, if it is a pointer, we have to add the pointers
1359  // shape, else we have to add 1 because we either write
1360  // a single scalar or string
1361  if constexpr (Utils::is_container_v<std::decay_t<T>>)
1362  {
1363  if (_rank == 1)
1364  {
1365  _new_extent[0] += data.size();
1366  }
1367  else
1368  {
1369  _new_extent[0] += 1;
1370  }
1371  }
1372  else if constexpr (std::is_pointer_v<std::decay_t<T>> and
1373  !Utils::is_string_v<std::decay_t<T>>)
1374  {
1375  if (shape.size() == 0)
1376  {
1377  throw std::runtime_error(
1378  "Dataset " + _path +
1379  ": shape has to be given explicitly when writing "
1380  "pointer types");
1381  }
1382 
1383  for (std::size_t i = 0; i < _rank; ++i)
1384  {
1385  _new_extent[i] += shape[i];
1386  }
1387  }
1388  else
1389  {
1390  if (_rank == 1)
1391  {
1392  // if rank is one we can only extend into one direction
1393  _new_extent[0] += 1;
1394  }
1395  else
1396  {
1397  // first fill row, then column wise increase
1398  if (_current_extent[0] < _capacity[0])
1399  {
1400  _new_extent[0] += 1;
1401  }
1402  // if row is full, start a new one
1403  else
1404  {
1405  _new_extent[1] += 1;
1406  }
1407  }
1408  }
1409  }
1410  // select counts for dataset
1411  // this has to be generalized and refactored
1412  std::vector<hsize_t> counts(_rank, 0);
1413  if constexpr (Utils::is_container_v<std::decay_t<T>>)
1414  {
1415  if (_rank == 1)
1416  {
1417  counts = {data.size()};
1418  }
1419  else
1420  {
1421  counts = {1, data.size()};
1422  }
1423  }
1424  // when is pointer, the counts are given by shape
1425  else if constexpr (std::is_pointer_v<std::decay_t<T>> and
1426  !Utils::is_string_v<std::decay_t<T>>)
1427  {
1428  counts = shape;
1429  }
1430  else
1431  {
1432  counts = {1};
1433  }
1434 
1435  // extent the dataset
1436  for (std::size_t i = 0; i < _rank; ++i)
1437  {
1438  if (_new_extent[i] > _capacity[i])
1439  {
1440  throw std::runtime_error("Dataset " + _path +
1441  ": Cannot append data, "
1442  "_new_extent larger than capacity "
1443  "in dimension " +
1444  std::to_string(i));
1445  }
1446  }
1447 
1448  // extend the dataset to the new size
1449  herr_t err = H5Dset_extent(get_C_id(), _new_extent.data());
1450 
1451  if (err < 0)
1452  {
1453  throw std::runtime_error(
1454  "Dataset " + _path +
1455  ": Error when trying to increase extent");
1456  }
1457 
1458  // get file and memory spaces which represent the selection to write
1459  // at
1460  _filespace.open(*this);
1461 
1462  _memspace.open(_path + " memory dataspace", _rank, counts, {});
1463 
1464  _filespace.select_slice(
1465  _offset, // start
1466  arma::Row<hsize_t>(_offset) + arma::Row<hsize_t>(counts), // end
1467  {} // stride
1468  );
1469 
1470  _current_extent = _new_extent;
1471  }
1472 
1473  this->_log->debug("New extent {}", Utils::str(_new_extent));
1474  this->_log->debug("New offset {}", Utils::str(_offset));
1475  this->_log->debug(" Refcount before write {}", get_refcount());
1476 
1477  // everything is prepared, we can write the data
1478  if constexpr (Utils::is_container_v<std::decay_t<T>>)
1479  {
1480 
1481  herr_t err = __write_container__(std::forward<T>(data));
1482 
1483  if (err < 0)
1484  {
1485  throw std::runtime_error("Dataset " + _path +
1486  ": Error in appending container");
1487  }
1488  }
1489  else if constexpr (Utils::is_string_v<std::decay_t<T>>)
1490  {
1491  herr_t err = __write_stringtype__(std::forward<T>(data));
1492  if (err < 0)
1493  {
1494  throw std::runtime_error("Dataset " + _path +
1495  ": Error in appending string");
1496  }
1497  }
1498  else if constexpr (std::is_pointer_v<std::decay_t<T>> and
1499  !Utils::is_string_v<std::decay_t<T>>)
1500  {
1501  herr_t err = __write_pointertype__(std::forward<T>(data));
1502  if (err < 0)
1503  {
1504  throw std::runtime_error("Dataset " + _path +
1505  ": Error in appending pointer");
1506  }
1507  }
1508  else
1509  {
1510  herr_t err = __write_scalartype__(std::forward<T>(data));
1511  if (err < 0)
1512  {
1513  throw std::runtime_error("Dataset " + _path +
1514  ": Error in appending scalar");
1515  }
1516  }
1517  }
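 // Usage sketch (values illustrative): consecutive writes append to the
 // dataset, here along the single dimension of a rank-1 dataset of doubles:
 //
 //     dset.write(std::vector<double>{0., 1., 2.}); // extent: 3
 //     dset.write(3.0);                             // extent: 4 (scalar)
 //     double raw[2] = {4., 5.};
 //     dset.write(&raw[0], {2});                    // extent: 6 (pointer)
 //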
1518 
1532  template <typename Iter, typename Adaptor>
1533  void write(Iter begin, Iter end, Adaptor &&adaptor)
1534  {
1535  this->_log->debug("Writing iterator range to dataset {}", _path);
1536  // this->_log->debug("... current offset {}", Utils::str(_offset));
1537  // this->_log->debug("... capacity {}", Utils::str(_capacity));
1538 
1539  using Type = Utils::remove_qualifier_t<decltype(adaptor(*begin))>;
1540 
1541  write([&]() {
1542  std::vector<Type> buff(std::distance(begin, end));
1543 
1544  std::generate(buff.begin(), buff.end(),
1545  [&begin, &adaptor]() { return adaptor(*(begin++)); });
1546  return buff;
1547  }());
1548  }
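 // Usage sketch (the struct is illustrative): the adaptor extracts the
 // quantity to be written from each element of the iterator range:
 //
 //     struct Agent { double energy; };
 //     std::vector<Agent> agents(100, Agent{1.});
 //     dset.write(agents.begin(), agents.end(),
 //                [](const Agent& a) { return a.energy; });
 //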
1549 
1573  template <typename T, std::size_t d>
1574  void write_nd(const boost::multi_array<T, d> &data,
1575  std::vector<hsize_t> offset = {})
1576  {
1577  this->_log->debug("Writing N-dimensional dataset to dataset {}", _path);
1578  this->_log->debug("... current extent {}", Utils::str(_current_extent));
1579  this->_log->debug("... current offset {}", Utils::str(_offset));
1580  this->_log->debug("... capacity {}", Utils::str(_capacity));
1581 
1582  _filespace.close();
1583  _memspace.close();
1584 
1585  // create dataspaces
1586  _memspace.open();
1587  _filespace.open();
1588 
1589  // dataset does not yet exist
1590  if (not is_valid())
1591  {
1592 
1593  // two possibilities: capacity given or not:
1594  // if not given:
1595  // use data to determine extent and capacity, correcting the
1596  // assumed ones from 'open'
1597  // else use given values
1598 
1599  if (_capacity == std::vector<hsize_t>{H5S_UNLIMITED} and _rank == 1)
1600  {
1601  _rank = d;
1602  _current_extent.resize(_rank, 0);
1603  _offset.resize(_rank, 0);
1604  for (std::size_t i = 0; i < _rank; ++i)
1605  {
1606  _current_extent[i] = data.shape()[i];
1607  }
1608  _capacity.resize(d, H5S_UNLIMITED);
1609  }
1610  else
1611  {
1612  _current_extent.resize(_rank, 1);
1613  _offset.resize(_rank, 0);
1614 
1615  for (auto [i, j] = std::make_tuple(_rank - d, 0); i < _rank;
1616  ++i, ++j)
1617  {
1618  _current_extent[i] = data.shape()[j];
1619  }
1620  }
1621 
1622  _log->debug("Dataset {} does not exist yet, properties were "
1623  "determined to be",
1624  _path);
1625  _log->debug(" rank: {}", _rank);
1626  _log->debug(" datarank: {}", Utils::str(d));
1627  _log->debug(" datashape: {}", Utils::str(std::vector<std::size_t>(
1628  data.shape(), data.shape() + d)));
1629  _log->debug(" capacity: {}", Utils::str(_capacity));
1630  _log->debug(" offset: {}", Utils::str(_offset));
1631  _log->debug(" current_extent: {}", Utils::str(_current_extent));
1632  }
1633  else
1634  {
1635 
1636  if (_rank < d)
1637  {
1638  throw std::invalid_argument(
1639  "Error, the dimensionality of the dataset, which is " +
1641  ", must be >= the dimensionality of the data to be "
1642  "written, which is " +
1643  std::to_string(d));
1644  }
1645  else
1646  {
1647 
1648  _log->debug("Dataset {} does exist", _path);
1649  _log->debug("Properties of data to be written");
1650  _log->debug(" datarank: {}", Utils::str(d));
1651  _log->debug(" datashape: {}",
1652  Utils::str(std::vector<std::size_t>(
1653  data.shape(), data.shape() + d)));
1654 
1655  _log->debug(
1656  "Properties before change for accomodating new data");
1657  _log->debug(" rank: {}", Utils::str(_capacity));
1658  _log->debug(" capacity: {}", Utils::str(_capacity));
1659  _log->debug(" offset: {}", Utils::str(_offset));
1660  _log->debug(" current_extent: {}", Utils::str(_current_extent));
1661 
1662  std::vector<hsize_t> _new_extent = _current_extent;
1663 
1664  // two cases: When offset is given, and when not. When it is
1665  // given, then it is assumed that the data has always the same
1666  // shape except in the first dimension
1667 
1668  if (offset.size() != 0)
1669  {
1670 
1671  // when offset is given we use it to
1672  // determine how to extent the dataset. Note that the
1673  // requirement that all data written have the same shape in
1674  // all dimensions but the first is not enforced here, hence
1675  // the algorithm works a little differently
1676  _offset = offset;
1677  for (std::size_t i = 0; i < _rank - d; ++i)
1678  {
1679  if (_offset[i] == _current_extent[i])
1680  {
1681  _new_extent[i] += 1;
1682  }
1683  }
1684 
1685  for (auto [i, j] = std::make_tuple(_rank - d, 0ul); i < _rank;
1686  ++i, ++j)
1687  {
1688  if (_current_extent[i] < (_offset[i] + data.shape()[j]))
1689  {
1690  _new_extent[i] = _offset[i] + data.shape()[j];
1691  }
1692  if (_new_extent[i] > _capacity[i])
1693  {
1694  throw std::runtime_error(
1695  "Dataset " + _path + ": Capacity[" +
1696  std::to_string(i) +
1697  "] = " + std::to_string(_capacity[i]) +
1698  ", which is too small for a desired new "
1699  "extent[" +
1700  std::to_string(i) +
1701  "] = " + std::to_string(_new_extent[i]));
1702  }
1703  }
1704 
1705  // extend the dataset to the new size
1706  herr_t err = H5Dset_extent(get_C_id(), _new_extent.data());
1707 
1708  if (err < 0)
1709  {
1710  throw std::runtime_error(
1711  "Dataset " + _path +
1712  ": Error when trying to increase extent");
1713  }
1714  }
1715 
1716  else
1717  {
1718  // zeroth index is treated separately because it is used to
1719  // increase the total available space in the dataset
1720 
1721  _new_extent[0] += (d == _rank) ? data.shape()[0]
1722  : 1; // add all needed slices
1723 
1724  if (_new_extent[0] > _capacity[0])
1725  {
1726  throw std::runtime_error(
1727  "Error in " + _path + ", capacity " +
1728  std::to_string(_capacity[0]) + " at index " +
1729  std::to_string(0) + " of " + std::to_string(d) +
1730  " is too small for new extent " +
1732  }
1733 
1734  for (auto [i, j] =
1735  std::make_tuple(1ul, (d == _rank) ? 1ul : 0ul);
1736  i < _rank && j < d; ++i, ++j)
1737  {
1738  if (data.shape()[j] > _current_extent[i])
1739  {
1740  _new_extent[i] +=
1741  data.shape()[j] - _current_extent[i];
1742  if (_new_extent[i] > _capacity[i])
1743  {
1744  throw std::runtime_error(
1745  "Error in " + _path +
1746  ", capacity at index " + std::to_string(i) +
1747  " of " + std::to_string(d) +
1748  " is too small");
1749  }
1750  }
1751  }
1752 
1753  // extend the dataset to the new size
1754  herr_t err = H5Dset_extent(get_C_id(), _new_extent.data());
1755 
1756  if (err < 0)
1757  {
1758  throw std::runtime_error(
1759  "Dataset " + _path +
1760  ": Error when trying to increase extent");
1761  }
1762 
1763  // if the algo progresses until here, it is safe to do this;
1764 
1765  _offset.resize(_rank);
1766  std::fill(_offset.begin(), _offset.end(), 0);
1767  _offset[0] = _current_extent[0];
1768  }
1769  /*
1770  * README: The count vector is needed for determining the slice
1771  * to write to in the datafile. hdf5 determines slices in the
1772  * dataset via [start, step, count], pattern, where the 'count'
1773  * gives the number fo steps in each dimension. Hence, the
1774  * counts have to be computed/assigned from the data to be
1775  * written
1776  */
1777  std::vector<hsize_t> counts(_rank, 1);
1778 
1779  for (auto [i, j] = std::make_tuple(_rank - d, 0); i < _rank;
1780  ++i, ++j)
1781  {
1782  counts[i] = data.shape()[j];
1783  }
1784 
1785  // get file and memory spaces which represent the selection to
1786  // write at
1787  _filespace.close();
1788  _memspace.close();
1789  _filespace.open(*this);
1790 
1791  _memspace.open(_path + " memory dataspace", _rank, counts, {});
1792 
1793  _filespace.select_slice(_offset,
1794  arma::Row<hsize_t>(_offset) +
1795  arma::Row<hsize_t>(counts),
1796  {});
1797 
1798  // update the current extent
1799  _current_extent = _new_extent;
1800 
1801  _log->debug(
1802  "Properties after change for accomodating new data");
1803  _log->debug(" rank: {}", Utils::str(_capacity));
1804  _log->debug(" capacity: {}", Utils::str(_capacity));
1805  _log->debug(" offset: {}", Utils::str(_offset));
1806  _log->debug(" current_extent: {}", Utils::str(_current_extent));
1807  _log->debug("new extent {}", Utils::str(_new_extent));
1808  }
1809  }
1810 
1811  // dataset extension is done, now we can check if we have to buffer data
1812  // FIXME: this has to be put into the bufferfactory class later, ideally
1813  // using a plain char buffer for it to avoid templating and enabling
1814  // memory reuse by making the bufferfactory a member.
1815 
1816  if constexpr (std::is_scalar_v<std::decay_t<T>>)
1817  {
1818  if (not is_valid())
1819  {
1820  __create_dataset__<std::decay_t<T>>(0);
1821  }
1822  else
1823  {
1824  HDFType temp_type;
1825  temp_type.open<std::decay_t<T>>("testtype", 0);
1826  if (_type != temp_type)
1827  {
1828  throw std::runtime_error("Error, cannot write data of a "
1829  "different type into dataset " +
1830  _path);
1831  }
1832  }
1833 
1834  herr_t err =
1835  H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
1836  _filespace.get_C_id(), H5P_DEFAULT, data.data());
1837 
1838  if (err < 0)
1839  {
1840  throw std::runtime_error(
1841  "Dataset " + _path +
1842  ": Error in writing nd-array holding scalar values");
1843  }
1844  }
1845  else if constexpr (Utils::is_string_v<std::decay_t<T>>)
1846  {
1847  if (not is_valid())
1848  {
1849  __create_dataset__<std::decay_t<T>>(0);
1850  }
1851  else
1852  {
1853  HDFType temp_type;
1854  temp_type.open<std::decay_t<T>>("testtype", 0);
1855 
1856  if (_type != temp_type)
1857  {
1858  throw std::runtime_error("Error, cannot write data of a "
1859  "different type into dataset " +
1860  _path);
1861  }
1862  }
1863  // make a buffer that mirrors the shape of the data
1864  boost::multi_array<const char *, d> buffer(
1865  reinterpret_cast<boost::array<size_t, d> const &>(
1866  *data.shape()));
1867 
1868  // fill the buffer
1869  std::transform(data.data(), data.data() + data.num_elements(),
1870  buffer.data(),
1871  [](auto &&str) { return str.c_str(); });
1872 
1873  // write the buffer
1874  herr_t err =
1875  H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
1876  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
1877 
1878  if (err < 0)
1879  {
1880  throw std::runtime_error(
1881  "Dataset " + _path +
1882  ": Error in writing nd-array holding string values");
1883  }
1884  }
1885  else if constexpr (Utils::is_container_v<std::decay_t<T>>)
1886  {
1887  if constexpr (Utils::is_array_like_v<std::decay_t<T>>)
1888  {
1889  hsize_t typesize = Utils::get_size<std::decay_t<T>>::value;
1890 
1891  // create dataset with given typesize
1892  if (not is_valid())
1893  {
1894  __create_dataset__<std::decay_t<T>>(typesize);
1895  }
1896  else
1897  {
1898  HDFType temp_type;
1899  temp_type.open<std::decay_t<T>>("testtype", typesize);
1900  if (_type != temp_type)
1901  {
1902  throw std::runtime_error(
1903  "Error, cannot write data of a "
1904  "different type into dataset " +
1905  _path);
1906  }
1907  }
1908 
1909  // write the buffer not needed here
1910  herr_t err =
1911  H5Dwrite(get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
1912  _filespace.get_C_id(), H5P_DEFAULT, data.data());
1913  if (err < 0)
1914  {
1915  throw std::runtime_error(
1916  "Dataset " + _path +
1917  ": Error in writing nd-array holding array values");
1918  }
1919  }
1920  else
1921  {
1922  // create dataset with given typesize
1923  if (not is_valid())
1924  {
1925  __create_dataset__<std::decay_t<T>>(0);
1926  }
1927  else
1928  {
1929  HDFType temp_type;
1930  temp_type.open<std::decay_t<T>>("temp_type", 0);
1931  if (_type != temp_type)
1932  {
1933  throw std::runtime_error(
1934  "Error, cannot write data of a "
1935  "different type into dataset " +
1936  _path);
1937  }
1938  }
1939  // vector is stored
1940  if constexpr (std::is_same_v<
1941  std::vector<typename T::value_type>,
1942  std::decay_t<T>>)
1943  {
1944  // make buffer
1945  boost::multi_array<hvl_t, d> buffer(
1946  reinterpret_cast<boost::array<size_t, d> const &>(
1947  *data.shape()));
1948 
1949  std::transform(
1950  data.data(), data.data() + data.num_elements(),
1951  buffer.data(), [](auto &&v) {
1952  return hvl_t{
1953  v.size(),
1954  // cumbersome const cast needed because I want
1955  // to keep const Reference argument 'cause it
1956  // can bind to lvalues and rvalues alike, i.e.
1957  // you can construct a multi_array in the arg
1958  // list, but also pass an existing one as
1959  // reference.
1960  const_cast<Utils::remove_qualifier_t<decltype(
1961  v.data())> *>(v.data())};
1962  });
1963 
1964  // write the buffer
1965  herr_t err = H5Dwrite(
1966  get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
1967  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
1968 
1969  if (err < 0)
1970  {
1971  throw std::runtime_error("Dataset " + _path +
1972  ": Error in writing nd-array "
1973  "holding vector values");
1974  }
1975  }
1976  // no vector is stored
1977  else
1978  {
1979  // make buffers, when no vector we need two of them,
1980  // one to transform to vector which has contiguous storage
1981  // which in turn is needed by hdf5, the other
1982  // for turning the new vectors into hvl_t
1983  boost::multi_array<std::vector<typename T::value_type>, d>
1984  vector_buffer(
1985  reinterpret_cast<boost::array<size_t, d> const &>(
1986  *data.shape()));
1987 
1988  boost::multi_array<hvl_t, d> buffer(
1989  reinterpret_cast<boost::array<size_t, d> const &>(
1990  *data.shape()));
1991 
1992  std::transform(
1993  data.data(), data.data() + data.num_elements(),
1994  vector_buffer.data(), [](auto &&v) {
1995  return std::vector<typename T::value_type>(
1996  v.begin(), v.end());
1997  });
1998 
1999  std::transform(vector_buffer.data(),
2000  vector_buffer.data() +
2001  vector_buffer.num_elements(),
2002  buffer.data(), [](auto &&v) {
2003  return hvl_t{v.size(), v.data()};
2004  });
2005 
2006  // write the buffer
2007  herr_t err = H5Dwrite(
2008  get_C_id(), _type.get_C_id(), _memspace.get_C_id(),
2009  _filespace.get_C_id(), H5P_DEFAULT, buffer.data());
2010 
2011  if (err < 0)
2012  {
2013  throw std::runtime_error(
2014  "Dataset " + _path +
2015  ": Error in writing nd-array holding non-vector "
2016  "container values");
2017  }
2018  }
2019  }
2020  }
2021  }
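 // Usage sketch (names illustrative): append 2x3 slabs to a rank-2 dataset
 // that can grow along its first dimension:
 //
 //     boost::multi_array<double, 2> arr(boost::extents[2][3]);
 //     std::fill(arr.data(), arr.data() + arr.num_elements(), 3.14);
 //     HDFDataset dset(group, "nd_data", {H5S_UNLIMITED, 3});
 //     dset.write_nd(arr);
 //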
2022 
2036  template <typename Type>
2037  auto read([[maybe_unused]] std::vector<hsize_t> start = {},
2038  [[maybe_unused]] std::vector<hsize_t> end = {},
2039  [[maybe_unused]] std::vector<hsize_t> stride = {})
2040  {
2041  this->_log->debug(
2042  "Reading dataset {}, starting at {}, ending at {}, using stride {}",
2043  _path, Utils::str(start), Utils::str(end), Utils::str(stride));
2044 
2045  if (not is_valid())
2046  {
2047  throw std::runtime_error("Dataset " + _path +
2048  ": Dataset id is invalid");
2049  }
2050 
2051  _filespace.close();
2052  _memspace.close();
2053 
2054  // variables needed for reading
2055  std::vector<hsize_t> readshape; // shape vector for read, either
2056  // _current_extent or another shape
2057 
2058  std::size_t size = 1;
2059 
2060  // read entire dataset
2061  if (start.size() == 0)
2062  {
2063  readshape = _current_extent;
2064  _filespace.open();
2065  _memspace.open();
2066 
2067  // make flattened size of data to read
2068  for (auto &s : readshape)
2069  {
2070  size *= s;
2071  }
2072  }
2073  // read [start, end) with steps given by stride in each dimension
2074  else
2075  {
2076  // throw error if ranks and shape sizes do not match
2077  if (start.size() != _rank or end.size() != _rank or
2078  stride.size() != _rank)
2079  {
2080  throw std::invalid_argument(
2081  "Dataset " + _path +
2082  ": start, end, stride have to be "
2083  "same size as dataset rank, which is " +
2084  std::to_string(_rank));
2085  }
2086 
2087  // set offset of current array to start
2088  _offset = start;
2089 
2090  // make count vector
2091  // exploit that hsize_t((end-start)/stride) cuts off decimal
2092  // places and thus results in floor((end-start)/stride) always.
2093  std::vector<hsize_t> count(start.size());
2094 
2095  // build the count array -> how many elements to read in each
2096  // dimension
2097  for (std::size_t i = 0; i < _rank; ++i)
2098  {
2099  count[i] = (end[i] - start[i]) / stride[i];
2100  }
2101 
2102  for (auto &s : count)
2103  {
2104  size *= s;
2105  }
2106 
2107  readshape = count;
2108 
2109  _filespace.close();
2110  _memspace.close();
2111 
2112  _filespace.open(*this);
2113  _memspace.open(_path + " memory dataspace", _rank, count, {});
2114 
2115  this->_log->debug("... selecting slice in filespace for dataset {}",
2116  _path);
2117  _filespace.select_slice(start, end, stride);
2118  }
2119 
2120  // Below the actual reading happens
2121 
2122  // type to read in is a container type, which can hold containers
2123  // themselves or just plain types.
2124  if constexpr (Utils::is_container_v<Type>)
2125  {
2126  Type buffer(size);
2127  herr_t err = __read_container__(buffer);
2128  if (err < 0)
2129  {
2130  throw std::runtime_error("Dataset " + _path +
2131  ": Error reading container type ");
2132  }
2133  return std::make_tuple(readshape, buffer);
2134  }
2135  else if constexpr (Utils::is_string_v<Type>) // we can have string
2136  // types too, i.e. char*,
2137  // const char*,
2138  // std::string
2139  {
2140  std::string buffer; // resized in __read_stringtype__ because this
2141  // is treated as a scalar
2142  buffer.resize(size);
2143  herr_t err = __read_stringtype__(buffer);
2144  if (err < 0)
2145  {
2146  throw std::runtime_error("Dataset " + _path +
2147  ": Error reading string type ");
2148  }
2149 
2150  return std::make_tuple(readshape, buffer);
2151  }
2152  else if constexpr (std::is_pointer_v<Type> && !Utils::is_string_v<Type>)
2153  {
2154  std::shared_ptr<Utils::remove_qualifier_t<Type>> buffer(
2155  new Utils::remove_qualifier_t<Type>[size],
2156  std::default_delete<Utils::remove_qualifier_t<Type>[]>());
2157 
2158  herr_t err = __read_pointertype__(buffer.get());
2159 
2160  if (err < 0)
2161  {
2162  throw std::runtime_error("Dataset " + _path +
2163  ": Error reading pointer type ");
2164  }
2165  return std::make_tuple(readshape, buffer);
2166  }
2167  else // reading scalar types is simple enough
2168  {
2169  Type buffer(0);
2170  herr_t err = __read_scalartype__(buffer);
2171  if (err < 0)
2172  {
2173  throw std::runtime_error("Dataset " + _path +
2174  ": Error reading scalar type ");
2175  }
2176  return std::make_tuple(readshape, buffer);
2177  }
2178  }
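 // Usage sketch (values illustrative): read everything, or a strided
 // subset; the returned tuple holds the read shape and the flat buffer:
 //
 //     auto [shape, data] = dset.read<std::vector<double>>();
 //     auto [sshape, subset] =
 //         dset.read<std::vector<double>>({0}, {10}, {2}); // every 2nd value
 //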
2179 
2183  HDFDataset() = default;
2184 
2190  HDFDataset(const HDFDataset &other) = default;
2191 
2197  HDFDataset(HDFDataset &&other) = default;
2205  HDFDataset &operator=(const HDFDataset &other) = default;
2206 
2213  HDFDataset &operator=(HDFDataset &&other) = default;
2214 
2230  template <HDFCategory cat>
2231  HDFDataset(HDFObject<cat> &parent_object, std::string path,
2232  std::vector<hsize_t> capacity = {},
2233  std::vector<hsize_t> chunksizes = {}, hsize_t compress_level = 0)
2234 
2235  {
2236  open(parent_object, path, capacity, chunksizes, compress_level);
2237  }
2238 
2242  virtual ~HDFDataset() { close(); }
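 // Note: because the destructor calls close(), a scoped HDFDataset flushes
 // its buffered attributes and releases its HDF5 handles automatically at
 // the end of the enclosing scope.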
2243 }; // end of HDFDataset class
2244 
2253 void swap(HDFDataset &lhs, HDFDataset &rhs) { lhs.swap(rhs); }
2257 
2258 } // namespace DataIO
2259 } // namespace Utopia
2260 #endif // UTOPIA_DATAIO_HDFDATASET_HH
Class for hdf5 attribute, which can be attached to groups and datasets.
Definition: hdfattribute.hh:46
void write(Type attribute_data, std::vector< hsize_t > shape={})
Function for writing data to the attribute.
Definition: hdfattribute.hh:780
static auto buffer(Iter begin, Iter end, Adaptor &&adaptor)
static function for turning an iterator range with arbitrarty datatypes into a vector of data as retu...
Definition: hdfbufferfactory.hh:96
Class representing a HDFDataset, wich reads and writes data and attributes.
Definition: hdfdataset.hh:53
std::vector< std::pair< std::string, typename HDFType::Variant > > _attribute_buffer
A buffer for storing attributes before the dataset exists.
Definition: hdfdataset.hh:815
auto get_capacity()
get the maximum extend of the dataset
Definition: hdfdataset.hh:900
std::vector< hsize_t > _current_extent
the currently occupied size of the dataset in number of elements
Definition: hdfdataset.hh:776
void write(T &&data, [[maybe_unused]] std::vector< hsize_t > shape={})
Writes data of arbitrary type.
Definition: hdfdataset.hh:1227
HDFDataset(const HDFDataset &other)=default
Copy constructor.
auto get_current_extent()
get the current extend of the dataset
Definition: hdfdataset.hh:887
HDFDataspace get_filespace()
Get the file dataspace id.
Definition: hdfdataset.hh:861
void write(Iter begin, Iter end, Adaptor &&adaptor)
Write function for writing iterator ranges [start, end), in accordance with respective stl pattern.
Definition: hdfdataset.hh:1533
HDFDataset & operator=(const HDFDataset &other)=default
Assignment operator.
std::vector< hsize_t > _chunksizes
the chunksizes per dimensions if dataset is extendible or compressed
Definition: hdfdataset.hh:787
herr_t __read_container__(Type &buffer)
Read a cointainer.
Definition: hdfdataset.hh:426
HDFDataspace _filespace
file dataspace identifier
Definition: hdfdataset.hh:827
HDFDataspace get_memspace()
Get the memory dataspace id.
Definition: hdfdataset.hh:854
auto get_attribute_buffer()
Returns the attribute buffer of this dataset.
Definition: hdfdataset.hh:866
auto get_chunksizes()
Get the chunksizes vector.
Definition: hdfdataset.hh:907
herr_t __write_scalartype__(T data)
Writes simple scalars, which are not pointers, containers or strings.
Definition: hdfdataset.hh:387
void set_chunksize(std::vector< hsize_t > chunksizes)
Set the chunksize object.
Definition: hdfdataset.hh:941
auto read([[maybe_unused]] std::vector< hsize_t > start={}, [[maybe_unused]] std::vector< hsize_t > end={}, [[maybe_unused]] std::vector< hsize_t > stride={})
Read (a subset of) a dataset into a buffer of type 'Type'. Type gives the type of the buffer to read...
Definition: hdfdataset.hh:2037
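A sketch of full and strided partial reads, assuming a 1D dataset of doubles and that 'Type' is passed as a template argument; the returned tuple holds the shape of the read selection and the buffer (cf. the listing above).

// full read into a std::vector<double>
auto [shape, data] = dset.read<std::vector<double>>();

// strided partial read: every second element of the first 100 entries
auto [slice_shape, slice] =
    dset.read<std::vector<double>>({0}, {100}, {2});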
void __write_attribute_buffer__()
write out the attribute buffer
Definition: hdfdataset.hh:728
void open(const HDFObject< cat > &parent_object, std::string path, std::vector< hsize_t > capacity={}, std::vector< hsize_t > chunksizes={}, hsize_t compress_level=0)
Open the dataset in parent_object with relative path 'path'.
Definition: hdfdataset.hh:1065
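A sketch of opening (or creating) a dataset with explicit capacity and compression; 'parent' stands for an already-opened group or file object.

HDFDataset dset;

// 2D dataset: capacity 1000 x 100, chunksizes left empty to be guessed,
// deflate compression level 5
dset.open(parent, "/data/matrix", {1000, 100}, {}, 5);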
HDFDataset & operator=(HDFDataset &&other)=default
Move assignment operator.
herr_t __write_pointertype__(T data)
Writes pointer data; 'shape' is interpreted like numpy's shape argument.
Definition: hdfdataset.hh:341
void add_attribute(std::string attribute_path, Attrdata data)
add attribute to the dataset
Definition: hdfdataset.hh:976
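A sketch of attaching attributes (names and values are illustrative); attributes added before the dataset exists on disk are stored in _attribute_buffer and written out later.

dset.add_attribute("dim_names", std::vector<std::string>{"time", "space"});
dset.add_attribute("note", std::string("illustrative metadata"));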
virtual ~HDFDataset()
Destructor.
Definition: hdfdataset.hh:2242
auto __read_stringtype__(Type &buffer)
read string data, i.e., data which contains a single string.
Definition: hdfdataset.hh:696
HDFDataset(HDFDataset &&other)=default
Move constructor.
auto __read_pointertype__(Type buffer)
read pointertype.
Definition: hdfdataset.hh:710
HDFDataset(HDFObject< cat > &parent_object, std::string path, std::vector< hsize_t > capacity={}, std::vector< hsize_t > chunksizes={}, hsize_t compress_level=0)
Construct a new HDFDataset object.
Definition: hdfdataset.hh:2231
auto get_type()
Get the type object.
Definition: hdfdataset.hh:847
HDFIdentifier _parent_identifier
Identifier of the parent object.
Definition: hdfdataset.hh:766
std::size_t get_rank()
get the rank of the dataset, i.e. the dimensionality
Definition: hdfdataset.hh:880
HDFType _type
Type of the data the dataset holds.
Definition: hdfdataset.hh:821
std::vector< hsize_t > _new_extent
buffer for extent update
Definition: hdfdataset.hh:799
void swap(HDFDataset &other)
swap the state of the objects
Definition: hdfdataset.hh:1200
auto get_offset()
Get the offset object.
Definition: hdfdataset.hh:894
hsize_t _rank
number of dimensions of the dataset
Definition: hdfdataset.hh:771
HDFIdentifier get_parent_id()
get the identifier of the parent object
Definition: hdfdataset.hh:873
HDFDataspace _memspace
memory dataspace identifier
Definition: hdfdataset.hh:833
std::vector< hsize_t > _offset
offset of the data
Definition: hdfdataset.hh:793
auto __read_scalartype__(Type &buffer)
read scalar type, trivial
Definition: hdfdataset.hh:719
std::size_t _compress_level
the level of compression, 0 to 10
Definition: hdfdataset.hh:804
void write_nd(const boost::multi_array< T, d > &data, std::vector< hsize_t > offset={})
Write a boost::multi_array of arbitrary type and dimension to the dataset. The dataset needs to be of...
Definition: hdfdataset.hh:1574
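A sketch for boost::multi_array data, assuming the dataset was opened with sufficient capacity; the offset argument selects where the block is placed.

#include <algorithm>
#include <boost/multi_array.hpp>

boost::multi_array<double, 2> arr(boost::extents[4][5]);
std::fill_n(arr.data(), arr.num_elements(), 1.0);

dset.write_nd(arr);           // write at the origin
dset.write_nd(arr, {4, 0});   // write a second block at offset [4, 0]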
void close()
Close the dataset.
Definition: hdfdataset.hh:1031
auto get_compresslevel()
Get the compress level object.
Definition: hdfdataset.hh:914
void __create_dataset__(std::size_t typesize)
helper function for creating the actual dataset, chunked and compressed if requested
Definition: hdfdataset.hh:64
std::vector< hsize_t > _capacity
the maximum number of elements which can be stored in the dataset
Definition: hdfdataset.hh:781
herr_t __write_container__(T &&data)
Writes containers to the dataset.
Definition: hdfdataset.hh:163
void set_capacity(std::vector< hsize_t > capacity)
Set the capacity, and set the rank of the dataset to capacity.size().
Definition: hdfdataset.hh:921
void open(const HDFIdentifier &parent_identifier, std::string path, std::vector< hsize_t > capacity={}, std::vector< hsize_t > chunksizes={}, hsize_t compress_level=0)
Open the dataset in parent_object with relative path 'path'.
Definition: hdfdataset.hh:1090
herr_t __write_stringtype__(T data)
writes stringtypes
Definition: hdfdataset.hh:276
HDFDataset()=default
Default constructor.
Class that wraps an HDF5 dataspace and takes care of managing its resources.
Definition: hdfdataspace.hh:37
Wrapper class around an hdf5 identifier, used to manage reference counts of the object this identifie...
Definition: hdfidentifier.hh:29
hid_t get_id() const
Get the HDF5 id held by this object.
Definition: hdfidentifier.hh:53
bool is_valid() const
Check if this ID refers to a valid object.
Definition: hdfidentifier.hh:77
void set_id(hid_t id)
Set id to the given argument. Only to be used to invalidate objects upon move or similar.
Definition: hdfidentifier.hh:65
Common base class for all HDF5 classes in the DATAIO Module i.e., for all classes that wrap HDF5-C-Li...
Definition: hdfobject.hh:37
auto get_id_object() const
Get the id object.
Definition: hdfobject.hh:99
std::string _path
Name of the object.
Definition: hdfobject.hh:50
auto get_refcount()
Get the reference count of the object.
Definition: hdfobject.hh:131
std::string get_path() const
Get the name or path of the object.
Definition: hdfobject.hh:88
void close()
Close function which takes care of correctly closing the object and managing the reference counter.
Definition: hdfobject.hh:161
virtual bool is_valid() const
Check if the object is still valid.
Definition: hdfobject.hh:143
std::shared_ptr< spdlog::logger > _log
pointer to the logger for dataio
Definition: hdfobject.hh:56
HDFIdentifier _id
Identifier object that binds an instance of this class to an HDF5 object.
Definition: hdfobject.hh:44
hid_t get_C_id() const
Get the C id object.
Definition: hdfobject.hh:120
void bind_to(hid_t id, std::function< herr_t(hid_t) > closing_func, std::string path={})
Open the object and bind it to a HDF5 object identified by 'id' with name 'path'. Object should be cr...
Definition: hdfobject.hh:186
Class which handles the conversion of C-types into hdf5types.
Definition: hdftype.hh:136
void open(T &&object)
Open the HDF5 type associated with an HDFObject, i.e., a dataset or an attribute.
Definition: hdftype.hh:224
void close()
Close the held type.
Definition: hdftype.hh:322
auto type_category() const
Get the type category of the held type, i.e., scalar, string, varlen,...
Definition: hdftype.hh:199
std::size_t size() const
Size of the type held in bytes.
Definition: hdftype.hh:210
OutputIt transform(const Utopia::ExecPolicy policy, InputIt first1, InputIt last1, OutputIt d_first, UnaryOperation unary_op)
Apply a unary operator to a range and store the result in a new range.
Definition: parallel.hh:368
const Cont calc_chunksize(const hsize_t typesize, const Cont io_extend, Cont max_extend={}, const bool opt_inf_dims=true, const bool larger_high_dims=true, const unsigned int CHUNKSIZE_MAX=1048576, const unsigned int CHUNKSIZE_MIN=8192, const unsigned int CHUNKSIZE_BASE=262144)
Try to guess a good chunksize for a dataset.
Definition: hdfchunking.hh:604
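A sketch of a direct call, assuming the function lives in the Utopia::DataIO namespace; only typesize and io_extend are required, the remaining parameters keep their defaults.

// guess chunk sizes for a dataset of doubles with write extent 1000 x 1000
// that is unlimited along its first dimension
auto chunks = Utopia::DataIO::calc_chunksize(
    sizeof(double),
    std::vector<hsize_t>{1000, 1000},
    std::vector<hsize_t>{H5S_UNLIMITED, 1000});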
std::string to_string(const Config &node)
Given a config node, returns a string representation of it.
Definition: cfg_utils.hh:110
void swap(WriteTask< BGB, DW, DB, AWG, AWD > &lhs, WriteTask< BGB, DW, DB, AWG, AWD > &rhs)
Swaps the state of lhs and rhs.
Definition: write_task.hh:240
void select_slice(arma::Row< hsize_t > start, arma::Row< hsize_t > end, arma::Row< hsize_t > stride)
Select a slice in the dataspace defined by [start, end, stride] in the manner of numpy....
Definition: hdfdataspace.hh:225
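A sketch of a numpy-like selection on an HDFDataspace named 'filespace' (illustrative): select every second row of a 100 x 50 space, keeping all columns.

#include <armadillo>

arma::Row<hsize_t> start  = {0, 0};
arma::Row<hsize_t> end    = {100, 50};
arma::Row<hsize_t> stride = {2, 1};
filespace.select_slice(start, end, stride);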
auto path_is_valid(hid_t id, std::string path)
Check if the path given relative to the object identified by 'id' exists and points to a valid hdf5 o...
Definition: hdfutilities.hh:150
void swap(HDFDataset &lhs, HDFDataset &rhs)
Exchange state between lhs and rhs.
Definition: hdfdataset.hh:2253
hsize_t rank()
Get the dataspace's rank, i.e., the number of dimensions.
Definition: hdfdataspace.hh:46
void open()
Open the dataspace - set it to be equivalent to any data that later will be used to write or read.
Definition: hdfdataspace.hh:117
std::pair< arma::Row< hsize_t >, arma::Row< hsize_t > > get_properties()
Get the properties object: size and capacity. @notice The dimensions can be inferred from the size of ...
Definition: hdfdataspace.hh:67
This file implements a C++ class which wraps a C HDF5 attribute to a HDF5-object (group or dataset),...
In this file, a class for automatically creating intermediate buffer data structures between the user...
This file provides a class which is responsible for the automatic conversion between C/C++ types and ...
This file provides metafunctions for automatically determining the nature of a C/C++ types at compile...
auto end(zip< Containers... > &zipper)
End function like std::end.
Definition: zip.hh:550
auto begin(zip< Containers... > &zipper)
Begin function like std::begin.
Definition: zip.hh:537
std::string str(T &&t)
Turn any object for which operator<< exists into a string. Mostly useful for logging data via spdlog ...
Definition: ostream.hh:164
constexpr bool is_container_v
Shorthand for 'is_container<T>::value'.
Definition: type_traits.hh:181
typename remove_qualifier< T >::type remove_qualifier_t
Shorthand for 'typename remove_qualifier::value'.
Definition: type_traits.hh:97
constexpr bool is_array_like_v
Shorthand for is_array_like<T>::value.
Definition: type_traits.hh:633
constexpr bool is_string_v
Shorthand for 'is_string<T>::value'.
Definition: type_traits.hh:140
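A compile-time sketch of these shorthands, assuming they live in the Utopia::Utils namespace.

#include <array>
#include <string>
#include <vector>

static_assert(Utopia::Utils::is_container_v<std::vector<int>>);
static_assert(not Utopia::Utils::is_container_v<double>);
static_assert(Utopia::Utils::is_string_v<std::string>);
static_assert(Utopia::Utils::is_array_like_v<std::array<int, 3>>);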
Definition: agent.hh:11
Return the size of a Type T containing other types at compile time. If no object for which an overloa...
Definition: type_traits.hh:438