Utopia 2
Framework for studying models of complex & adaptive systems.
Loading...
Searching...
No Matches
hdfchunking.hh
Go to the documentation of this file.
1#ifndef UTOPIA_DATAIO_HDFCHUNKING_HH
2#define UTOPIA_DATAIO_HDFCHUNKING_HH
3
4#include <numeric>
5#include <sstream>
6
7#include <hdf5.h>
8
9#include "../core/logging.hh"
10
11#include "hdfutilities.hh"
12
13namespace Utopia
14{
15namespace DataIO
16{
27// ++ Helper functions ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
28namespace _chunk_helpers
29{
30
41template < typename Cont, typename Predicate >
42std::vector< unsigned short >
44{
45 // Create the return container
46 std::vector< unsigned short > idcs;
47
48 // Repeatedly start iterating over the vector until reached the end
49 auto iter = vec.begin();
50 while ((iter = std::find_if(iter, vec.end(), pred)) != vec.end())
51 {
52 // Add the value of the iterator to the indices vector
53 idcs.push_back(std::distance(vec.begin(), iter));
54
55 // Increment iterator to continue with next element
56 iter++;
57 }
58
59 return idcs;
60};
61
63template < typename Cont = std::vector< hsize_t > >
64std::string
66{
67 std::stringstream s;
68 s << "{ ";
69 for (auto& extd : vec)
70 {
71 if (extd < H5S_UNLIMITED)
72 {
73 s << extd << " ";
74 }
75 else
76 {
77 s << "∞ ";
78 }
79 }
80 s << "}";
81 return s.str();
82};
83
84// -- Optimization algorithms -------------------------------------------------
85
111template < typename Cont, typename Logger >
112void
114 double bytes_target,
115 const hsize_t typesize,
116 const unsigned int CHUNKSIZE_MAX,
117 const unsigned int CHUNKSIZE_MIN,
118 const bool larger_high_dims,
119 const Logger& log)
120{
121 // Helper lambda for calculating bytesize of a chunks configuration
122 auto bytes = [&typesize](Cont c) {
123 return typesize *
124 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
125 };
126
127 // Check the case of typesize larger than CHUNKSIZE_MAX; cannot do anything
128 // in that case -> safer to throw an exception.
130 {
131 throw std::invalid_argument("Cannot use opt_chunks_target with a "
132 "typesize larger than CHUNKSIZE_MAX!");
133 }
134
135 log->debug("Starting optimization towards target size:"
136 " {:7.0f}B ({:.1f} kiB)",
138 bytes_target / 1024.);
139
140 // Ensure the target chunk size is between CHUNKSIZE_MIN and CHUNKSIZE_MAX
141 // in order to not choose too large or too small chunks
143 {
145
146 log->debug("Target size too large! New target size:"
147 " {:7.0f}B ({:.1f} kiB)",
149 bytes_target / 1024.);
150 }
151 else if (bytes_target < CHUNKSIZE_MIN)
152 {
154
155 log->debug("Target size too small! New target size:"
156 " {:7.0f}B ({:.1f} kiB)",
158 bytes_target / 1024.);
159 }
160
161 // ... and a variable that will store the size (in bytes) of this specific
162 // chunk configuration
163 std::size_t bytes_chunks;
164
165 // Calculate the rank (need it to know iteration -> dim mapping)
166 auto rank = chunks.size();
167
168 /* Now optimize the chunks for each dimension by repeatedly looping over
169 * the vector and dividing the values by two (rounding up).
170 *
171 * The loop is left when the following condition is fulfilled:
172 * within 50% of target chunk size
173 * AND
174 * within bounds of minimum and maximum chunk size
175 *
176 * NOTE Limiting the optimization to 42 iterations in order to be on the
177 * safe side. This goes through all entries (rank / 42) times and
178 * halves or doubles the chunk extent.
179 */
180 for (unsigned short i = 0; i < 42 * rank; i++)
181 {
182 // With the current values of the chunks, calculate the chunk size
184
185 log->debug("Chunks: {} -> {:7d} B ({:.1f} kiB)",
186 to_str(chunks),
188 bytes_chunks / 1024.);
189
190 // If close enough to target size, optimization is finished
191 if ((std::abs(bytes_chunks - bytes_target) / bytes_target < 0.5) &&
193 {
194 log->debug("Close enough to target size now.");
195 break;
196 }
197 // else: not yet close enough
198
199 // Calculate the dimension this iteration belongs to
200 auto dim = i % rank;
201
202 // Adjust the chunksize towards the target size
204 {
205 // Can increase the size of the chunk extend in the current dim
206
207 // If high dimensions should be favoured, change the dim to work
208 // on such that first the high dimensions are increased in size
210 {
211 dim = (rank - 1) - dim;
212 }
213
214 // Multiply by two
215 log->debug("Doubling extend of chunk dimension {} ...", dim);
216 chunks[dim] = chunks[dim] * 2;
217 }
218 else
219 {
220 // Need to decrease the size of the chunk extend in the current dim
221
222 // For the larger_high_dims option, smaller dimensions are to be
223 // favoured. However, in order to allow a reduction, these need to
224 // have a chunk extend that is larger than one; once that is no
225 // longer fulfilled, the below if condition will not perform
226 // any change of the dim variable.
227 if (larger_high_dims && rank > 1 && dim > 0 && chunks[dim - 1] > 1)
228 {
229 // Stay on low dimensions one step longer
230 if (dim > 0)
231 {
232 dim--;
233 }
234
235 // Skip the reduction if this is the last dim
236 if (dim == rank - 1)
237 {
238 log->debug("Skipping reduction of chunk dimension {}, "
239 "because it is the highest ...",
240 dim);
241 continue;
242 }
243 }
244
245 // Do not continue if halving is not possible
246 if (chunks[dim] == 1)
247 {
248 log->debug("Extend of chunk dimension {} is already 1.", dim);
249 continue;
250 }
251
252 // TODO generalise the above if blocks! The cleanest way would be
253 // to determine which chunk dimensions _can_ be reduced before
254 // determining the dimension that is to be reduced. This would
255 // alleviate the iterations in which the chunk extent is 1
256 // and no halving can take place ...
257
258 // Divide the chunk size of the current dim by two
259 log->debug("Halving extend of chunk dimension {} ...", dim);
260 chunks[dim] = 1 + ((chunks[dim] - 1) / 2); // ceiling!
261 // NOTE integer division fun; can do this because all are unsigned
262 // and the chunks entry is always nonzero
263 }
264 }
265
266 return;
267}
268
303template < typename Cont, typename Logger >
304void
306 const Cont& max_extend,
307 const hsize_t typesize,
308 const unsigned int CHUNKSIZE_MAX,
309 const bool opt_inf_dims,
310 const bool larger_high_dims,
311 const Logger& log)
312{
313 // Helper lambda for calculating bytesize of a chunks configuration
314 auto bytes = [&typesize](Cont c) {
315 return typesize *
316 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
317 };
318
319 // Check the case of typesize larger than CHUNKSIZE_MAX; cannot do anything
320 // in that case -> safer to throw an exception.
322 {
323 throw std::invalid_argument(
324 "Cannot use opt_chunks_with_max_extend "
325 "with a typesize larger than CHUNKSIZE_MAX!");
326 }
327
328 // .. Parse dims and prepare algorithm ....................................
329
330 // Determine the finite dims
331 auto dims_fin =
332 find_all_idcs(max_extend, [](auto l) { return l != H5S_UNLIMITED; });
333 // Ideally, an integer multiple of the chunk size along this dim should
334 // be equal to the maximum extend
335
336 // Determine the infinite dims
337 auto dims_inf =
338 find_all_idcs(max_extend, [](auto l) { return l == H5S_UNLIMITED; });
339 // As the final extend along these dims is not known, we can not make a
340 // good guess for these. Instead, we should use the leverage we have for
341 // optimizing the chunk size along the finite dims. The infinite dims will
342 // thus, most likely, end up with shorter chunk sizes.
343
344 // Declare a container type for storing indices, same as those returned by
345 // the find_all_idcs function
346 using IdxCont = decltype(dims_fin);
347
348 // Create a container with the available dimension indices
349 IdxCont dims(chunks.size());
350 std::iota(dims.begin(), dims.end(), 0);
351
352 // Among the finite dims, determine the dims that can still be filled,
353 // i.e. those where the chunk size does not reach the max_extend
355 for (auto dim : dims_fin)
356 {
357 if (max_extend[dim] > chunks[dim])
358 {
359 dims_fillable.push_back(dim);
360 }
361 }
362
363 // Check if to reverse index containers to favour higher dims
365 {
366 // Reverse all index containers
367 std::reverse(dims_fillable.begin(), dims_fillable.end());
368 std::reverse(dims_fin.begin(), dims_fin.end());
369 std::reverse(dims_inf.begin(), dims_inf.end());
370
371 // NOTE do not actually _need_ to reverse the finite dims container,
372 // doing it only for consistency.
373 }
374
375 // .. Optimization of finite (and still fillable) dims ....................
376
377 if (!dims_fillable.size())
378 {
379 log->debug("No finite dimensions available to optimize.");
380 }
381 else
382 {
383 log->debug("Optimizing {} finite dimension(s) where max_extend is not "
384 "yet reached ...",
385 dims_fillable.size());
386
387 // Loop over the fillable dims indices
388 for (auto dim : dims_fillable)
389 {
390 // Check if there is still potential for optimization
391 // NOTE this could be more thorough
392 if (bytes(chunks) == CHUNKSIZE_MAX)
393 {
394 log->debug("Reached maximum chunksize.");
395 break;
396 }
397
398 // Check if the max_extend is an integer multiple of the chunksize
399 if (max_extend[dim] % chunks[dim] == 0)
400 {
401 // Find the divisor
402 std::size_t factor = max_extend[dim] / chunks[dim];
403
404 // It might fit in completely ...
406 {
407 // It does. Adjust chunks and continue with next dim
408 log->debug("Dimension {} can be filled completely. "
409 "Factor: {}",
410 dim,
411 factor);
412 chunks[dim] = chunks[dim] * factor;
413 continue;
414 }
415 // else: would not fit in completely
416
417 // Starting from the largest possible factor, find the largest
418 // integer divisor of the original factor, i.e.: one that will
419 // also completely cover the max_extend
420 for (std::size_t div = (CHUNKSIZE_MAX / bytes(chunks));
421 div >= 1;
422 div--)
423 {
424 // Check if it is an integer divisor
425 if (factor % div == 0)
426 {
427 // Yes! The _new_ factor is now this value
428 factor = div;
429 break;
430 }
431 }
432 // NOTE Covers the edge case of max. factor == 1: the loop will
433 // perform only one iteration and resulting factor will be 1,
434 // leading (effectively) to no scaling.
435
436 // Scale the chunksize with this factor
437 if (factor > 1)
438 {
439 log->debug(
440 "Scaling dimension {} with factor {} ...", dim, factor);
441
442 chunks[dim] = chunks[dim] * factor;
443 }
444 }
445 else
446 {
447 // Not divisible. Check if the max_extend could be reached w/o
448 // exceeding the max chunksize
449 const double factor = double(max_extend[dim]) / chunks[dim];
450
452 {
453 // Yep. Just extend this dimension to the max_extend, done.
454 log->debug("Dimension {} can be filled completely. "
455 "(difference: {}, factor: {})",
456 dim,
457 max_extend[dim] - chunks[dim],
458 factor);
459
460 chunks[dim] = max_extend[dim];
461 }
462 else
463 {
464 // Cannot further extend.
465 log->debug("Dimension {} cannot be extended to fill "
466 "max_extend without exceeding maximum "
467 "chunksize! "
468 "(difference: {}, factor: {})",
469 dim,
470 max_extend[dim] - chunks[dim],
471 factor);
472 }
473 }
474 // Done with this index
475 }
476 }
477
478 // .. Optimization of infinite dims .......................................
479
480 if (!opt_inf_dims)
481 {
482 log->debug("Optimization of unlimited dimensions is disabled.");
483 }
484 else if (!dims_inf.size())
485 {
486 log->debug("No unlimited dimensions available to optimize.");
487 }
488 else if (bytes(chunks) == CHUNKSIZE_MAX)
489 {
490 log->debug("Cannot further optimize using unlimited dimensions.");
491 }
492 else
493 {
494 log->debug("Optimizing {} unlimited dimension(s) to fill the maximum "
495 "chunk size ...",
496 dims_inf.size());
497
498 // Loop over indices of inf. dims
499 // NOTE Depending on the chunk sizes, this might only have an effect
500 // on the first index considered ... but that's fine for now.
501 for (auto dim : dims_inf)
502 {
503 // Calculate the factor to make the chunk as big as possible
504 const std::size_t factor = CHUNKSIZE_MAX / bytes(chunks); // floors
505
506 // If large enough, scale it by that factor
507 if (factor > 1)
508 {
509 log->debug(
510 "Scaling dimension {} with factor {} ...", dim, factor);
511
512 chunks[dim] = chunks[dim] * factor;
513 }
514 }
515 }
516
517 // Done.
518 // Check if everything went fine (only a safeguard ...)
520 {
521 throw std::runtime_error("Calculated chunks exceed CHUNKSIZE_MAX! "
522 "This should not have happened!");
523 }
524
525 return;
526}
527
528} // namespace helpers
529
530// ++ The actual guess_chunksize method +++++++++++++++++++++++++++++++++++++++
531
601// NOTE Could include compression level here in the future
602template < typename Cont = std::vector< hsize_t > >
603const Cont
605 const Cont io_extend,
606 Cont max_extend = {},
607 const bool opt_inf_dims = true,
608 const bool larger_high_dims = true,
609 const unsigned int CHUNKSIZE_MAX = 1048576, // 1M
610 const unsigned int CHUNKSIZE_MIN = 8192, // 8k
611 const unsigned int CHUNKSIZE_BASE = 262144) // 256k
612{
613 // Make the helper functions available
614 using namespace _chunk_helpers;
615
616 // Helper lambda for calculating bytesize of a chunks configuration
617 auto bytes = [&typesize](Cont c) {
618 return typesize *
619 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
620 };
621
622 // Get a logger to use here; note that it needs to have been set up outside
623 // of here beforehand!
624 const auto log = spdlog::get("data_io");
625
626 // .. Check correctness of arguments and extract some info ................
627 // Get the rank
628 unsigned short rank = io_extend.size();
629
630 // For scalar datasets, chunking is not available
631 if (rank == 0)
632 {
633 throw std::invalid_argument("Cannot guess chunksize for a scalar "
634 "dataset!");
635 }
636
637 // Make sure io_extend has no illegal values (<=0)
638 for (const auto& val : io_extend)
639 {
640 if (val <= 0)
641 {
642 throw std::invalid_argument(
643 "Argument 'io_extend' contained "
644 "illegal (zero or negative) value(s)! io_extend: " +
646 }
647 }
648
649 // Find out if the max_extend is given and determine whether dset is finite
650 bool dset_finite;
651 bool all_dims_inf;
652
653 if (max_extend.size())
654 {
655 // Yes, was given. Need to check that the max_extend values are ok.
656 // Check that it matches the rank
657 if (max_extend.size() != rank)
658 {
659 throw std::invalid_argument(
660 "Argument 'max_extend' does not have the same dimensionality "
661 "as the rank of this dataset (as extracted from the "
662 "'io_extend' argument).");
663 }
664
665 // And that all values are valid, i.e. larger than corresp.io_extend
666 for (unsigned short i = 0; i < rank; i++)
667 {
668 if (max_extend[i] < io_extend[i])
669 {
670 throw std::invalid_argument(
671 "Index " + std::to_string(i) +
672 " of argument 'max_extend' (" + to_str(max_extend) +
673 ") was smaller than the corresponding 'io_extend' (" +
674 to_str(io_extend) + ") value! ");
675 }
676 }
677 // max_extend content is valid now
678
679 // Now extract information on the properties of max_extend
680 // Need to check whether any dataset dimension can be infinitely long
681 dset_finite = (std::find(max_extend.begin(),
682 max_extend.end(),
683 H5S_UNLIMITED) ==
684 max_extend.end()); // i.e., H5S_UNLIMITED _not_ found
685
686 // Or even all are infinitely long
687 all_dims_inf = true;
688 for (const auto& ext : max_extend)
689 {
690 if (ext < H5S_UNLIMITED)
691 {
692 // This one is not infinite
693 all_dims_inf = false;
694 break;
695 }
696 }
697 }
698 else
699 {
700 // max_extend not given
701 // Have to assume the max_extend is the same as the io_extend
702 // Thus, the properties are known:
703 dset_finite = true;
704 all_dims_inf = false;
705
706 // Set the values to those of io_extend
707 max_extend.insert(
708 max_extend.begin(), io_extend.begin(), io_extend.end());
709 }
710
711 // NOTE max_extend is now a vector of same rank as io_extend
712 log->info("Calculating optimal chunk size for io_extend {} and "
713 "max_extend {} ...",
716 log->debug("rank: {}", rank);
717 log->debug("finite dset? {}", dset_finite);
718 log->debug("all dims infinite? {}", all_dims_inf);
719 log->debug("optimize inf dims? {}", opt_inf_dims);
720 log->debug("larger high dims? {}", larger_high_dims);
721 log->debug("typesize: {}", typesize);
722 log->debug("max. chunksize: {:7d} ({:.1f} kiB)",
724 CHUNKSIZE_MAX / 1024.);
725 log->debug("min. chunksize: {:7d} ({:.1f} kiB)",
727 CHUNKSIZE_MIN / 1024.);
728 log->debug("base chunksize: {:7d} ({:.1f} kiB)",
730 CHUNKSIZE_BASE / 1024.);
731
732 // .. For the simple cases, evaluate the chunksize directly ...............
733
734 // For large typesizes, each chunk can at most contain a single element.
735 // Chunks that extend to more than one element require a typesize smaller
736 // than half the maximum chunksize.
737 if (typesize > CHUNKSIZE_MAX / 2)
738 {
739 log->debug("Type size >= 1/2 max. chunksize -> Each cell needs to be "
740 "its own chunk.");
741 return Cont(rank, 1);
742 }
743
744 // For a finite dataset, that would fit into CHUNKSIZE_MAX when maximally
745 // extended, we can only have (and only need!) a single chunk
747 {
748 log->debug("Maximally extended dataset will fit into single chunk.");
749 return Cont(max_extend);
750 }
751
752 // .. Step 1: Optimize for one I/O operation fitting into chunk ...........
753 log->debug("Cannot apply simple optimizations. Try to fit single I/O "
754 "operation into a chunk ...");
755
756 // Create the temporary container that will store the chunksize values.
757 // It starts with a copy of the extend values for I/O operations.
759
760 // Determine the size (in bytes) of a write operation with this extend
761 const auto bytes_io = bytes(io_extend);
762 log->debug(
763 "I/O operation size: {:7d} ({:.1f} kiB)", bytes_io, bytes_io / 1024.);
764
765 // Determine if an I/O operation fits into a single chunk, then decide on
766 // how to optimize accordingly
768 {
769 // The I/O operation does _not_ fit into a chunk
770 // Aim to fit the I/O operation into the chunk -> target: max chunksize
771 log->debug("Single I/O operation does not fit into chunk.");
772 log->debug("Trying to use the fewest possible chunks for a single "
773 "I/O operation ...");
774
776 CHUNKSIZE_MAX, // <- target value
777 typesize,
781 log);
782 // NOTE The algorithm is also able to _increase_ the chunk size in
783 // certain dimensions. However, with _chunks == io_extend and the
784 // knowledge that the current bytesize of _chunks is above the
785 // maximum size, the chunk extensions will only be _reduced_.
786 }
788 {
789 // The I/O operation _does_ fit into a chunk, but the dataset is
790 // infinite in _all directions_ and small chunksizes can be very
791 // inefficient -> optimize towards some base value
792 log->debug("Single I/O operation does fit into chunk.");
793 log->debug("Optimizing chunks in unlimited dimensions to be closer "
794 "to base chunksize ...");
795
797 CHUNKSIZE_BASE, // <- target value
798 typesize,
802 log);
803 // NOTE There is no issue with going beyond the maximum chunksize here
804 }
805 else
806 {
807 // no other optimization towards a target size make sense
808 log->debug("Single I/O operation does fit into a chunk.");
809 }
810
811 // To be on the safe side: Check that _chunks did not exceed max_extend
812 for (unsigned short i = 0; i < rank; i++)
813 {
814 if (_chunks[i] > max_extend[i])
815 {
816 log->warn("Optimization led to chunks larger than max_extend. "
817 "This should not have happened!");
818 _chunks[i] = max_extend[i];
819 }
820 }
821
822 // .. Step 2: Optimize by taking the max_extend into account ..............
823
824 // This is only possible if the current chunk size is not already above the
825 // upper limit, CHUNKSIZE_MAX, and the max_extend is not already reached.
826 // Also, it should not be enabled if the optimization towards unlimited
827 // dimensions was already performed
828 if (!(opt_inf_dims && all_dims_inf) && (_chunks != max_extend) &&
830 {
831 log->debug("Have max_extend information and can (potentially) use it "
832 "to optimize chunk extensions.");
833
836 typesize,
840 log);
841 }
842 // else: no further optimization possible
843
844 // Done.
845 // Make sure that chunksize is smaller than maximum chunksize
847 {
848 throw std::runtime_error(
849 "Byte size of chunks " + to_str(_chunks) +
850 " is larger than CHUNKSIZE_MAX! This should not have happened!");
851 }
852
853 // Create a const version of the temporary chunks vector
854 const Cont chunks(_chunks);
855 log->info("Optimized chunk size: {}", to_str(chunks));
856
857 return chunks;
858}
// end of group ChunkingUtilities // end of group DataIO
861
862} // namespace DataIO
863} // namespace Utopia
864#endif // UTOPIA_DATAIO_HDFCHUNKING_HH
const Cont calc_chunksize(const hsize_t typesize, const Cont io_extend, Cont max_extend={}, const bool opt_inf_dims=true, const bool larger_high_dims=true, const unsigned int CHUNKSIZE_MAX=1048576, const unsigned int CHUNKSIZE_MIN=8192, const unsigned int CHUNKSIZE_BASE=262144)
Try to guess a good chunksize for a dataset.
Definition hdfchunking.hh:604
Container select_entities(const Manager &mngr, const DataIO::Config &sel_cfg)
Select entities according to parameters specified in a configuration.
Definition select.hh:213
This file provides metafunctions for automatically determining the nature of a C/C++ types at compile...
void opt_chunks_with_max_extend(Cont &chunks, const Cont &max_extend, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const bool opt_inf_dims, const bool larger_high_dims, const Logger &log)
Optimize chunk sizes using max_extend information.
Definition hdfchunking.hh:305
std::vector< unsigned short > find_all_idcs(Cont &vec, Predicate pred)
Finds all indices of elements in a vector that matches the given predicate.
Definition hdfchunking.hh:43
void opt_chunks_target(Cont &chunks, double bytes_target, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const unsigned int CHUNKSIZE_MIN, const bool larger_high_dims, const Logger &log)
Optimizes the chunks along all axes to find a good default.
Definition hdfchunking.hh:113
std::string to_str(const Cont &vec)
Helper function to create a string representation of containers.
Definition hdfchunking.hh:65
Definition agent.hh:11