1 #ifndef UTOPIA_DATAIO_HDFCHUNKING_HH
2 #define UTOPIA_DATAIO_HDFCHUNKING_HH
9 #include "../core/logging.hh"
28 namespace _chunk_helpers
// Helper that gathers the positions of all container elements satisfying a
// predicate. The line carrying the function name and parameters is not part
// of this listing; the body below refers to them as `vec` (the container)
// and `pred` (the unary predicate).
41 template <
typename Cont,
typename Predicate >
42 std::vector< unsigned short >
// Collected indices; note they are stored as unsigned short.
46 std::vector< unsigned short > idcs;
// Search repeatedly, each time starting from the current iterator position,
// until std::find_if reports no further match.
49 auto iter = vec.begin();
50 while ((iter = std::find_if(iter, vec.end(), pred)) != vec.end())
// Record the match as its offset from the beginning of the container.
// NOTE(review): the iterator advance and the return statement are not
// visible in this listing.
53 idcs.push_back(std::distance(vec.begin(), iter));
// Helper that creates a string representation of a container (per the
// reference index at the end of this listing). The signature line and most
// of the body are not visible here; the container is referred to as `vec`.
63 template <
typename Cont = std::vector< h
size_t > >
// Walk over every entry of the container ...
69 for (
auto& extd : vec)
// ... and treat entries below H5S_UNLIMITED separately from the rest.
// NOTE(review): both branch bodies are missing from this listing.
71 if (extd < H5S_UNLIMITED)
// Iteratively doubles or halves individual entries of `chunks` until the
// byte size of a chunk is close to `bytes_target`.
//
// Visible parameters:
//   typesize          size of one element
//   CHUNKSIZE_MAX     upper bound for the byte size of a chunk
//   CHUNKSIZE_MIN     lower bound for the byte size of a chunk
//   larger_high_dims  if set, the dimension index is mirrored so that the
//                     last dimension is addressed first
// `chunks`, `bytes_target` and `log` are used below but their declarations
// are on lines missing from this listing.
//
// Throws std::invalid_argument if typesize exceeds CHUNKSIZE_MAX.
111 template <
typename Cont,
typename Logger >
115 const hsize_t typesize,
116 const unsigned int CHUNKSIZE_MAX,
117 const unsigned int CHUNKSIZE_MIN,
118 const bool larger_high_dims,
// Local helper computing the size of a chunk configuration; it takes the
// container by value and captures typesize by reference. The beginning of
// the return expression is not visible (presumably typesize times the
// product of all entries -- TODO confirm).
// NOTE(review): the initial value passed to std::accumulate is the int
// literal 1, so the product is accumulated in int and may overflow for
// large extents.
122 auto bytes = [&typesize](Cont c) {
124 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
// A single element must fit into the largest allowed chunk.
129 if (typesize > CHUNKSIZE_MAX)
131 throw std::invalid_argument(
"Cannot use opt_chunks_target with a "
132 "typesize larger than CHUNKSIZE_MAX!");
135 log->debug(
"Starting optimization towards target size:"
136 " {:7.0f}B ({:.1f} kiB)",
138 bytes_target / 1024.);
// Clamp the target size into [CHUNKSIZE_MIN, CHUNKSIZE_MAX].
142 if (bytes_target > CHUNKSIZE_MAX)
144 bytes_target = CHUNKSIZE_MAX;
146 log->debug(
"Target size too large! New target size:"
147 " {:7.0f}B ({:.1f} kiB)",
149 bytes_target / 1024.);
151 else if (bytes_target < CHUNKSIZE_MIN)
153 bytes_target = CHUNKSIZE_MIN;
155 log->debug(
"Target size too small! New target size:"
156 " {:7.0f}B ({:.1f} kiB)",
158 bytes_target / 1024.);
// Byte size of the current chunk configuration, refreshed every iteration.
163 std::size_t bytes_chunks;
// Number of dimensions.
166 auto rank = chunks.size();
// The optimization is bounded to 42 * rank iterations.
180 for (
unsigned short i = 0; i < 42 * rank; i++)
183 bytes_chunks = bytes(chunks);
185 log->debug(
"Chunks: {} -> {:7d} B ({:.1f} kiB)",
188 bytes_chunks / 1024.);
// Stop criterion: relative deviation from the target below 50 % and the
// chunk size within the allowed bounds.
191 if ((std::abs(bytes_chunks - bytes_target) / bytes_target < 0.5) &&
192 bytes_chunks <= CHUNKSIZE_MAX && bytes_chunks >= CHUNKSIZE_MIN)
194 log->debug(
"Close enough to target size now.");
// Chunk still too small: enlarge one dimension.
203 if (bytes_chunks < bytes_target)
// Mirror the index so that counting starts at the last dimension.
// (The statement that first assigns `dim` is not visible in this listing.)
209 if (larger_high_dims)
211 dim = (rank - 1) - dim;
215 log->debug(
"Doubling extend of chunk dimension {} ...", dim);
216 chunks[dim] = chunks[dim] * 2;
// Reduction path: with larger_high_dims, a dimension is not reduced while
// the dimension directly before it still has an extent above 1.
227 if (larger_high_dims && rank > 1 && dim > 0 && chunks[dim - 1] > 1)
238 log->debug(
"Skipping reduction of chunk dimension {}, "
239 "because it is the highest ...",
// An extent of 1 cannot be reduced further.
246 if (chunks[dim] == 1)
248 log->debug(
"Extend of chunk dimension {} is already 1.", dim);
259 log->debug(
"Halving extend of chunk dimension {} ...", dim);
// Halve, rounding up: equals ceil(chunks[dim] / 2) for extents >= 1, so the
// result never drops to zero.
260 chunks[dim] = 1 + ((chunks[dim] - 1) / 2);
// Enlarges the entries of `chunks` using the information in `max_extend`,
// without letting the byte size of a chunk exceed CHUNKSIZE_MAX.
//
// Visible parameters:
//   max_extend        maximum extent per dimension; H5S_UNLIMITED marks an
//                     unlimited dimension
//   typesize          size of one element
//   CHUNKSIZE_MAX     upper bound for the byte size of a chunk
//   opt_inf_dims      presumably enables the unlimited-dimension stage; the
//                     condition using it is not visible -- TODO confirm
//   larger_high_dims  if set, dimensions are processed in reversed order
// `chunks` and `log` are used below but declared on lines missing from this
// listing.
//
// Throws std::invalid_argument if typesize exceeds CHUNKSIZE_MAX and
// std::runtime_error if the final chunks exceed CHUNKSIZE_MAX.
303 template <
typename Cont,
typename Logger >
306 const Cont& max_extend,
307 const hsize_t typesize,
308 const unsigned int CHUNKSIZE_MAX,
309 const bool opt_inf_dims,
310 const bool larger_high_dims,
// Local helper computing the size of a chunk configuration; the beginning
// of the return expression is not visible (presumably typesize times the
// product of all entries -- TODO confirm).
// NOTE(review): the initial value passed to std::accumulate is the int
// literal 1, so the product is accumulated in int and may overflow for
// large extents.
314 auto bytes = [&typesize](Cont c) {
316 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
// A single element must fit into the largest allowed chunk.
321 if (typesize > CHUNKSIZE_MAX)
323 throw std::invalid_argument(
324 "Cannot use opt_chunks_with_max_extend "
325 "with a typesize larger than CHUNKSIZE_MAX!");
// Indices of the finite dimensions (presumably stored in dims_fin; the
// left-hand side of the assignment is not visible).
332 find_all_idcs(max_extend, [](
auto l) {
return l != H5S_UNLIMITED; });
// Indices of the unlimited dimensions (presumably stored in dims_inf).
338 find_all_idcs(max_extend, [](
auto l) {
return l == H5S_UNLIMITED; });
346 using IdxCont = decltype(dims_fin);
// All dimension indices 0, 1, ..., chunks.size() - 1.
349 IdxCont dims(chunks.size());
350 std::iota(dims.begin(), dims.end(), 0);
// Finite dimensions in which the chunk is still smaller than max_extend.
354 IdxCont dims_fillable;
355 for (
auto dim : dims_fin)
357 if (max_extend[dim] > chunks[dim])
359 dims_fillable.push_back(dim);
// Reverse the processing order so that high dimensions come first.
364 if (larger_high_dims)
367 std::reverse(dims_fillable.begin(), dims_fillable.end());
368 std::reverse(dims_fin.begin(), dims_fin.end());
369 std::reverse(dims_inf.begin(), dims_inf.end());
// --- Stage 1: finite dimensions ---
377 if (!dims_fillable.size())
379 log->debug(
"No finite dimensions available to optimize.");
383 log->debug(
"Optimizing {} finite dimension(s) where max_extend is not "
385 dims_fillable.size());
388 for (
auto dim : dims_fillable)
// Nothing left to gain once the chunk is exactly at the maximum size.
392 if (bytes(chunks) == CHUNKSIZE_MAX)
394 log->debug(
"Reached maximum chunksize.");
// Case A: max_extend is an integer multiple of the current chunk extent.
399 if (max_extend[dim] % chunks[dim] == 0)
402 std::size_t factor = max_extend[dim] / chunks[dim];
// Scaling by the full factor still fits -> fill the dimension completely.
405 if (factor * bytes(chunks) <= CHUNKSIZE_MAX)
408 log->debug(
"Dimension {} can be filled completely. "
412 chunks[dim] = chunks[dim] * factor;
// Otherwise iterate over candidate divisors, starting at the largest scale
// that still fits, and test whether they divide the factor. (The rest of
// the loop header and the body handling a match are not visible.)
420 for (std::size_t div = (CHUNKSIZE_MAX / bytes(chunks));
425 if (factor % div == 0)
440 "Scaling dimension {} with factor {} ...", dim, factor);
442 chunks[dim] = chunks[dim] * factor;
// Case B: max_extend is not a multiple; work with a fractional factor.
449 const double factor = double(max_extend[dim]) / chunks[dim];
451 if (factor * bytes(chunks) <= CHUNKSIZE_MAX)
454 log->debug(
"Dimension {} can be filled completely. "
455 "(difference: {}, factor: {})",
457 max_extend[dim] - chunks[dim],
460 chunks[dim] = max_extend[dim];
465 log->debug(
"Dimension {} cannot be extended to fill "
466 "max_extend without exceeding maximum "
468 "(difference: {}, factor: {})",
470 max_extend[dim] - chunks[dim],
// --- Stage 2: unlimited dimensions ---
// (The condition of the first branch is not visible in this listing.)
482 log->debug(
"Optimization of unlimited dimensions is disabled.");
484 else if (!dims_inf.size())
486 log->debug(
"No unlimited dimensions available to optimize.");
488 else if (bytes(chunks) == CHUNKSIZE_MAX)
490 log->debug(
"Cannot further optimize using unlimited dimensions.");
494 log->debug(
"Optimizing {} unlimited dimension(s) to fill the maximum "
501 for (
auto dim : dims_inf)
// Largest integer factor by which the chunk can grow without exceeding
// CHUNKSIZE_MAX (integer division rounds down).
504 const std::size_t factor = CHUNKSIZE_MAX / bytes(chunks);
510 "Scaling dimension {} with factor {} ...", dim, factor);
512 chunks[dim] = chunks[dim] * factor;
// Final sanity check on the result.
519 if (bytes(chunks) > CHUNKSIZE_MAX)
521 throw std::runtime_error(
"Calculated chunks exceed CHUNKSIZE_MAX! "
522 "This should not have happened!");
// Tries to guess a good chunk size for a dataset.
//
// Visible parameters:
//   io_extend         extent of a single I/O operation; its size defines
//                     the rank of the dataset
//   max_extend        maximum extent of the dataset; defaults to empty
//   opt_inf_dims      whether to optimize along unlimited dimensions
//                     (default: true)
//   larger_high_dims  default: true
//   CHUNKSIZE_MAX     largest allowed chunk size, default 1048576 (1 MiB)
//   CHUNKSIZE_MIN     smallest allowed chunk size, default 8192 (8 KiB)
//   CHUNKSIZE_BASE    base chunk size, default 262144 (256 KiB)
// `typesize` is used below but declared on a line missing from this listing.
//
// Throws std::invalid_argument for invalid io_extend / max_extend arguments
// and std::runtime_error if the computed chunks exceed CHUNKSIZE_MAX.
602 template <
typename Cont = std::vector< h
size_t > >
605 const Cont io_extend,
606 Cont max_extend = {},
607 const bool opt_inf_dims =
true,
608 const bool larger_high_dims =
true,
609 const unsigned int CHUNKSIZE_MAX = 1048576,
610 const unsigned int CHUNKSIZE_MIN = 8192,
611 const unsigned int CHUNKSIZE_BASE = 262144)
// Make the helper functions available unqualified.
614 using namespace _chunk_helpers;
// Local helper computing the size of a chunk configuration; the beginning
// of the return expression is not visible (presumably typesize times the
// product of all entries -- TODO confirm).
// NOTE(review): the initial value passed to std::accumulate is the int
// literal 1, so the product is accumulated in int and may overflow for
// large extents.
617 auto bytes = [&typesize](Cont c) {
619 std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
// Logger registered under the name "data_io".
624 const auto log = spdlog::get(
"data_io");
// The rank of the dataset is the number of entries of io_extend.
628 unsigned short rank = io_extend.size();
// --- Argument validation ---
// (The conditions guarding the next two throws are not visible.)
633 throw std::invalid_argument(
"Cannot guess chunksize for a scalar "
638 for (
const auto& val : io_extend)
642 throw std::invalid_argument(
643 "Argument 'io_extend' contained "
644 "illegal (zero or negative) value(s)! io_extend: " +
// max_extend was given: it must match the rank and must not be smaller
// than io_extend in any dimension.
653 if (max_extend.size())
657 if (max_extend.size() != rank)
659 throw std::invalid_argument(
660 "Argument 'max_extend' does not have the same dimensionality "
661 "as the rank of this dataset (as extracted from the "
662 "'io_extend' argument).");
666 for (
unsigned short i = 0; i < rank; i++)
668 if (max_extend[i] < io_extend[i])
670 throw std::invalid_argument(
672 " of argument 'max_extend' (" +
to_str(max_extend) +
673 ") was smaller than the corresponding 'io_extend' (" +
674 to_str(io_extend) +
") value! ");
// Determine dset_finite via a search over max_extend (the remaining
// arguments of std::find are not visible in this listing).
681 dset_finite = (std::find(max_extend.begin(),
// Any extent below H5S_UNLIMITED means that not all dimensions are
// unlimited.
688 for (
const auto& ext : max_extend)
690 if (ext < H5S_UNLIMITED)
693 all_dims_inf =
false;
// Presumably the branch for an empty max_extend: io_extend is copied into
// max_extend (the call name is not visible) -- TODO confirm.
704 all_dims_inf =
false;
708 max_extend.begin(), io_extend.begin(), io_extend.end());
// --- Report the configuration ---
712 log->info(
"Calculating optimal chunk size for io_extend {} and "
716 log->debug(
"rank: {}", rank);
717 log->debug(
"finite dset? {}", dset_finite);
718 log->debug(
"all dims infinite? {}", all_dims_inf);
719 log->debug(
"optimize inf dims? {}", opt_inf_dims);
720 log->debug(
"larger high dims? {}", larger_high_dims);
721 log->debug(
"typesize: {}", typesize);
722 log->debug(
"max. chunksize: {:7d} ({:.1f} kiB)",
724 CHUNKSIZE_MAX / 1024.);
725 log->debug(
"min. chunksize: {:7d} ({:.1f} kiB)",
727 CHUNKSIZE_MIN / 1024.);
728 log->debug(
"base chunksize: {:7d} ({:.1f} kiB)",
730 CHUNKSIZE_BASE / 1024.);
// --- Simple cases ---
// Very large element type: every chunk holds exactly one element.
// NOTE(review): the log message says ">= 1/2" whereas the code tests
// strictly greater than CHUNKSIZE_MAX / 2 (integer division).
737 if (typesize > CHUNKSIZE_MAX / 2)
739 log->debug(
"Type size >= 1/2 max. chunksize -> Each cell needs to be "
741 return Cont(rank, 1);
// Finite dataset that fits into one chunk: use max_extend as the chunk.
746 if (dset_finite && (bytes(max_extend) <= CHUNKSIZE_MAX))
748 log->debug(
"Maximally extended dataset will fit into single chunk.");
749 return Cont(max_extend);
// --- General case: start from the extent of one I/O operation ---
753 log->debug(
"Cannot apply simple optimizations. Try to fit single I/O "
754 "operation into a chunk ...");
758 Cont _chunks(io_extend);
761 const auto bytes_io = bytes(io_extend);
763 "I/O operation size: {:7d} ({:.1f} kiB)", bytes_io, bytes_io / 1024.);
// Case 1: one I/O operation is larger than the maximum chunk size.
// (The optimization calls of cases 1 and 2 are not visible.)
767 if (bytes_io > CHUNKSIZE_MAX)
771 log->debug(
"Single I/O operation does not fit into chunk.");
772 log->debug(
"Trying to use the fewest possible chunks for a single "
773 "I/O operation ...");
// Case 2: everything is unlimited and the chunk is below the base size.
787 else if (all_dims_inf && opt_inf_dims && bytes(_chunks) < CHUNKSIZE_BASE)
792 log->debug(
"Single I/O operation does fit into chunk.");
793 log->debug(
"Optimizing chunks in unlimited dimensions to be closer "
794 "to base chunksize ...");
// Case 3: the I/O operation fits; keep the chunks as they are.
808 log->debug(
"Single I/O operation does fit into a chunk.");
// Never let a chunk extent exceed max_extend; clamp and warn if it does.
812 for (
unsigned short i = 0; i < rank; i++)
814 if (_chunks[i] > max_extend[i])
816 log->warn(
"Optimization led to chunks larger than max_extend. "
817 "This should not have happened!");
818 _chunks[i] = max_extend[i];
// Use max_extend for further optimization unless the all-unlimited case was
// already handled, the chunks already equal max_extend, or no room is left
// below CHUNKSIZE_MAX. (The call performing this step is not visible.)
828 if (!(opt_inf_dims && all_dims_inf) && (_chunks != max_extend) &&
829 (bytes(_chunks) < CHUNKSIZE_MAX))
831 log->debug(
"Have max_extend information and can (potentially) use it "
832 "to optimize chunk extensions.");
// Final sanity check on the result.
846 if (bytes(_chunks) > CHUNKSIZE_MAX)
848 throw std::runtime_error(
849 "Byte size of chunks " +
to_str(_chunks) +
850 " is larger than CHUNKSIZE_MAX! This should not have happened!");
// Freeze the result and report it (the return statement is not visible).
854 const Cont chunks(_chunks);
855 log->info(
"Optimized chunk size: {}",
to_str(chunks));
const Cont calc_chunksize(const hsize_t typesize, const Cont io_extend, Cont max_extend={}, const bool opt_inf_dims=true, const bool larger_high_dims=true, const unsigned int CHUNKSIZE_MAX=1048576, const unsigned int CHUNKSIZE_MIN=8192, const unsigned int CHUNKSIZE_BASE=262144)
Try to guess a good chunksize for a dataset.
Definition: hdfchunking.hh:604
std::string to_string(const Config &node)
Given a config node, returns a string representation of it.
Definition: cfg_utils.hh:110
This file provides metafunctions for automatically determining the nature of C/C++ types at compile...
void opt_chunks_with_max_extend(Cont &chunks, const Cont &max_extend, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const bool opt_inf_dims, const bool larger_high_dims, const Logger &log)
Optimize chunk sizes using max_extend information.
Definition: hdfchunking.hh:305
std::vector< unsigned short > find_all_idcs(Cont &vec, Predicate pred)
Finds all indices of elements in a vector that match the given predicate.
Definition: hdfchunking.hh:43
void opt_chunks_target(Cont &chunks, double bytes_target, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const unsigned int CHUNKSIZE_MIN, const bool larger_high_dims, const Logger &log)
Optimizes the chunks along all axes to find a good default.
Definition: hdfchunking.hh:113
std::string to_str(const Cont &vec)
Helper function to create a string representation of containers.
Definition: hdfchunking.hh:65