Utopia 2
Framework for studying models of complex & adaptive systems.
Utopia::DataIO::_chunk_helpers Namespace Reference

Functions

template<typename Cont , typename Predicate >
std::vector< unsigned short > find_all_idcs (Cont &vec, Predicate pred)
 Finds all indices of elements in a vector that match the given predicate.
 
template<typename Cont = std::vector< hsize_t >>
std::string to_str (const Cont &vec)
 Helper function to create a string representation of containers.
 
template<typename Cont , typename Logger >
void opt_chunks_target (Cont &chunks, double bytes_target, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const unsigned int CHUNKSIZE_MIN, const bool larger_high_dims, const Logger &log)
 Optimizes the chunks along all axes to find a good default.
 
template<typename Cont , typename Logger >
void opt_chunks_with_max_extend (Cont &chunks, const Cont &max_extend, const hsize_t typesize, const unsigned int CHUNKSIZE_MAX, const bool opt_inf_dims, const bool larger_high_dims, const Logger &log)
 Optimize chunk sizes using max_extend information.
 

Function Documentation

◆ find_all_idcs()

template<typename Cont , typename Predicate >
std::vector< unsigned short > Utopia::DataIO::_chunk_helpers::find_all_idcs ( Cont &  vec,
Predicate  pred 
)

Finds all indices of elements in a vector that match the given predicate.

Parameters
    vec   The object to find the indices in
    pred  The predicate to determine the indices to be found

Template Parameters
    Cont       The container type
    Predicate  The predicate type
{
    // Create the return container
    std::vector< unsigned short > idcs;

    // Repeatedly iterate over the vector until its end is reached
    auto iter = vec.begin();
    while ((iter = std::find_if(iter, vec.end(), pred)) != vec.end())
    {
        // Add the index of the found element to the indices vector
        idcs.push_back(std::distance(vec.begin(), iter));

        // Increment the iterator to continue with the next element
        iter++;
    }

    return idcs;
}
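For orientation, a minimal usage sketch (not part of the documented source; the container contents and the predicate are chosen purely for illustration, and the leading underscore suggests this is an internal helper namespace not normally called from user code):

std::vector<hsize_t> extents = {1, 8, 1, 16};

// Collect the indices of all extents larger than one
auto idcs = Utopia::DataIO::_chunk_helpers::find_all_idcs(
    extents, [](auto l) { return l > 1; });
// idcs is a std::vector<unsigned short> holding {1, 3}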

◆ opt_chunks_target()

template<typename Cont , typename Logger >
void Utopia::DataIO::_chunk_helpers::opt_chunks_target ( Cont &  chunks,
double  bytes_target,
const hsize_t  typesize,
const unsigned int  CHUNKSIZE_MAX,
const unsigned int  CHUNKSIZE_MIN,
const bool  larger_high_dims,
const Logger &  log 
)

Optimizes the chunks along all axes to find a good default.

This algorithm is only aware of the current size of the chunks and the target byte size of a chunk. Given that information, it either tries to reduce or enlarge the extent of the chunk dimensions. To do that, it iterates over all chunk dimensions and either doubles or halves their extent. Once within 50% of the target byte size, the algorithm stops. It also takes care to remain within the bounds of CHUNKSIZE_MIN and CHUNKSIZE_MAX; if a target byte size outside of these bounds is given, it is adjusted accordingly. For a typesize larger than CHUNKSIZE_MAX, this algorithm cannot perform any reasonable action and will throw an exception; this case should be handled outside of this function!
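As a worked illustration of the stopping criterion (numbers chosen here for clarity, not taken from the source): with a typesize of 8 B, a chunk configuration of {1, 512, 512} occupies 8 B * 1 * 512 * 512 = 2,097,152 B = 2 MiB. For a 1 MiB target this is not yet close enough (|2 MiB - 1 MiB| / 1 MiB = 1.0 >= 0.5), so one dimension's extent is halved, e.g. to {1, 256, 512}, which occupies exactly 1 MiB; provided that value also lies within [CHUNKSIZE_MIN, CHUNKSIZE_MAX], the optimization loop then terminates.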

Parameters
    chunks            The current chunk values that are to be optimized
    bytes_target      The byte size to optimize the chunks towards
    typesize          The byte size of a single entry, needed to calculate the total byte size of a whole chunk
    CHUNKSIZE_MAX     The maximum allowed byte size of a chunk
    CHUNKSIZE_MIN     The minimum allowed byte size of a chunk
    larger_high_dims  If true, dimensions with higher indices are favoured when enlarging the chunk extent
    log               The logger object to use
{
    // Helper lambda for calculating the byte size of a chunks configuration
    auto bytes = [&typesize](Cont c) {
        return typesize *
               std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
    };

    // Check the case of typesize larger than CHUNKSIZE_MAX; cannot do anything
    // in that case -> safer to throw an exception.
    if (typesize > CHUNKSIZE_MAX)
    {
        throw std::invalid_argument("Cannot use opt_chunks_target with a "
                                    "typesize larger than CHUNKSIZE_MAX!");
    }

    log->debug("Starting optimization towards target size:"
               " {:7.0f}B ({:.1f} kiB)",
               bytes_target,
               bytes_target / 1024.);

    // Ensure the target chunk size is between CHUNKSIZE_MIN and CHUNKSIZE_MAX
    // in order to not choose too large or too small chunks
    if (bytes_target > CHUNKSIZE_MAX)
    {
        bytes_target = CHUNKSIZE_MAX;

        log->debug("Target size too large! New target size:"
                   " {:7.0f}B ({:.1f} kiB)",
                   bytes_target,
                   bytes_target / 1024.);
    }
    else if (bytes_target < CHUNKSIZE_MIN)
    {
        bytes_target = CHUNKSIZE_MIN;

        log->debug("Target size too small! New target size:"
                   " {:7.0f}B ({:.1f} kiB)",
                   bytes_target,
                   bytes_target / 1024.);
    }

    // ... and a variable that will store the size (in bytes) of this specific
    // chunk configuration
    std::size_t bytes_chunks;

    // Calculate the rank (need it to know the iteration -> dim mapping)
    auto rank = chunks.size();

    /* Now optimize the chunks for each dimension by repeatedly looping over
     * the vector and dividing the values by two (rounding up).
     *
     * The loop is left when the following condition is fulfilled:
     *   within 50% of target chunk size
     *   AND
     *   within bounds of minimum and maximum chunk size
     *
     * NOTE Limiting the optimization to 42 iterations per dimension in order
     *      to be on the safe side, i.e. the loop goes through each entry at
     *      most 42 times, halving or doubling the chunk extent.
     */
    for (unsigned short i = 0; i < 42 * rank; i++)
    {
        // With the current values of the chunks, calculate the chunk size
        bytes_chunks = bytes(chunks);

        log->debug("Chunks: {} -> {:7d} B ({:.1f} kiB)",
                   to_str(chunks),
                   bytes_chunks,
                   bytes_chunks / 1024.);

        // If close enough to target size, optimization is finished
        if ((std::abs(bytes_chunks - bytes_target) / bytes_target < 0.5) &&
            bytes_chunks <= CHUNKSIZE_MAX && bytes_chunks >= CHUNKSIZE_MIN)
        {
            log->debug("Close enough to target size now.");
            break;
        }
        // else: not yet close enough

        // Calculate the dimension this iteration belongs to
        auto dim = i % rank;

        // Adjust the chunksize towards the target size
        if (bytes_chunks < bytes_target)
        {
            // Can increase the size of the chunk extent in the current dim

            // If high dimensions should be favoured, change the dim to work
            // on such that the high dimensions are increased in size first
            if (larger_high_dims)
            {
                dim = (rank - 1) - dim;
            }

            // Multiply by two
            log->debug("Doubling extend of chunk dimension {} ...", dim);
            chunks[dim] = chunks[dim] * 2;
        }
        else
        {
            // Need to decrease the size of the chunk extent in the current dim

            // For the larger_high_dims option, smaller dimensions are to be
            // favoured. However, in order to allow a reduction, these need to
            // have a chunk extent that is larger than one; once that is no
            // longer fulfilled, the if condition below will not change the
            // dim variable.
            if (larger_high_dims && rank > 1 && dim > 0 && chunks[dim - 1] > 1)
            {
                // Stay on low dimensions one step longer
                if (dim > 0)
                {
                    dim--;
                }

                // Skip the reduction if this is the last dim
                if (dim == rank - 1)
                {
                    log->debug("Skipping reduction of chunk dimension {}, "
                               "because it is the highest ...",
                               dim);
                    continue;
                }
            }

            // Do not continue if halving is not possible
            if (chunks[dim] == 1)
            {
                log->debug("Extend of chunk dimension {} is already 1.", dim);
                continue;
            }

            // TODO generalise the above if blocks! The cleanest way would be
            //      to determine which chunk dimensions _can_ be reduced before
            //      determining the dimension that is to be reduced. This would
            //      alleviate the iterations in which the chunk extent is 1
            //      and no halving can take place ...

            // Divide the chunk size of the current dim by two
            log->debug("Halving extend of chunk dimension {} ...", dim);
            chunks[dim] = 1 + ((chunks[dim] - 1) / 2); // ceiling!
            // NOTE integer division fun; can do this because all are unsigned
            //      and the chunks entry is always nonzero
        }
    }

    return;
}
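A minimal call sketch follows (not taken from the Utopia sources). It assumes an spdlog-style logger, which satisfies the Logger template parameter via its ->debug(fmt, args...) interface, HDF5's hsize_t, and that the header defining these helpers (hdfchunking.hh) is on the include path; the byte limits are example values rather than library defaults.

#include <vector>
#include <hdf5.h>                                 // hsize_t
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
// #include "hdfchunking.hh"                      // adjust to wherever these helpers live

void chunk_target_demo()
{
    // Any spdlog logger works as the Logger template argument
    auto log = spdlog::stdout_color_mt("chunk_opt_demo");
    log->set_level(spdlog::level::debug);

    // Initial guess: 8-byte entries in a {1, 2048, 2048} chunk, i.e. 32 MiB
    std::vector<hsize_t> chunks = {1, 2048, 2048};

    Utopia::DataIO::_chunk_helpers::opt_chunks_target(
        chunks,
        1048576.,   // bytes_target: aim for ~1 MiB chunks (example value)
        8,          // typesize: e.g. a double
        1048576,    // CHUNKSIZE_MAX (example value)
        8192,       // CHUNKSIZE_MIN (example value)
        true,       // larger_high_dims
        log);

    // chunks now holds extents whose total byte size lies within 50% of the
    // target and inside [CHUNKSIZE_MIN, CHUNKSIZE_MAX]
}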

◆ opt_chunks_with_max_extend()

template<typename Cont , typename Logger >
void Utopia::DataIO::_chunk_helpers::opt_chunks_with_max_extend ( Cont &  chunks,
const Cont &  max_extend,
const hsize_t  typesize,
const unsigned int  CHUNKSIZE_MAX,
const bool  opt_inf_dims,
const bool  larger_high_dims,
const Logger &  log 
)

Optimize chunk sizes using max_extend information.

This algorithm is aware of the maximum extent of a dataset and can use that information during optimization, aiming to increase the size of the chunks towards CHUNKSIZE_MAX as far as possible without going beyond max_extend. The paradigm here is that the number of chunks needed for read/write operations should be minimized while keeping a chunk's byte size below a certain value. The algorithm distinguishes between dimensions that have a finite extent and those that can grow to H5S_UNLIMITED, i.e. "infinite" extent. First, it tries to cover max_extend in the finite dimensions, checking whether an integer multiple of the current chunk extent is needed to reach the maximum extent. If, after that, the target CHUNKSIZE_MAX is not yet reached and the opt_inf_dims flag is set, the chunk sizes in the unlimited dimensions are extended as far as possible, assuming that these dimensions were chosen unlimited because they will be filled at some point and that larger chunk sizes will reduce the number of chunks needed during read/write operations.
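As a worked illustration (numbers chosen here, not taken from the source): for chunks = {64, 64}, max_extend = {H5S_UNLIMITED, 256}, a typesize of 8 B and CHUNKSIZE_MAX = 1 MiB, the finite dimension can be covered by the integer factor 256 / 64 = 4, giving chunks = {64, 256} at 8 B * 64 * 256 = 128 kiB per chunk; with opt_inf_dims enabled, the unlimited dimension is then scaled by floor(1 MiB / 128 kiB) = 8, giving chunks = {512, 256} at exactly 1 MiB per chunk. See also the call sketch after the implementation below.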

Parameters
    chunks            The current chunk values that are to be optimized
    max_extend        The maximum extent of the dataset
    typesize          The byte size of a single entry, needed to calculate the total byte size of a whole chunk
    CHUNKSIZE_MAX     The maximum allowed byte size of a chunk
    opt_inf_dims      Whether to optimize the infinite dimensions or not
    larger_high_dims  If true, dimensions with higher indices are favoured when enlarging the chunk extent
    log               The logger object to use

Template Parameters
    Cont    The container type for the chunks
    Logger  The logger type
{
    // Helper lambda for calculating the byte size of a chunks configuration
    auto bytes = [&typesize](Cont c) {
        return typesize *
               std::accumulate(c.begin(), c.end(), 1, std::multiplies<>());
    };

    // Check the case of typesize larger than CHUNKSIZE_MAX; cannot do anything
    // in that case -> safer to throw an exception.
    if (typesize > CHUNKSIZE_MAX)
    {
        throw std::invalid_argument(
            "Cannot use opt_chunks_with_max_extend "
            "with a typesize larger than CHUNKSIZE_MAX!");
    }

    // .. Parse dims and prepare algorithm ....................................

    // Determine the finite dims
    auto dims_fin =
        find_all_idcs(max_extend, [](auto l) { return l != H5S_UNLIMITED; });
    // Ideally, an integer multiple of the chunk size along this dim should
    // be equal to the maximum extent

    // Determine the infinite dims
    auto dims_inf =
        find_all_idcs(max_extend, [](auto l) { return l == H5S_UNLIMITED; });
    // As the final extent along these dims is not known, we cannot make a
    // good guess for these. Instead, we should use the leverage we have for
    // optimizing the chunk size along the finite dims. The infinite dims will
    // thus, most likely, end up with shorter chunk sizes.

    // Declare a container type for storing indices, same as those returned by
    // the find_all_idcs function
    using IdxCont = decltype(dims_fin);

    // Create a container with the available dimension indices
    IdxCont dims(chunks.size());
    std::iota(dims.begin(), dims.end(), 0);

    // Among the finite dims, determine the dims that can still be filled,
    // i.e. those where the chunk size does not reach the max_extend
    IdxCont dims_fillable;
    for (auto dim : dims_fin)
    {
        if (max_extend[dim] > chunks[dim])
        {
            dims_fillable.push_back(dim);
        }
    }

    // Check whether to reverse the index containers to favour higher dims
    if (larger_high_dims)
    {
        // Reverse all index containers
        std::reverse(dims_fillable.begin(), dims_fillable.end());
        std::reverse(dims_fin.begin(), dims_fin.end());
        std::reverse(dims_inf.begin(), dims_inf.end());

        // NOTE do not actually _need_ to reverse the finite dims container,
        //      doing it only for consistency.
    }

    // .. Optimization of finite (and still fillable) dims ....................

    if (!dims_fillable.size())
    {
        log->debug("No finite dimensions available to optimize.");
    }
    else
    {
        log->debug("Optimizing {} finite dimension(s) where max_extend is not "
                   "yet reached ...",
                   dims_fillable.size());

        // Loop over the fillable dims indices
        for (auto dim : dims_fillable)
        {
            // Check if there is still potential for optimization
            // NOTE this could be more thorough
            if (bytes(chunks) == CHUNKSIZE_MAX)
            {
                log->debug("Reached maximum chunksize.");
                break;
            }

            // Check if the max_extend is an integer multiple of the chunksize
            if (max_extend[dim] % chunks[dim] == 0)
            {
                // Find the divisor
                std::size_t factor = max_extend[dim] / chunks[dim];

                // It might fit in completely ...
                if (factor * bytes(chunks) <= CHUNKSIZE_MAX)
                {
                    // It does. Adjust chunks and continue with next dim
                    log->debug("Dimension {} can be filled completely. "
                               "Factor: {}",
                               dim,
                               factor);
                    chunks[dim] = chunks[dim] * factor;
                    continue;
                }
                // else: would not fit in completely

                // Starting from the largest possible factor, find the largest
                // integer divisor of the original factor, i.e.: one that will
                // also completely cover the max_extend
                for (std::size_t div = (CHUNKSIZE_MAX / bytes(chunks));
                     div >= 1;
                     div--)
                {
                    // Check if it is an integer divisor
                    if (factor % div == 0)
                    {
                        // Yes! The _new_ factor is now this value
                        factor = div;
                        break;
                    }
                }
                // NOTE Covers the edge case of max. factor == 1: the loop will
                //      perform only one iteration and the resulting factor
                //      will be 1, leading (effectively) to no scaling.

                // Scale the chunksize with this factor
                if (factor > 1)
                {
                    log->debug(
                        "Scaling dimension {} with factor {} ...", dim, factor);

                    chunks[dim] = chunks[dim] * factor;
                }
            }
            else
            {
                // Not divisible. Check if the max_extend could be reached w/o
                // exceeding the max chunksize
                const double factor = double(max_extend[dim]) / chunks[dim];

                if (factor * bytes(chunks) <= CHUNKSIZE_MAX)
                {
                    // Yep. Just extend this dimension to the max_extend, done.
                    log->debug("Dimension {} can be filled completely. "
                               "(difference: {}, factor: {})",
                               dim,
                               max_extend[dim] - chunks[dim],
                               factor);

                    chunks[dim] = max_extend[dim];
                }
                else
                {
                    // Cannot further extend.
                    log->debug("Dimension {} cannot be extended to fill "
                               "max_extend without exceeding maximum "
                               "chunksize! "
                               "(difference: {}, factor: {})",
                               dim,
                               max_extend[dim] - chunks[dim],
                               factor);
                }
            }
            // Done with this index
        }
    }

    // .. Optimization of infinite dims .......................................

    if (!opt_inf_dims)
    {
        log->debug("Optimization of unlimited dimensions is disabled.");
    }
    else if (!dims_inf.size())
    {
        log->debug("No unlimited dimensions available to optimize.");
    }
    else if (bytes(chunks) == CHUNKSIZE_MAX)
    {
        log->debug("Cannot further optimize using unlimited dimensions.");
    }
    else
    {
        log->debug("Optimizing {} unlimited dimension(s) to fill the maximum "
                   "chunk size ...",
                   dims_inf.size());

        // Loop over indices of inf. dims
        // NOTE Depending on the chunk sizes, this might only have an effect
        //      on the first index considered ... but that's fine for now.
        for (auto dim : dims_inf)
        {
            // Calculate the factor to make the chunk as big as possible
            const std::size_t factor = CHUNKSIZE_MAX / bytes(chunks); // floors

            // If large enough, scale it by that factor
            if (factor > 1)
            {
                log->debug(
                    "Scaling dimension {} with factor {} ...", dim, factor);

                chunks[dim] = chunks[dim] * factor;
            }
        }
    }

    // Done.
    // Check if everything went fine (only a safeguard ...)
    if (bytes(chunks) > CHUNKSIZE_MAX)
    {
        throw std::runtime_error("Calculated chunks exceed CHUNKSIZE_MAX! "
                                 "This should not have happened!");
    }

    return;
}
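The call sketch below uses the same example numbers as the worked illustration above. As before, it is not taken from the Utopia sources; it assumes an spdlog-style logger, HDF5's hsize_t and H5S_UNLIMITED, and example byte limits.

#include <vector>
#include <hdf5.h>                                 // hsize_t, H5S_UNLIMITED
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
// #include "hdfchunking.hh"                      // adjust to wherever these helpers live

void chunk_max_extend_demo()
{
    auto log = spdlog::stdout_color_mt("chunk_opt_max_extend_demo");
    log->set_level(spdlog::level::debug);

    std::vector<hsize_t> chunks     = {64, 64};
    std::vector<hsize_t> max_extend = {H5S_UNLIMITED, 256};  // first dim unlimited, second finite

    Utopia::DataIO::_chunk_helpers::opt_chunks_with_max_extend(
        chunks, max_extend,
        8,          // typesize: e.g. a double
        1048576,    // CHUNKSIZE_MAX: 1 MiB (example value)
        true,       // opt_inf_dims
        true,       // larger_high_dims
        log);

    // With these numbers: the finite dimension is scaled by the integer factor 4
    // to cover max_extend (chunks = {64, 256}), then the unlimited dimension is
    // scaled by 8 to fill CHUNKSIZE_MAX (chunks = {512, 256}, exactly 1 MiB)
}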

◆ to_str()

template<typename Cont = std::vector< hsize_t >>
std::string Utopia::DataIO::_chunk_helpers::to_str ( const Cont &  vec)

Helper function to create a string representation of containers.

{
    std::stringstream s;
    s << "{ ";
    for (auto& extd : vec)
    {
        if (extd < H5S_UNLIMITED)
        {
            s << extd << " ";
        }
        else
        {
            s << "∞ ";
        }
    }
    s << "}";
    return s.str();
}
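For reference, a small hypothetical example of the resulting string (assuming hsize_t and H5S_UNLIMITED from HDF5):

std::vector<hsize_t> extents = {2, 512, H5S_UNLIMITED};
auto s = Utopia::DataIO::_chunk_helpers::to_str(extents);
// s == "{ 2 512 ∞ }"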