Cabana 0.8.0-dev
 
Loading...
Searching...
No Matches
Cabana_Grid_SparseDimPartitioner.hpp
Go to the documentation of this file.
1/****************************************************************************
2 * Copyright (c) 2018-2023 by the Cabana authors *
3 * All rights reserved. *
4 * *
5 * This file is part of the Cabana library. Cabana is distributed under a *
6 * BSD 3-clause license. For the licensing terms see the LICENSE file in *
7 * the top-level directory. *
8 * *
9 * SPDX-License-Identifier: BSD-3-Clause *
10 ****************************************************************************/
11
16#ifndef CABANA_GRID_SPARSEDIMPARTITIONER_HPP
17#define CABANA_GRID_SPARSEDIMPARTITIONER_HPP
18
21#include <Cabana_Utils.hpp> // FIXME: remove after next release.
22
23#include <Kokkos_Core.hpp>
24
25#include <array>
26#include <vector>
27
28#include <mpi.h>
29
30namespace Cabana
31{
32namespace Grid
33{
34//---------------------------------------------------------------------------//
41template <typename MemorySpace, unsigned long long CellPerTileDim = 4,
42 std::size_t NumSpaceDim = 3>
43class SparseDimPartitioner : public BlockPartitioner<NumSpaceDim>
44{
45 public:
47 static constexpr std::size_t num_space_dim = NumSpaceDim;
48
50 using memory_space = MemorySpace;
51 static_assert( Kokkos::is_memory_space<MemorySpace>() );
52
54 using execution_space = typename memory_space::execution_space;
55
57 using workload_view = Kokkos::View<int***, memory_space>;
59 using partition_view = Kokkos::View<int* [num_space_dim], memory_space>;
62 Kokkos::View<int***, typename execution_space::array_layout,
63 Kokkos::HostSpace>;
66 Kokkos::View<int* [num_space_dim],
67 typename execution_space::array_layout, Kokkos::HostSpace>;
68
70 static constexpr unsigned long long cell_bits_per_tile_dim =
71 bitCount( CellPerTileDim );
74 static constexpr unsigned long long cell_num_per_tile_dim =
76
90 MPI_Comm comm, float max_workload_coeff, int workload_num,
91 int num_step_rebalance,
92 const std::array<int, num_space_dim>& global_cells_per_dim,
93 int max_optimize_iteration = 10 )
94 : _workload_threshold(
95 static_cast<int>( max_workload_coeff * workload_num ) )
96 , _num_step_rebalance( num_step_rebalance )
97 , _max_optimize_iteration( max_optimize_iteration )
98 {
99 // compute the ranks_per_dim from MPI communicator
100 allocate( global_cells_per_dim );
101 ranksPerDimension( comm );
102 }
103
119 MPI_Comm comm, float max_workload_coeff, int workload_num,
120 int num_step_rebalance,
121 const std::array<int, num_space_dim>& ranks_per_dim,
122 const std::array<int, num_space_dim>& global_cells_per_dim,
123 int max_optimize_iteration = 10 )
124 : _workload_threshold(
125 static_cast<int>( max_workload_coeff * workload_num ) )
126 , _num_step_rebalance( num_step_rebalance )
127 , _max_optimize_iteration( max_optimize_iteration )
128 {
129 allocate( global_cells_per_dim );
130 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
131 _ranks_per_dim.data() );
132
133 // init MPI topology
134 int comm_size;
135 MPI_Comm_size( comm, &comm_size );
136 MPI_Dims_create( comm_size, num_space_dim, _ranks_per_dim.data() );
137 }
138
144 std::array<int, num_space_dim> ranksPerDimension( MPI_Comm comm )
145 {
146 int comm_size;
147 MPI_Comm_size( comm, &comm_size );
148
149 std::array<int, num_space_dim> ranks_per_dim;
150 for ( std::size_t d = 0; d < num_space_dim; ++d )
151 ranks_per_dim[d] = 0;
152 MPI_Dims_create( comm_size, num_space_dim, ranks_per_dim.data() );
153
154 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
155 _ranks_per_dim.data() );
156
157 return ranks_per_dim;
158 }
159
165 std::array<int, num_space_dim>
166 ranksPerDimension( MPI_Comm comm,
167 const std::array<int, num_space_dim>& ) const override
168 {
169 std::array<int, num_space_dim> ranks_per_dim = {
170 _ranks_per_dim[0], _ranks_per_dim[1], _ranks_per_dim[2] };
171 int comm_size;
172 MPI_Comm_size( comm, &comm_size );
173 int nrank = 1;
174 for ( std::size_t d = 0; d < num_space_dim; ++d )
175 nrank *= _ranks_per_dim[d];
176 if ( comm_size != nrank )
177 throw std::runtime_error(
178 "Cabana::Grid::SparseDimPartitioner::ranksPerDimension: "
179 "SparseDimPartitioner ranks do not match comm size" );
180 return ranks_per_dim;
181 }
182
    //! \brief Get the number of tiles owned by the current MPI rank in each
    //! dimension.
    //! \param cart_comm MPI Cartesian communicator matching the partition.
    //! \return owned tile count per dimension (unit: tile, not cell).
    std::array<int, num_space_dim>
    ownedTilesPerDimension( MPI_Comm cart_comm ) const
    {
        // Get the Cartesian topology index of this rank.
        std::array<int, num_space_dim> cart_rank;
        int linear_rank;
        MPI_Comm_rank( cart_comm, &linear_rank );
        MPI_Cart_coords( cart_comm, linear_rank, num_space_dim,
                         cart_rank.data() );

        // Get the tiles per dimension and the remainder.
        std::array<int, num_space_dim> tiles_per_dim;
        // Partition boundaries live on the device; mirror them to host to
        // read them here.
        auto rec_mirror = Kokkos::create_mirror_view_and_copy(
            Kokkos::HostSpace(), _rectangle_partition_dev );
        // Owned tiles in dim d = partition end - partition start for this
        // rank's Cartesian coordinate in d.
        for ( std::size_t d = 0; d < num_space_dim; ++d )
            tiles_per_dim[d] = rec_mirror( cart_rank[d] + 1, d ) -
                               rec_mirror( cart_rank[d], d );
        return tiles_per_dim;
    }
206
211 std::array<int, num_space_dim>
212 ownedCellsPerDimension( MPI_Comm cart_comm ) const
213 {
214 auto tiles_per_dim = ownedTilesPerDimension( cart_comm );
215 for ( std::size_t d = 0; d < num_space_dim; ++d )
216 {
217 // compute cells_per_dim from tiles_per_dim
218 tiles_per_dim[d] <<= cell_bits_per_tile_dim;
219 }
220 return tiles_per_dim;
221 }
222
    //! \brief Get the owned tile count and the global tile offset of the
    //! current MPI rank.
    //! \param cart_comm MPI Cartesian communicator matching the partition.
    //! \param owned_num_tile output: tiles owned in each dimension.
    //! \param global_tile_offset output: global tile index of the first
    //! owned tile in each dimension.
    void
    ownedTileInfo( MPI_Comm cart_comm,
                   std::array<int, num_space_dim>& owned_num_tile,
                   std::array<int, num_space_dim>& global_tile_offset ) const
    {
        // Get the Cartesian topology index of this rank.
        std::array<int, num_space_dim> cart_rank;
        int linear_rank;
        MPI_Comm_rank( cart_comm, &linear_rank );
        MPI_Cart_coords( cart_comm, linear_rank, num_space_dim,
                         cart_rank.data() );

        // Get the tiles per dimension and the remainder.
        // Partition boundaries live on the device; mirror to host to read.
        auto rec_mirror = Kokkos::create_mirror_view_and_copy(
            Kokkos::HostSpace(), _rectangle_partition_dev );
        for ( std::size_t d = 0; d < num_space_dim; ++d )
        {
            // This rank owns the half-open tile range
            // [rec(cart_rank[d], d), rec(cart_rank[d] + 1, d)) in dim d.
            owned_num_tile[d] = rec_mirror( cart_rank[d] + 1, d ) -
                                rec_mirror( cart_rank[d], d );
            global_tile_offset[d] = rec_mirror( cart_rank[d], d );
        }
    }
254
265 MPI_Comm cart_comm, const std::array<int, num_space_dim>&,
266 std::array<int, num_space_dim>& owned_num_cell,
267 std::array<int, num_space_dim>& global_cell_offset ) const override
268 {
269 ownedTileInfo( cart_comm, owned_num_cell, global_cell_offset );
270 for ( std::size_t d = 0; d < num_space_dim; ++d )
271 {
272 // compute cells_per_dim from tiles_per_dim
273 owned_num_cell[d] <<= cell_bits_per_tile_dim;
274 global_cell_offset[d] <<= cell_bits_per_tile_dim;
275 }
276 }
277
286 void initializeRecPartition( std::vector<int>& rec_partition_i,
287 std::vector<int>& rec_partition_j,
288 std::vector<int>& rec_partition_k )
289 {
290 int max_size = 0;
291 for ( std::size_t d = 0; d < num_space_dim; ++d )
292 max_size =
293 max_size < _ranks_per_dim[d] ? _ranks_per_dim[d] : max_size;
294
295 typedef typename execution_space::array_layout layout;
296 Kokkos::View<int* [num_space_dim], layout, Kokkos::HostSpace>
297 rectangle_partition( "rectangle_partition_host", max_size + 1 );
298
299 for ( int id = 0; id < _ranks_per_dim[0] + 1; ++id )
300 rectangle_partition( id, 0 ) = rec_partition_i[id];
301
302 for ( int id = 0; id < _ranks_per_dim[1] + 1; ++id )
303 rectangle_partition( id, 1 ) = rec_partition_j[id];
304
305 for ( int id = 0; id < _ranks_per_dim[2] + 1; ++id )
306 rectangle_partition( id, 2 ) = rec_partition_k[id];
307
308 _rectangle_partition_dev = Kokkos::create_mirror_view_and_copy(
309 memory_space(), rectangle_partition );
310 }
311
316 std::array<std::vector<int>, num_space_dim> getCurrentPartition()
317 {
318 std::array<std::vector<int>, num_space_dim> rec_part;
319 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
320 Kokkos::HostSpace(), _rectangle_partition_dev );
321 for ( std::size_t d = 0; d < num_space_dim; ++d )
322 {
323 rec_part[d].resize( _ranks_per_dim[d] + 1 );
324 for ( int id = 0; id < _ranks_per_dim[d] + 1; ++id )
325 {
326 rec_part[d][id] = rec_mirror( id, d );
327 }
328 }
329 return rec_part;
330 }
331
337 {
338 Kokkos::deep_copy( _workload_per_tile, 0 );
339 Kokkos::deep_copy( _workload_prefix_sum, 0 );
340 }
341
351 template <class ParticlePosViewType, typename ArrayType, typename CellUnit>
352 void computeLocalWorkLoad( const ParticlePosViewType& view,
353 int particle_num,
354 const ArrayType& global_lower_corner,
355 const CellUnit dx )
356 {
358 // make a local copy
359 auto workload = _workload_per_tile;
360 Kokkos::Array<CellUnit, num_space_dim> lower_corner;
361 for ( std::size_t d = 0; d < num_space_dim; ++d )
362 {
363 lower_corner[d] = global_lower_corner[d];
364 }
365
366 Kokkos::parallel_for(
367 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadPosition",
368 Kokkos::RangePolicy<execution_space>( 0, particle_num ),
369 KOKKOS_LAMBDA( const int i ) {
370 int ti = static_cast<int>(
371 ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >>
373 int tj = static_cast<int>(
374 ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >>
376 int tz = static_cast<int>(
377 ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >>
379 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tz + 1 ) );
380 } );
381 Kokkos::fence();
382 }
383
389 template <class SparseMapType>
390 void computeLocalWorkLoad( const SparseMapType& sparseMap )
391 {
393 // make a local copy
394 auto workload = _workload_per_tile;
395 Kokkos::parallel_for(
396 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadSparseMap",
397 Kokkos::RangePolicy<execution_space>( 0, sparseMap.capacity() ),
398 KOKKOS_LAMBDA( uint32_t i ) {
399 if ( sparseMap.valid_at( i ) )
400 {
401 auto key = sparseMap.key_at( i );
402 int ti, tj, tk;
403 sparseMap.key2ijk( key, ti, tj, tk );
404 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tk + 1 ) );
405 }
406 } );
407 Kokkos::fence();
408 }
409
415 void computeFullPrefixSum( MPI_Comm comm )
416 {
417 // local copy
418 auto workload = _workload_per_tile;
419 auto prefix_sum = _workload_prefix_sum;
420 int total_size = _workload_per_tile.size();
421
422 // MPI all reduce: compute workload in all MPI ranks from the local
423 // workload matrix, save the results in _workload_prefix_sum
424 MPI_Allreduce( workload.data(), prefix_sum.data(), total_size, MPI_INT,
425 MPI_SUM, comm );
426 MPI_Barrier( comm );
427
428 // compute the prefix sum (in three dimensions)
429 // prefix sum in the dimension 0
430 for ( int j = 0;
431 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) ); ++j )
432 for ( int k = 0;
433 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
434 ++k )
435 Kokkos::parallel_scan(
436 "scan_prefix_sum_dim0",
437 Kokkos::RangePolicy<execution_space>(
438 0, _workload_prefix_sum.extent( 0 ) ),
439 KOKKOS_LAMBDA( const int i, int& update,
440 const bool final ) {
441 const float val_i = prefix_sum( i, j, k );
442 update += val_i;
443 if ( final )
444 {
445 prefix_sum( i, j, k ) = update;
446 }
447 } );
448 Kokkos::fence();
449
450 // prefix sum in the dimension 1
451 for ( int i = 0;
452 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
453 for ( int k = 0;
454 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
455 ++k )
456 Kokkos::parallel_scan(
457 "scan_prefix_sum_dim1",
458 Kokkos::RangePolicy<execution_space>(
459 0, _workload_prefix_sum.extent( 1 ) ),
460 KOKKOS_LAMBDA( const int j, int& update,
461 const bool final ) {
462 const float val_i = prefix_sum( i, j, k );
463 update += val_i;
464 if ( final )
465 {
466 prefix_sum( i, j, k ) = update;
467 }
468 } );
469 Kokkos::fence();
470
471 // prefix sum in the dimension 2
472 for ( int i = 0;
473 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
474 for ( int j = 0;
475 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) );
476 ++j )
477 Kokkos::parallel_scan(
478 "scan_prefix_sum_dim2",
479 Kokkos::RangePolicy<execution_space>(
480 0, _workload_prefix_sum.extent( 2 ) ),
481 KOKKOS_LAMBDA( const int k, int& update,
482 const bool final ) {
483 const float val_i = prefix_sum( i, j, k );
484 update += val_i;
485 if ( final )
486 {
487 prefix_sum( i, j, k ) = update;
488 }
489 } );
490 Kokkos::fence();
491 }
492
503 template <class ParticlePosViewType, typename ArrayType, typename CellUnit>
504 int optimizePartition( const ParticlePosViewType& view, int particle_num,
505 const ArrayType& global_lower_corner,
506 const CellUnit dx, MPI_Comm comm )
507 {
508 computeLocalWorkLoad( view, particle_num, global_lower_corner, dx );
509 MPI_Barrier( comm );
510
511 computeFullPrefixSum( comm );
512 MPI_Barrier( comm );
513
514 // each iteration covers partitioner optization in all three dimensions
515 // (with a random dim sequence)
516 for ( int i = 0; i < _max_optimize_iteration; ++i )
517 {
518 bool is_changed = false; // record changes in current iteration
519 bool dim_covered[3] = { false, false, false };
520 for ( int d = 0; d < 3; ++d )
521 {
522 int random_dim_id = std::rand() % num_space_dim;
523 while ( dim_covered[random_dim_id] )
524 random_dim_id = std::rand() % num_space_dim;
525
526 bool is_dim_changed = false; // record changes in current dim
527 optimizePartition( is_dim_changed, random_dim_id );
528
529 // update control info
530 is_changed = is_changed || is_dim_changed;
531 dim_covered[random_dim_id] = true;
532 }
533 // return if the current partition is optimal
534 if ( !is_changed )
535 return i;
536 }
537 return _max_optimize_iteration;
538 }
539
546 template <class SparseMapType>
547 int optimizePartition( const SparseMapType& sparseMap, MPI_Comm comm )
548 {
549 computeLocalWorkLoad( sparseMap );
550 MPI_Barrier( comm );
551
552 computeFullPrefixSum( comm );
553 MPI_Barrier( comm );
554
555 for ( int i = 0; i < _max_optimize_iteration; ++i )
556 {
557 bool is_changed = false; // record changes in current iteration
558 bool dim_covered[3] = { false, false, false };
559 for ( int d = 0; d < 3; ++d )
560 {
561 int random_dim_id = std::rand() % num_space_dim;
562 while ( dim_covered[random_dim_id] )
563 random_dim_id = std::rand() % num_space_dim;
564
565 bool is_dim_changed = false; // record changes in current dim
566 optimizePartition( is_dim_changed, random_dim_id );
567
568 // update control info
569 is_changed = is_changed || is_dim_changed;
570 dim_covered[random_dim_id] = true;
571 }
572 // return if the current partition is optimal
573 if ( !is_changed )
574 return i;
575 }
576 return _max_optimize_iteration;
577 }
578
585 void optimizePartition( bool& is_changed, int iter_seed )
586 {
587 is_changed = false;
588 // loop over three dimensions, optimize the partition in dimension di
589 for ( int iter_id = iter_seed;
590 iter_id < iter_seed + static_cast<int>( num_space_dim );
591 ++iter_id )
592 {
593 int di = iter_id % num_space_dim;
594 // compute the dimensions that should be fixed (dj and dk)
595 int dj = ( di + 1 ) % num_space_dim;
596 int dk = ( di + 2 ) % num_space_dim;
597 auto rank_k = _ranks_per_dim[dk];
598
599 auto rank = _ranks_per_dim[di];
600 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
601 Kokkos::HostSpace(), _rectangle_partition_dev );
602 auto rec_partition = _rectangle_partition_dev;
603
605 compute_sub_workload( _rectangle_partition_dev,
606 _workload_prefix_sum );
607
608 // compute average workload in the dimension di
609 Kokkos::View<int*, memory_space> ave_workload(
610 "ave_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
611 Kokkos::parallel_for(
612 "Cabana::Grid::SparseDimPartitioner::computeAverageWorkLoad",
613 Kokkos::RangePolicy<execution_space>(
614 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
615 KOKKOS_LAMBDA( uint32_t jnk ) {
616 // compute rank_id in the fixed dimensions
617 int j = static_cast<int>( jnk / rank_k );
618 int k = static_cast<int>( jnk % rank_k );
619 // compute the average workload with the partition of the
620 // two fixed dimensions
621 ave_workload( jnk ) =
622 compute_sub_workload( di, 0, rec_partition( rank, di ),
623 dj, j, dk, k ) /
624 rank;
625 } );
626 Kokkos::fence();
627
628 // point_i: current partition position
629 int point_i = 1;
630 // equal_start_point: register the beginning pos of potentially
631 // equivalent partitions
632 int equal_start_point = 1;
633 // last_point: the optimized position for the lask partition
634 int last_point = 0;
635 // current_workload: the workload between [last_point, point_i)
636 Kokkos::View<int*, memory_space> current_workload(
637 "current_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
638 for ( int current_rank = 1; current_rank < rank; current_rank++ )
639 {
640 int last_diff = __INT_MAX__;
641 while ( true )
642 {
643 // compute current workload between [last_point, point_i)
644 Kokkos::parallel_for(
645 "Cabana::Grid::SparseDimPartitioner::"
646 "computeCurrentWorkLoad",
647 Kokkos::RangePolicy<execution_space>(
648 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
649 KOKKOS_LAMBDA( uint32_t jnk ) {
650 int j = static_cast<int>( jnk / rank_k );
651 int k = static_cast<int>( jnk % rank_k );
652 current_workload( jnk ) = compute_sub_workload(
653 di, last_point, point_i, dj, j, dk, k );
654 } );
655 Kokkos::fence();
656
657 // compute the (w_jk^ave - w_jk^{last_point:point_i})
658 Kokkos::parallel_for(
659 "Cabana::Grid::SparseDimPartitioner::computeDifference",
660 Kokkos::RangePolicy<execution_space>(
661 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
662 KOKKOS_LAMBDA( uint32_t jnk ) {
663 auto wl =
664 current_workload( jnk ) - ave_workload( jnk );
665 // compute absolute diff (rather than squares to
666 // avoid potential overflow)
667 // TODO: update when Kokkos::abs() available
668 wl = wl > 0 ? wl : -wl;
669 current_workload( jnk ) = wl;
670 } );
671 Kokkos::fence();
672
673 // compute the sum of the difference in all rank_j*rank_k
674 // regions
675 int diff;
676 Kokkos::parallel_reduce(
677 "diff_reduce",
678 Kokkos::RangePolicy<execution_space>(
679 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
680 KOKKOS_LAMBDA( const int idx, int& update ) {
681 update += current_workload( idx );
682 },
683 diff );
684 Kokkos::fence();
685
686 // record the new optimal position
687 if ( diff <= last_diff )
688 {
689 // register starting points of potentially equivalent
690 // partitions
691 if ( diff != last_diff )
692 equal_start_point = point_i;
693
694 // check if point_i reach the total_tile_num
695 if ( point_i == rec_mirror( rank, di ) )
696 {
697 rec_mirror( current_rank, di ) = point_i;
698 break;
699 }
700
701 last_diff = diff;
702 point_i++;
703 }
704 else
705 {
706 // final optimal position - middle position of all
707 // potentially equivalent partitions
708 if ( rec_mirror( current_rank, di ) !=
709 ( point_i - 1 + equal_start_point ) / 2 )
710 {
711 rec_mirror( current_rank, di ) =
712 ( point_i - 1 + equal_start_point ) / 2;
713 is_changed = true;
714 }
715 last_point = point_i - 1;
716 break;
717 }
718 } // end while (optimization for the current rank)
719 } // end for (all partition/rank in the optimized dimension)
720 Kokkos::deep_copy( _rectangle_partition_dev, rec_mirror );
721 } // end for (3 dimensions)
722 }
723
729 int currentRankWorkload( MPI_Comm cart_comm )
730 {
731 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
732 Kokkos::HostSpace(), _rectangle_partition_dev );
733 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
734 Kokkos::HostSpace(), _workload_prefix_sum );
735
736 return currentRankWorkload( cart_comm, rec_mirror, prefix_sum_mirror );
737 }
738
746 template <typename PartitionViewHost, typename WorkloadViewHost>
747 int currentRankWorkload( MPI_Comm cart_comm, PartitionViewHost& rec_view,
748 WorkloadViewHost& prefix_sum_view )
749 {
751 compute_sub_workload_host( rec_view, prefix_sum_view );
752
753 // Get the Cartesian topology index of this rank.
754 Kokkos::Array<int, num_space_dim> cart_rank;
755 int linear_rank;
756 MPI_Comm_rank( cart_comm, &linear_rank );
757 MPI_Cart_coords( cart_comm, linear_rank, num_space_dim,
758 cart_rank.data() );
759
760 // compute total workload of the current rank
761 int workload_current_rank = compute_sub_workload_host(
762 0, rec_view( cart_rank[0], 0 ), rec_view( cart_rank[0] + 1, 0 ), 1,
763 cart_rank[1], 2, cart_rank[2] );
764
765 return workload_current_rank;
766 }
767
773 {
774 auto prefix_sum_view = Kokkos::create_mirror_view_and_copy(
775 Kokkos::HostSpace(), _workload_prefix_sum );
776 // compute total workload of the current rank
777 return averageRankWorkload( prefix_sum_view );
778 }
779
785 template <typename WorkloadViewHost>
786 int averageRankWorkload( WorkloadViewHost& prefix_sum_view )
787 {
788 // compute total workload of the current rank
789 return prefix_sum_view( prefix_sum_view.extent( 0 ) - 1,
790 prefix_sum_view.extent( 1 ) - 1,
791 prefix_sum_view.extent( 2 ) - 1 ) /
792 ( _ranks_per_dim[0] * _ranks_per_dim[1] * _ranks_per_dim[2] );
793 }
794
800 float computeImbalanceFactor( MPI_Comm cart_comm )
801 {
802 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
803 Kokkos::HostSpace(), _rectangle_partition_dev );
804 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
805 Kokkos::HostSpace(), _workload_prefix_sum );
806
807 int workload_current_rank =
808 currentRankWorkload( cart_comm, rec_mirror, prefix_sum_mirror );
809 int workload_ave_rank = averageRankWorkload( prefix_sum_mirror );
810
811 return static_cast<float>( workload_current_rank ) /
812 static_cast<float>( workload_ave_rank );
813 }
814
819 template <typename PartitionView, typename WorkloadView>
821 {
823 PartitionView rec_partition;
826
        //! \brief Constructor.
        //! \param rec_par rectilinear partition view (device).
        //! \param pre_sum workload prefix-sum view (device).
        SubWorkloadFunctor( PartitionView rec_par, WorkloadView pre_sum )
            : rec_partition( rec_par )
            , workload_prefix_sum( pre_sum )
        {
        }
833
        //! \brief Compute the total workload inside the 3D sub-region
        //! spanning tiles [i_start, i_end) in dimension dim_i, and the
        //! partition ranges of rank j / rank k in dimensions dim_j / dim_k.
        //! Answered in O(1) via 3D inclusion-exclusion on the prefix-sum
        //! matrix.
        //! \param dim_i dimension of the explicit [i_start, i_end) range.
        //! \param i_start sub-region start (tile index) in dim_i.
        //! \param i_end sub-region end (tile index, exclusive) in dim_i.
        //! \param dim_j first fixed dimension.
        //! \param j rank id in dim_j whose partition range bounds the region.
        //! \param dim_k second fixed dimension.
        //! \param k rank id in dim_k whose partition range bounds the region.
        //! \return summed workload of the sub-region.
        KOKKOS_INLINE_FUNCTION int operator()( int dim_i, int i_start,
                                               int i_end, int dim_j, int j,
                                               int dim_k, int k ) const
        {
            // Region bounds per dimension in tile units; the prefix sum is
            // stored shifted by one, so a "start" index addresses the plane
            // just before the region.
            int end[num_space_dim], start[num_space_dim];
            end[dim_i] = i_end;
            end[dim_j] = rec_partition( j + 1, dim_j );
            end[dim_k] = rec_partition( k + 1, dim_k );

            start[dim_i] = i_start;
            start[dim_j] = rec_partition( j, dim_j );
            start[dim_k] = rec_partition( k, dim_k );

            // S[i][j][k] = S[i-1][j][k] + S[i][j-1][k] + S[i][j][k-1] -
            // S[i-1][j-1][k]
            // - S[i][j-1][k-1] - S[i-1][j][k-1] + S[i-1][j-1][k-1] + a[i][j][k]
            return workload_prefix_sum( end[0], end[1], end[2] ) // S[i][j][k]
                   - workload_prefix_sum( start[0], end[1],
                                          end[2] ) // S[i-1][j][k]
                   - workload_prefix_sum( end[0], start[1],
                                          end[2] ) // S[i][j-1][k]
                   - workload_prefix_sum( end[0], end[1],
                                          start[2] ) // S[i][j][k-1]
                   + workload_prefix_sum( start[0], start[1],
                                          end[2] ) // S[i-1][j-1][k]
                   + workload_prefix_sum( end[0], start[1],
                                          start[2] ) // S[i][j-1][k-1]
                   + workload_prefix_sum( start[0], end[1],
                                          start[2] ) // S[i-1][j][k-1]
                   - workload_prefix_sum( start[0], start[1],
                                          start[2] ); // S[i-1][j-1][k-1]
        }
870 };
871
  private:
    // Workload threshold, set to max_workload_coeff * workload_num at
    // construction; presumably the re-balance trigger — its consumer is not
    // in this chunk, TODO confirm.
    int _workload_threshold;
    // default check point for re-balance
    int _num_step_rebalance;
    // Cap on the number of optimization iterations per optimizePartition
    // call.
    int _max_optimize_iteration;

    // represent the rectangle partition in each dimension
    // with form [0, p_1, ..., p_n, cell_num], n = rank num in current
    // dimension, partition in this dimension would be [0, p_1), [p_1, p_2) ...
    // [p_n, total_tile_num] (unit: tile)
    partition_view _rectangle_partition_dev;
    // Workload of each tile on the current MPI rank (device view; indices
    // shifted by +1 relative to tile coordinates).
    workload_view _workload_per_tile;
    // 3D prefix sum of the all-reduced per-tile workload (device view).
    workload_view _workload_prefix_sum;
    // Number of MPI ranks in each dimension.
    Kokkos::Array<int, num_space_dim> _ranks_per_dim;
891
892 void allocate( const std::array<int, num_space_dim>& global_cells_per_dim )
893 {
894 _workload_per_tile = workload_view(
895 Kokkos::view_alloc( Kokkos::WithoutInitializing,
896 "workload_per_tile" ),
897 ( global_cells_per_dim[0] >> cell_bits_per_tile_dim ) + 1,
898 ( global_cells_per_dim[1] >> cell_bits_per_tile_dim ) + 1,
899 ( global_cells_per_dim[2] >> cell_bits_per_tile_dim ) + 1 );
900
901 _workload_prefix_sum = workload_view(
902 Kokkos::view_alloc( Kokkos::WithoutInitializing,
903 "workload_prefix_sum" ),
904 ( global_cells_per_dim[0] >> cell_bits_per_tile_dim ) + 1,
905 ( global_cells_per_dim[1] >> cell_bits_per_tile_dim ) + 1,
906 ( global_cells_per_dim[2] >> cell_bits_per_tile_dim ) + 1 );
907 }
908};
909} // namespace Grid
910} // namespace Cabana
911
912#endif // end CABANA_GRID_SPARSEDIMPARTITIONER_HPP
Multi-node grid partitioner.
KOKKOS_INLINE_FUNCTION constexpr Integer bitCount(Integer input_int) noexcept
(Host/Device) Compute the least number of bits needed to index the input integer
Definition Cabana_Grid_SparseIndexSpace.hpp:80
Cabana utilities.
Block partitioner base class.
Definition Cabana_Grid_Partitioner.hpp:37
void resetWorkload()
set all elements in _workload_per_tile and _workload_prefix_sum matrix to 0
Definition Cabana_Grid_SparseDimPartitioner.hpp:336
int currentRankWorkload(MPI_Comm cart_comm, PartitionViewHost &rec_view, WorkloadViewHost &prefix_sum_view)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:747
int optimizePartition(const SparseMapType &sparseMap, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:547
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - automatically compute ranks_per_dim from MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:89
std::array< std::vector< int >, num_space_dim > getCurrentPartition()
Get the current partition. Copy partition from the device view to host std::array<vector>
Definition Cabana_Grid_SparseDimPartitioner.hpp:316
Kokkos::View< int ***, memory_space > workload_view
Workload device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:57
void computeFullPrefixSum(MPI_Comm comm)
reduce the total workload in all MPI ranks; 2. compute the workload prefix sum matrix for all MPI ran...
Definition Cabana_Grid_SparseDimPartitioner.hpp:415
void initializeRecPartition(std::vector< int > &rec_partition_i, std::vector< int > &rec_partition_j, std::vector< int > &rec_partition_k)
Initialize the tile partition; partition in each dimension has the form [0, p_1, ....
Definition Cabana_Grid_SparseDimPartitioner.hpp:286
void ownedTileInfo(MPI_Comm cart_comm, std::array< int, num_space_dim > &owned_num_tile, std::array< int, num_space_dim > &global_tile_offset) const
Get the owned number of tiles and the global tile offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:233
static constexpr std::size_t num_space_dim
dimension
Definition Cabana_Grid_SparseDimPartitioner.hpp:47
int averageRankWorkload()
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:772
Kokkos::View< int *[num_space_dim], typename execution_space::array_layout, Kokkos::HostSpace > partition_view_host
Partition host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:65
float computeImbalanceFactor(MPI_Comm cart_comm)
compute the imbalance factor for the current partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:800
MemorySpace memory_space
Kokkos memory space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:50
static constexpr unsigned long long cell_num_per_tile_dim
Definition Cabana_Grid_SparseDimPartitioner.hpp:74
typename memory_space::execution_space execution_space
Default execution space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:54
int averageRankWorkload(WorkloadViewHost &prefix_sum_view)
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:786
void computeLocalWorkLoad(const SparseMapType &sparseMap)
compute the workload in the current MPI rank from sparseMap (the workload of a tile is 1 if the tile ...
Definition Cabana_Grid_SparseDimPartitioner.hpp:390
int currentRankWorkload(MPI_Comm cart_comm)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:729
void ownedCellInfo(MPI_Comm cart_comm, const std::array< int, num_space_dim > &, std::array< int, num_space_dim > &owned_num_cell, std::array< int, num_space_dim > &global_cell_offset) const override
Get the owned number of cells and the global cell offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:264
void optimizePartition(bool &is_changed, int iter_seed)
optimize the partition in three dimensions separately
Definition Cabana_Grid_SparseDimPartitioner.hpp:585
void computeLocalWorkLoad(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx)
compute the workload in the current MPI rank from particle positions (each particle count for 1 workl...
Definition Cabana_Grid_SparseDimPartitioner.hpp:352
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm, const std::array< int, num_space_dim > &) const override
Get the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:166
std::array< int, num_space_dim > ownedTilesPerDimension(MPI_Comm cart_comm) const
Get the tile number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:188
Kokkos::View< int ***, typename execution_space::array_layout, Kokkos::HostSpace > workload_view_host
Workload host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:61
std::array< int, num_space_dim > ownedCellsPerDimension(MPI_Comm cart_comm) const
Get the cell number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:212
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &ranks_per_dim, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - user-defined ranks_per_dim communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:118
Kokkos::View< int *[num_space_dim], memory_space > partition_view
Partition device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:59
static constexpr unsigned long long cell_bits_per_tile_dim
Number of bits (per dimension) needed to index the cells inside a tile.
Definition Cabana_Grid_SparseDimPartitioner.hpp:70
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm)
Compute the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:144
int optimizePartition(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:504
Core: particle data structures and algorithms.
Definition Cabana_AoSoA.hpp:36
functor to compute the sub workload in a given region (from the prefix sum)
Definition Cabana_Grid_SparseDimPartitioner.hpp:821
PartitionView rec_partition
Rectilinear partition.
Definition Cabana_Grid_SparseDimPartitioner.hpp:823
KOKKOS_INLINE_FUNCTION int operator()(int dim_i, int i_start, int i_end, int dim_j, int j, int dim_k, int k) const
Definition Cabana_Grid_SparseDimPartitioner.hpp:838
SubWorkloadFunctor(PartitionView rec_par, WorkloadView pre_sum)
Constructor.
Definition Cabana_Grid_SparseDimPartitioner.hpp:828
WorkloadView workload_prefix_sum
Workload prefix sum matrix.
Definition Cabana_Grid_SparseDimPartitioner.hpp:825