16#ifndef CABANA_GRID_SPARSEDIMPARTITIONER_HPP
17#define CABANA_GRID_SPARSEDIMPARTITIONER_HPP
23#include <Kokkos_Core.hpp>
41template <
typename MemorySpace,
unsigned long long CellPerTileDim = 4,
42 std::size_t NumSpaceDim = 3>
51 static_assert( Kokkos::is_memory_space<MemorySpace>() );
62 Kokkos::View<
int***,
typename execution_space::array_layout,
67 typename execution_space::array_layout, Kokkos::HostSpace>;
90 MPI_Comm comm,
float max_workload_coeff,
int workload_num,
91 int num_step_rebalance,
92 const std::array<int, num_space_dim>& global_cells_per_dim,
93 int max_optimize_iteration = 10 )
94 : _workload_threshold(
95 static_cast<int>( max_workload_coeff * workload_num ) )
96 , _num_step_rebalance( num_step_rebalance )
97 , _max_optimize_iteration( max_optimize_iteration )
100 allocate( global_cells_per_dim );
119 MPI_Comm comm,
float max_workload_coeff,
int workload_num,
120 int num_step_rebalance,
121 const std::array<int, num_space_dim>& ranks_per_dim,
122 const std::array<int, num_space_dim>& global_cells_per_dim,
123 int max_optimize_iteration = 10 )
124 : _workload_threshold(
125 static_cast<int>( max_workload_coeff * workload_num ) )
126 , _num_step_rebalance( num_step_rebalance )
127 , _max_optimize_iteration( max_optimize_iteration )
129 allocate( global_cells_per_dim );
130 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
131 _ranks_per_dim.data() );
135 MPI_Comm_size( comm, &comm_size );
136 MPI_Dims_create( comm_size,
num_space_dim, _ranks_per_dim.data() );
147 MPI_Comm_size( comm, &comm_size );
149 std::array<int, num_space_dim> ranks_per_dim;
151 ranks_per_dim[d] = 0;
152 MPI_Dims_create( comm_size,
num_space_dim, ranks_per_dim.data() );
154 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
155 _ranks_per_dim.data() );
157 return ranks_per_dim;
165 std::array<int, num_space_dim>
167 const std::array<int, num_space_dim>& )
const override
169 std::array<int, num_space_dim> ranks_per_dim = {
170 _ranks_per_dim[0], _ranks_per_dim[1], _ranks_per_dim[2] };
172 MPI_Comm_size( comm, &comm_size );
175 nrank *= _ranks_per_dim[d];
176 if ( comm_size != nrank )
177 throw std::runtime_error(
178 "Cabana::Grid::SparseDimPartitioner::ranksPerDimension: "
179 "SparseDimPartitioner ranks do not match comm size" );
180 return ranks_per_dim;
187 std::array<int, num_space_dim>
191 std::array<int, num_space_dim> cart_rank;
193 MPI_Comm_rank( cart_comm, &linear_rank );
198 std::array<int, num_space_dim> tiles_per_dim;
199 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
200 Kokkos::HostSpace(), _rectangle_partition_dev );
202 tiles_per_dim[d] = rec_mirror( cart_rank[d] + 1, d ) -
203 rec_mirror( cart_rank[d], d );
204 return tiles_per_dim;
211 std::array<int, num_space_dim>
220 return tiles_per_dim;
234 std::array<int, num_space_dim>& owned_num_tile,
235 std::array<int, num_space_dim>& global_tile_offset )
const
238 std::array<int, num_space_dim> cart_rank;
240 MPI_Comm_rank( cart_comm, &linear_rank );
245 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
246 Kokkos::HostSpace(), _rectangle_partition_dev );
249 owned_num_tile[d] = rec_mirror( cart_rank[d] + 1, d ) -
250 rec_mirror( cart_rank[d], d );
251 global_tile_offset[d] = rec_mirror( cart_rank[d], d );
265 MPI_Comm cart_comm,
const std::array<int, num_space_dim>&,
266 std::array<int, num_space_dim>& owned_num_cell,
267 std::array<int, num_space_dim>& global_cell_offset )
const override
269 ownedTileInfo( cart_comm, owned_num_cell, global_cell_offset );
287 std::vector<int>& rec_partition_j,
288 std::vector<int>& rec_partition_k )
293 max_size < _ranks_per_dim[d] ? _ranks_per_dim[d] : max_size;
295 typedef typename execution_space::array_layout layout;
296 Kokkos::View<int* [num_space_dim], layout, Kokkos::HostSpace>
297 rectangle_partition(
"rectangle_partition_host", max_size + 1 );
299 for (
int id = 0;
id < _ranks_per_dim[0] + 1; ++id )
300 rectangle_partition(
id, 0 ) = rec_partition_i[id];
302 for (
int id = 0;
id < _ranks_per_dim[1] + 1; ++id )
303 rectangle_partition(
id, 1 ) = rec_partition_j[id];
305 for (
int id = 0;
id < _ranks_per_dim[2] + 1; ++id )
306 rectangle_partition(
id, 2 ) = rec_partition_k[id];
308 _rectangle_partition_dev = Kokkos::create_mirror_view_and_copy(
319 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
320 Kokkos::HostSpace(), _rectangle_partition_dev );
323 rec_part[d].resize( _ranks_per_dim[d] + 1 );
324 for (
int id = 0;
id < _ranks_per_dim[d] + 1; ++id )
326 rec_part[d][id] = rec_mirror(
id, d );
338 Kokkos::deep_copy( _workload_per_tile, 0 );
339 Kokkos::deep_copy( _workload_prefix_sum, 0 );
351 template <
class ParticlePosViewType,
typename ArrayType,
typename CellUnit>
354 const ArrayType& global_lower_corner,
359 auto workload = _workload_per_tile;
360 Kokkos::Array<CellUnit, num_space_dim> lower_corner;
363 lower_corner[d] = global_lower_corner[d];
366 Kokkos::parallel_for(
367 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadPosition",
368 Kokkos::RangePolicy<execution_space>( 0, particle_num ),
369 KOKKOS_LAMBDA(
const int i ) {
370 int ti =
static_cast<int>(
371 ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >>
373 int tj =
static_cast<int>(
374 ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >>
376 int tz =
static_cast<int>(
377 ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >>
379 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tz + 1 ) );
389 template <
class SparseMapType>
394 auto workload = _workload_per_tile;
395 Kokkos::parallel_for(
396 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadSparseMap",
397 Kokkos::RangePolicy<execution_space>( 0, sparseMap.capacity() ),
398 KOKKOS_LAMBDA( uint32_t i ) {
399 if ( sparseMap.valid_at( i ) )
401 auto key = sparseMap.key_at( i );
403 sparseMap.key2ijk( key, ti, tj, tk );
404 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tk + 1 ) );
418 auto workload = _workload_per_tile;
419 auto prefix_sum = _workload_prefix_sum;
420 int total_size = _workload_per_tile.size();
424 MPI_Allreduce( workload.data(), prefix_sum.data(), total_size, MPI_INT,
431 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) ); ++j )
433 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
435 Kokkos::parallel_scan(
436 "scan_prefix_sum_dim0",
437 Kokkos::RangePolicy<execution_space>(
438 0, _workload_prefix_sum.extent( 0 ) ),
439 KOKKOS_LAMBDA(
const int i,
int& update,
441 const float val_i = prefix_sum( i, j, k );
445 prefix_sum( i, j, k ) = update;
452 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
454 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
456 Kokkos::parallel_scan(
457 "scan_prefix_sum_dim1",
458 Kokkos::RangePolicy<execution_space>(
459 0, _workload_prefix_sum.extent( 1 ) ),
460 KOKKOS_LAMBDA(
const int j,
int& update,
462 const float val_i = prefix_sum( i, j, k );
466 prefix_sum( i, j, k ) = update;
473 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
475 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) );
477 Kokkos::parallel_scan(
478 "scan_prefix_sum_dim2",
479 Kokkos::RangePolicy<execution_space>(
480 0, _workload_prefix_sum.extent( 2 ) ),
481 KOKKOS_LAMBDA(
const int k,
int& update,
483 const float val_i = prefix_sum( i, j, k );
487 prefix_sum( i, j, k ) = update;
503 template <
class ParticlePosViewType,
typename ArrayType,
typename CellUnit>
505 const ArrayType& global_lower_corner,
506 const CellUnit dx, MPI_Comm comm )
516 for (
int i = 0; i < _max_optimize_iteration; ++i )
518 bool is_changed =
false;
519 bool dim_covered[3] = {
false,
false,
false };
520 for (
int d = 0; d < 3; ++d )
523 while ( dim_covered[random_dim_id] )
526 bool is_dim_changed =
false;
530 is_changed = is_changed || is_dim_changed;
531 dim_covered[random_dim_id] =
true;
537 return _max_optimize_iteration;
546 template <
class SparseMapType>
555 for (
int i = 0; i < _max_optimize_iteration; ++i )
557 bool is_changed =
false;
558 bool dim_covered[3] = {
false,
false,
false };
559 for (
int d = 0; d < 3; ++d )
562 while ( dim_covered[random_dim_id] )
565 bool is_dim_changed =
false;
569 is_changed = is_changed || is_dim_changed;
570 dim_covered[random_dim_id] =
true;
576 return _max_optimize_iteration;
589 for (
int iter_id = iter_seed;
597 auto rank_k = _ranks_per_dim[dk];
599 auto rank = _ranks_per_dim[di];
600 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
601 Kokkos::HostSpace(), _rectangle_partition_dev );
602 auto rec_partition = _rectangle_partition_dev;
605 compute_sub_workload( _rectangle_partition_dev,
606 _workload_prefix_sum );
609 Kokkos::View<int*, memory_space> ave_workload(
610 "ave_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
611 Kokkos::parallel_for(
612 "Cabana::Grid::SparseDimPartitioner::computeAverageWorkLoad",
613 Kokkos::RangePolicy<execution_space>(
614 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
615 KOKKOS_LAMBDA( uint32_t jnk ) {
617 int j =
static_cast<int>( jnk / rank_k );
618 int k =
static_cast<int>( jnk % rank_k );
621 ave_workload( jnk ) =
622 compute_sub_workload( di, 0, rec_partition( rank, di ),
632 int equal_start_point = 1;
636 Kokkos::View<int*, memory_space> current_workload(
637 "current_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
638 for (
int current_rank = 1; current_rank < rank; current_rank++ )
640 int last_diff = __INT_MAX__;
644 Kokkos::parallel_for(
645 "Cabana::Grid::SparseDimPartitioner::"
646 "computeCurrentWorkLoad",
647 Kokkos::RangePolicy<execution_space>(
648 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
649 KOKKOS_LAMBDA( uint32_t jnk ) {
650 int j =
static_cast<int>( jnk / rank_k );
651 int k =
static_cast<int>( jnk % rank_k );
652 current_workload( jnk ) = compute_sub_workload(
653 di, last_point, point_i, dj, j, dk, k );
658 Kokkos::parallel_for(
659 "Cabana::Grid::SparseDimPartitioner::computeDifference",
660 Kokkos::RangePolicy<execution_space>(
661 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
662 KOKKOS_LAMBDA( uint32_t jnk ) {
664 current_workload( jnk ) - ave_workload( jnk );
668 wl = wl > 0 ? wl : -wl;
669 current_workload( jnk ) = wl;
676 Kokkos::parallel_reduce(
678 Kokkos::RangePolicy<execution_space>(
679 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
680 KOKKOS_LAMBDA(
const int idx,
int& update ) {
681 update += current_workload( idx );
687 if ( diff <= last_diff )
691 if ( diff != last_diff )
692 equal_start_point = point_i;
695 if ( point_i == rec_mirror( rank, di ) )
697 rec_mirror( current_rank, di ) = point_i;
708 if ( rec_mirror( current_rank, di ) !=
709 ( point_i - 1 + equal_start_point ) / 2 )
711 rec_mirror( current_rank, di ) =
712 ( point_i - 1 + equal_start_point ) / 2;
715 last_point = point_i - 1;
720 Kokkos::deep_copy( _rectangle_partition_dev, rec_mirror );
731 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
732 Kokkos::HostSpace(), _rectangle_partition_dev );
733 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
734 Kokkos::HostSpace(), _workload_prefix_sum );
746 template <
typename PartitionViewHost,
typename WorkloadViewHost>
748 WorkloadViewHost& prefix_sum_view )
751 compute_sub_workload_host( rec_view, prefix_sum_view );
754 Kokkos::Array<int, num_space_dim> cart_rank;
756 MPI_Comm_rank( cart_comm, &linear_rank );
761 int workload_current_rank = compute_sub_workload_host(
762 0, rec_view( cart_rank[0], 0 ), rec_view( cart_rank[0] + 1, 0 ), 1,
763 cart_rank[1], 2, cart_rank[2] );
765 return workload_current_rank;
774 auto prefix_sum_view = Kokkos::create_mirror_view_and_copy(
775 Kokkos::HostSpace(), _workload_prefix_sum );
785 template <
typename WorkloadViewHost>
789 return prefix_sum_view( prefix_sum_view.extent( 0 ) - 1,
790 prefix_sum_view.extent( 1 ) - 1,
791 prefix_sum_view.extent( 2 ) - 1 ) /
792 ( _ranks_per_dim[0] * _ranks_per_dim[1] * _ranks_per_dim[2] );
802 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
803 Kokkos::HostSpace(), _rectangle_partition_dev );
804 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
805 Kokkos::HostSpace(), _workload_prefix_sum );
807 int workload_current_rank =
811 return static_cast<float>( workload_current_rank ) /
812 static_cast<float>( workload_ave_rank );
819 template <
typename PartitionView,
typename WorkloadView>
838 KOKKOS_INLINE_FUNCTION
int operator()(
int dim_i,
int i_start,
839 int i_end,
int dim_j,
int j,
840 int dim_k,
int k )
const
847 start[dim_i] = i_start;
874 int _workload_threshold;
876 int _num_step_rebalance;
878 int _max_optimize_iteration;
890 Kokkos::Array<int, num_space_dim> _ranks_per_dim;
892 void allocate(
const std::array<int, num_space_dim>& global_cells_per_dim )
895 Kokkos::view_alloc( Kokkos::WithoutInitializing,
896 "workload_per_tile" ),
902 Kokkos::view_alloc( Kokkos::WithoutInitializing,
903 "workload_prefix_sum" ),
Multi-node grid partitioner.
KOKKOS_INLINE_FUNCTION constexpr Integer bitCount(Integer input_int) noexcept
(Host/Device) Compute the least number of bits needed to index the input integer
Definition Cabana_Grid_SparseIndexSpace.hpp:80
Block partitioner base class.
Definition Cabana_Grid_Partitioner.hpp:37
void resetWorkload()
set all elements in _workload_per_tile and _workload_prefix_sum matrix to 0
Definition Cabana_Grid_SparseDimPartitioner.hpp:336
int currentRankWorkload(MPI_Comm cart_comm, PartitionViewHost &rec_view, WorkloadViewHost &prefix_sum_view)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:747
int optimizePartition(const SparseMapType &sparseMap, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:547
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - automatically compute ranks_per_dim from MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:89
std::array< std::vector< int >, num_space_dim > getCurrentPartition()
Get the current partition. Copy partition from the device view to host std::array<vector>
Definition Cabana_Grid_SparseDimPartitioner.hpp:316
Kokkos::View< int ***, memory_space > workload_view
Workload device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:57
void computeFullPrefixSum(MPI_Comm comm)
1. reduce the total workload over all MPI ranks; 2. compute the workload prefix sum matrix for all MPI ranks
Definition Cabana_Grid_SparseDimPartitioner.hpp:415
void initializeRecPartition(std::vector< int > &rec_partition_i, std::vector< int > &rec_partition_j, std::vector< int > &rec_partition_k)
Initialize the tile partition; partition in each dimension has the form [0, p_1, ....
Definition Cabana_Grid_SparseDimPartitioner.hpp:286
void ownedTileInfo(MPI_Comm cart_comm, std::array< int, num_space_dim > &owned_num_tile, std::array< int, num_space_dim > &global_tile_offset) const
Get the owned number of tiles and the global tile offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:233
static constexpr std::size_t num_space_dim
dimension
Definition Cabana_Grid_SparseDimPartitioner.hpp:47
int averageRankWorkload()
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:772
Kokkos::View< int *[num_space_dim], typename execution_space::array_layout, Kokkos::HostSpace > partition_view_host
Partition host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:65
float computeImbalanceFactor(MPI_Comm cart_comm)
compute the imbalance factor for the current partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:800
MemorySpace memory_space
Kokkos memory space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:50
static constexpr unsigned long long cell_num_per_tile_dim
Definition Cabana_Grid_SparseDimPartitioner.hpp:74
typename memory_space::execution_space execution_space
Default execution space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:54
int averageRankWorkload(WorkloadViewHost &prefix_sum_view)
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:786
void computeLocalWorkLoad(const SparseMapType &sparseMap)
compute the workload in the current MPI rank from sparseMap (the workload of a tile is 1 if the tile ...
Definition Cabana_Grid_SparseDimPartitioner.hpp:390
int currentRankWorkload(MPI_Comm cart_comm)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:729
void ownedCellInfo(MPI_Comm cart_comm, const std::array< int, num_space_dim > &, std::array< int, num_space_dim > &owned_num_cell, std::array< int, num_space_dim > &global_cell_offset) const override
Get the owned number of cells and the global cell offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:264
void optimizePartition(bool &is_changed, int iter_seed)
optimize the partition in three dimensions separately
Definition Cabana_Grid_SparseDimPartitioner.hpp:585
void computeLocalWorkLoad(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx)
compute the workload in the current MPI rank from particle positions (each particle counts as 1 workload)
Definition Cabana_Grid_SparseDimPartitioner.hpp:352
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm, const std::array< int, num_space_dim > &) const override
Get the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:166
std::array< int, num_space_dim > ownedTilesPerDimension(MPI_Comm cart_comm) const
Get the tile number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:188
Kokkos::View< int ***, typename execution_space::array_layout, Kokkos::HostSpace > workload_view_host
Workload host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:61
std::array< int, num_space_dim > ownedCellsPerDimension(MPI_Comm cart_comm) const
Get the cell number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:212
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &ranks_per_dim, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - user-defined ranks_per_dim communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:118
Kokkos::View< int *[num_space_dim], memory_space > partition_view
Partition device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:59
static constexpr unsigned long long cell_bits_per_tile_dim
Number of bits (per dimension) needed to index the cells inside a tile.
Definition Cabana_Grid_SparseDimPartitioner.hpp:70
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm)
Compute the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:144
int optimizePartition(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:504
Core: particle data structures and algorithms.
Definition Cabana_AoSoA.hpp:36
functor to compute the sub workload in a given region (from the prefix sum)
Definition Cabana_Grid_SparseDimPartitioner.hpp:821
PartitionView rec_partition
Rectilinear partition.
Definition Cabana_Grid_SparseDimPartitioner.hpp:823
KOKKOS_INLINE_FUNCTION int operator()(int dim_i, int i_start, int i_end, int dim_j, int j, int dim_k, int k) const
Definition Cabana_Grid_SparseDimPartitioner.hpp:838
SubWorkloadFunctor(PartitionView rec_par, WorkloadView pre_sum)
Constructor.
Definition Cabana_Grid_SparseDimPartitioner.hpp:828
WorkloadView workload_prefix_sum
Workload prefix sum matrix.
Definition Cabana_Grid_SparseDimPartitioner.hpp:825