16#ifndef CABANA_GRID_SPARSEDIMPARTITIONER_HPP
17#define CABANA_GRID_SPARSEDIMPARTITIONER_HPP
23#include <Kokkos_Core.hpp>
41template <
typename MemorySpace,
unsigned long long CellPerTileDim = 4,
42 std::size_t NumSpaceDim = 3>
55 Cabana::Impl::deprecated( Kokkos::is_device<MemorySpace>() ) );
58 using device_type [[deprecated]] =
typename memory_space::device_type;
68 Kokkos::View<
int***,
typename execution_space::array_layout,
73 typename execution_space::array_layout, Kokkos::HostSpace>;
96 MPI_Comm comm,
float max_workload_coeff,
int workload_num,
97 int num_step_rebalance,
98 const std::array<int, num_space_dim>& global_cells_per_dim,
99 int max_optimize_iteration = 10 )
100 : _workload_threshold(
101 static_cast<int>( max_workload_coeff * workload_num ) )
102 , _num_step_rebalance( num_step_rebalance )
103 , _max_optimize_iteration( max_optimize_iteration )
106 allocate( global_cells_per_dim );
125 MPI_Comm comm,
float max_workload_coeff,
int workload_num,
126 int num_step_rebalance,
127 const std::array<int, num_space_dim>& ranks_per_dim,
128 const std::array<int, num_space_dim>& global_cells_per_dim,
129 int max_optimize_iteration = 10 )
130 : _workload_threshold(
131 static_cast<int>( max_workload_coeff * workload_num ) )
132 , _num_step_rebalance( num_step_rebalance )
133 , _max_optimize_iteration( max_optimize_iteration )
135 allocate( global_cells_per_dim );
136 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
137 _ranks_per_dim.data() );
141 MPI_Comm_size( comm, &comm_size );
142 MPI_Dims_create( comm_size,
num_space_dim, _ranks_per_dim.data() );
153 MPI_Comm_size( comm, &comm_size );
155 std::array<int, num_space_dim> ranks_per_dim;
157 ranks_per_dim[d] = 0;
158 MPI_Dims_create( comm_size,
num_space_dim, ranks_per_dim.data() );
160 std::copy( ranks_per_dim.begin(), ranks_per_dim.end(),
161 _ranks_per_dim.data() );
163 return ranks_per_dim;
171 std::array<int, num_space_dim>
173 const std::array<int, num_space_dim>& )
const override
175 std::array<int, num_space_dim> ranks_per_dim = {
176 _ranks_per_dim[0], _ranks_per_dim[1], _ranks_per_dim[2] };
178 MPI_Comm_size( comm, &comm_size );
181 nrank *= _ranks_per_dim[d];
182 if ( comm_size != nrank )
183 throw std::runtime_error(
184 "Cabana::Grid::SparseDimPartitioner::ranksPerDimension: "
185 "SparseDimPartitioner ranks do not match comm size" );
186 return ranks_per_dim;
193 std::array<int, num_space_dim>
197 std::array<int, num_space_dim> cart_rank;
199 MPI_Comm_rank( cart_comm, &linear_rank );
204 std::array<int, num_space_dim> tiles_per_dim;
205 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
206 Kokkos::HostSpace(), _rectangle_partition_dev );
208 tiles_per_dim[d] = rec_mirror( cart_rank[d] + 1, d ) -
209 rec_mirror( cart_rank[d], d );
210 return tiles_per_dim;
217 std::array<int, num_space_dim>
226 return tiles_per_dim;
240 std::array<int, num_space_dim>& owned_num_tile,
241 std::array<int, num_space_dim>& global_tile_offset )
const
244 std::array<int, num_space_dim> cart_rank;
246 MPI_Comm_rank( cart_comm, &linear_rank );
251 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
252 Kokkos::HostSpace(), _rectangle_partition_dev );
255 owned_num_tile[d] = rec_mirror( cart_rank[d] + 1, d ) -
256 rec_mirror( cart_rank[d], d );
257 global_tile_offset[d] = rec_mirror( cart_rank[d], d );
271 MPI_Comm cart_comm,
const std::array<int, num_space_dim>&,
272 std::array<int, num_space_dim>& owned_num_cell,
273 std::array<int, num_space_dim>& global_cell_offset )
const override
275 ownedTileInfo( cart_comm, owned_num_cell, global_cell_offset );
293 std::vector<int>& rec_partition_j,
294 std::vector<int>& rec_partition_k )
299 max_size < _ranks_per_dim[d] ? _ranks_per_dim[d] : max_size;
301 typedef typename execution_space::array_layout layout;
302 Kokkos::View<int* [num_space_dim], layout, Kokkos::HostSpace>
303 rectangle_partition(
"rectangle_partition_host", max_size + 1 );
305 for (
int id = 0;
id < _ranks_per_dim[0] + 1; ++id )
306 rectangle_partition(
id, 0 ) = rec_partition_i[id];
308 for (
int id = 0;
id < _ranks_per_dim[1] + 1; ++id )
309 rectangle_partition(
id, 1 ) = rec_partition_j[id];
311 for (
int id = 0;
id < _ranks_per_dim[2] + 1; ++id )
312 rectangle_partition(
id, 2 ) = rec_partition_k[id];
314 _rectangle_partition_dev = Kokkos::create_mirror_view_and_copy(
325 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
326 Kokkos::HostSpace(), _rectangle_partition_dev );
329 rec_part[d].resize( _ranks_per_dim[d] + 1 );
330 for (
int id = 0;
id < _ranks_per_dim[d] + 1; ++id )
332 rec_part[d][id] = rec_mirror(
id, d );
344 Kokkos::deep_copy( _workload_per_tile, 0 );
345 Kokkos::deep_copy( _workload_prefix_sum, 0 );
357 template <
class ParticlePosViewType,
typename ArrayType,
typename CellUnit>
360 const ArrayType& global_lower_corner,
365 auto workload = _workload_per_tile;
366 Kokkos::Array<CellUnit, num_space_dim> lower_corner;
369 lower_corner[d] = global_lower_corner[d];
372 Kokkos::parallel_for(
373 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadPosition",
374 Kokkos::RangePolicy<execution_space>( 0, particle_num ),
375 KOKKOS_LAMBDA(
const int i ) {
376 int ti =
static_cast<int>(
377 ( view( i, 0 ) - lower_corner[0] ) / dx - 0.5 ) >>
379 int tj =
static_cast<int>(
380 ( view( i, 1 ) - lower_corner[1] ) / dx - 0.5 ) >>
382 int tz =
static_cast<int>(
383 ( view( i, 2 ) - lower_corner[2] ) / dx - 0.5 ) >>
385 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tz + 1 ) );
395 template <
class SparseMapType>
400 auto workload = _workload_per_tile;
401 Kokkos::parallel_for(
402 "Cabana::Grid::SparseDimPartitioner::computeLocalWorkLoadSparseMap",
403 Kokkos::RangePolicy<execution_space>( 0, sparseMap.capacity() ),
404 KOKKOS_LAMBDA( uint32_t i ) {
405 if ( sparseMap.valid_at( i ) )
407 auto key = sparseMap.key_at( i );
409 sparseMap.key2ijk( key, ti, tj, tk );
410 Kokkos::atomic_inc( &workload( ti + 1, tj + 1, tk + 1 ) );
424 auto workload = _workload_per_tile;
425 auto prefix_sum = _workload_prefix_sum;
426 int total_size = _workload_per_tile.size();
430 MPI_Allreduce( workload.data(), prefix_sum.data(), total_size, MPI_INT,
437 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) ); ++j )
439 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
441 Kokkos::parallel_scan(
442 "scan_prefix_sum_dim0",
443 Kokkos::RangePolicy<execution_space>(
444 0, _workload_prefix_sum.extent( 0 ) ),
445 KOKKOS_LAMBDA(
const int i,
int& update,
447 const float val_i = prefix_sum( i, j, k );
451 prefix_sum( i, j, k ) = update;
458 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
460 k < static_cast<int>( _workload_prefix_sum.extent( 2 ) );
462 Kokkos::parallel_scan(
463 "scan_prefix_sum_dim1",
464 Kokkos::RangePolicy<execution_space>(
465 0, _workload_prefix_sum.extent( 1 ) ),
466 KOKKOS_LAMBDA(
const int j,
int& update,
468 const float val_i = prefix_sum( i, j, k );
472 prefix_sum( i, j, k ) = update;
479 i < static_cast<int>( _workload_prefix_sum.extent( 0 ) ); ++i )
481 j < static_cast<int>( _workload_prefix_sum.extent( 1 ) );
483 Kokkos::parallel_scan(
484 "scan_prefix_sum_dim2",
485 Kokkos::RangePolicy<execution_space>(
486 0, _workload_prefix_sum.extent( 2 ) ),
487 KOKKOS_LAMBDA(
const int k,
int& update,
489 const float val_i = prefix_sum( i, j, k );
493 prefix_sum( i, j, k ) = update;
509 template <
class ParticlePosViewType,
typename ArrayType,
typename CellUnit>
511 const ArrayType& global_lower_corner,
512 const CellUnit dx, MPI_Comm comm )
522 for (
int i = 0; i < _max_optimize_iteration; ++i )
524 bool is_changed =
false;
525 bool dim_covered[3] = {
false,
false,
false };
526 for (
int d = 0; d < 3; ++d )
529 while ( dim_covered[random_dim_id] )
532 bool is_dim_changed =
false;
536 is_changed = is_changed || is_dim_changed;
537 dim_covered[random_dim_id] =
true;
543 return _max_optimize_iteration;
552 template <
class SparseMapType>
561 for (
int i = 0; i < _max_optimize_iteration; ++i )
563 bool is_changed =
false;
564 bool dim_covered[3] = {
false,
false,
false };
565 for (
int d = 0; d < 3; ++d )
568 while ( dim_covered[random_dim_id] )
571 bool is_dim_changed =
false;
575 is_changed = is_changed || is_dim_changed;
576 dim_covered[random_dim_id] =
true;
582 return _max_optimize_iteration;
595 for (
int iter_id = iter_seed;
603 auto rank_k = _ranks_per_dim[dk];
605 auto rank = _ranks_per_dim[di];
606 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
607 Kokkos::HostSpace(), _rectangle_partition_dev );
608 auto rec_partition = _rectangle_partition_dev;
611 compute_sub_workload( _rectangle_partition_dev,
612 _workload_prefix_sum );
615 Kokkos::View<int*, memory_space> ave_workload(
616 "ave_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
617 Kokkos::parallel_for(
618 "Cabana::Grid::SparseDimPartitioner::computeAverageWorkLoad",
619 Kokkos::RangePolicy<execution_space>(
620 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
621 KOKKOS_LAMBDA( uint32_t jnk ) {
623 int j =
static_cast<int>( jnk / rank_k );
624 int k =
static_cast<int>( jnk % rank_k );
627 ave_workload( jnk ) =
628 compute_sub_workload( di, 0, rec_partition( rank, di ),
638 int equal_start_point = 1;
642 Kokkos::View<int*, memory_space> current_workload(
643 "current_workload", _ranks_per_dim[dj] * _ranks_per_dim[dk] );
644 for (
int current_rank = 1; current_rank < rank; current_rank++ )
646 int last_diff = __INT_MAX__;
650 Kokkos::parallel_for(
651 "Cabana::Grid::SparseDimPartitioner::"
652 "computeCurrentWorkLoad",
653 Kokkos::RangePolicy<execution_space>(
654 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
655 KOKKOS_LAMBDA( uint32_t jnk ) {
656 int j =
static_cast<int>( jnk / rank_k );
657 int k =
static_cast<int>( jnk % rank_k );
658 current_workload( jnk ) = compute_sub_workload(
659 di, last_point, point_i, dj, j, dk, k );
664 Kokkos::parallel_for(
665 "Cabana::Grid::SparseDimPartitioner::computeDifference",
666 Kokkos::RangePolicy<execution_space>(
667 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
668 KOKKOS_LAMBDA( uint32_t jnk ) {
670 current_workload( jnk ) - ave_workload( jnk );
674 wl = wl > 0 ? wl : -wl;
675 current_workload( jnk ) = wl;
682 Kokkos::parallel_reduce(
684 Kokkos::RangePolicy<execution_space>(
685 0, _ranks_per_dim[dj] * _ranks_per_dim[dk] ),
686 KOKKOS_LAMBDA(
const int idx,
int& update ) {
687 update += current_workload( idx );
693 if ( diff <= last_diff )
697 if ( diff != last_diff )
698 equal_start_point = point_i;
701 if ( point_i == rec_mirror( rank, di ) )
703 rec_mirror( current_rank, di ) = point_i;
714 if ( rec_mirror( current_rank, di ) !=
715 ( point_i - 1 + equal_start_point ) / 2 )
717 rec_mirror( current_rank, di ) =
718 ( point_i - 1 + equal_start_point ) / 2;
721 last_point = point_i - 1;
726 Kokkos::deep_copy( _rectangle_partition_dev, rec_mirror );
737 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
738 Kokkos::HostSpace(), _rectangle_partition_dev );
739 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
740 Kokkos::HostSpace(), _workload_prefix_sum );
752 template <
typename PartitionViewHost,
typename WorkloadViewHost>
754 WorkloadViewHost& prefix_sum_view )
757 compute_sub_workload_host( rec_view, prefix_sum_view );
760 Kokkos::Array<int, num_space_dim> cart_rank;
762 MPI_Comm_rank( cart_comm, &linear_rank );
767 int workload_current_rank = compute_sub_workload_host(
768 0, rec_view( cart_rank[0], 0 ), rec_view( cart_rank[0] + 1, 0 ), 1,
769 cart_rank[1], 2, cart_rank[2] );
771 return workload_current_rank;
780 auto prefix_sum_view = Kokkos::create_mirror_view_and_copy(
781 Kokkos::HostSpace(), _workload_prefix_sum );
791 template <
typename WorkloadViewHost>
795 return prefix_sum_view( prefix_sum_view.extent( 0 ) - 1,
796 prefix_sum_view.extent( 1 ) - 1,
797 prefix_sum_view.extent( 2 ) - 1 ) /
798 ( _ranks_per_dim[0] * _ranks_per_dim[1] * _ranks_per_dim[2] );
808 auto rec_mirror = Kokkos::create_mirror_view_and_copy(
809 Kokkos::HostSpace(), _rectangle_partition_dev );
810 auto prefix_sum_mirror = Kokkos::create_mirror_view_and_copy(
811 Kokkos::HostSpace(), _workload_prefix_sum );
813 int workload_current_rank =
817 return static_cast<float>( workload_current_rank ) /
818 static_cast<float>( workload_ave_rank );
825 template <
typename PartitionView,
typename WorkloadView>
844 KOKKOS_INLINE_FUNCTION
int operator()(
int dim_i,
int i_start,
845 int i_end,
int dim_j,
int j,
846 int dim_k,
int k )
const
853 start[dim_i] = i_start;
880 int _workload_threshold;
882 int _num_step_rebalance;
884 int _max_optimize_iteration;
896 Kokkos::Array<int, num_space_dim> _ranks_per_dim;
898 void allocate(
const std::array<int, num_space_dim>& global_cells_per_dim )
901 Kokkos::view_alloc( Kokkos::WithoutInitializing,
902 "workload_per_tile" ),
908 Kokkos::view_alloc( Kokkos::WithoutInitializing,
909 "workload_prefix_sum" ),
Multi-node grid partitioner.
KOKKOS_INLINE_FUNCTION constexpr Integer bitCount(Integer input_int) noexcept
(Host/Device) Compute the least number of bits needed to index the input integer
Definition Cabana_Grid_SparseIndexSpace.hpp:80
Block partitioner base class.
Definition Cabana_Grid_Partitioner.hpp:37
void resetWorkload()
set all elements in _workload_per_tile and _workload_prefix_sum matrix to 0
Definition Cabana_Grid_SparseDimPartitioner.hpp:342
int currentRankWorkload(MPI_Comm cart_comm, PartitionViewHost &rec_view, WorkloadViewHost &prefix_sum_view)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:753
int optimizePartition(const SparseMapType &sparseMap, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:553
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - automatically compute ranks_per_dim from MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:95
std::array< std::vector< int >, num_space_dim > getCurrentPartition()
Get the current partition. Copy partition from the device view to host std::array<vector>
Definition Cabana_Grid_SparseDimPartitioner.hpp:322
Kokkos::View< int ***, memory_space > workload_view
Workload device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:63
void computeFullPrefixSum(MPI_Comm comm)
1. Reduce the total workload in all MPI ranks; 2. compute the workload prefix sum matrix for all MPI ranks.
Definition Cabana_Grid_SparseDimPartitioner.hpp:421
void initializeRecPartition(std::vector< int > &rec_partition_i, std::vector< int > &rec_partition_j, std::vector< int > &rec_partition_k)
Initialize the tile partition; the partition in each dimension has the form [0, p_1, ..., p_n].
Definition Cabana_Grid_SparseDimPartitioner.hpp:292
void ownedTileInfo(MPI_Comm cart_comm, std::array< int, num_space_dim > &owned_num_tile, std::array< int, num_space_dim > &global_tile_offset) const
Get the owned number of tiles and the global tile offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:239
static constexpr std::size_t num_space_dim
dimension
Definition Cabana_Grid_SparseDimPartitioner.hpp:47
int averageRankWorkload()
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:778
Kokkos::View< int *[num_space_dim], typename execution_space::array_layout, Kokkos::HostSpace > partition_view_host
Partition host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:71
float computeImbalanceFactor(MPI_Comm cart_comm)
compute the imbalance factor for the current partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:806
static constexpr unsigned long long cell_num_per_tile_dim
Definition Cabana_Grid_SparseDimPartitioner.hpp:80
typename memory_space::execution_space execution_space
Default execution space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:60
int averageRankWorkload(WorkloadViewHost &prefix_sum_view)
compute the average workload on each MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:792
void computeLocalWorkLoad(const SparseMapType &sparseMap)
compute the workload in the current MPI rank from sparseMap (the workload of a tile is 1 if the tile is valid in the map).
Definition Cabana_Grid_SparseDimPartitioner.hpp:396
int currentRankWorkload(MPI_Comm cart_comm)
compute the total workload on the current MPI rank
Definition Cabana_Grid_SparseDimPartitioner.hpp:735
void ownedCellInfo(MPI_Comm cart_comm, const std::array< int, num_space_dim > &, std::array< int, num_space_dim > &owned_num_cell, std::array< int, num_space_dim > &global_cell_offset) const override
Get the owned number of cells and the global cell offset of the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:270
void optimizePartition(bool &is_changed, int iter_seed)
optimize the partition in three dimensions separately
Definition Cabana_Grid_SparseDimPartitioner.hpp:591
typename MemorySpace::memory_space memory_space
Memory space.
Definition Cabana_Grid_SparseDimPartitioner.hpp:52
void computeLocalWorkLoad(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx)
compute the workload in the current MPI rank from particle positions (each particle counts for 1 workload unit).
Definition Cabana_Grid_SparseDimPartitioner.hpp:358
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm, const std::array< int, num_space_dim > &) const override
Get the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:172
std::array< int, num_space_dim > ownedTilesPerDimension(MPI_Comm cart_comm) const
Get the tile number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:194
Kokkos::View< int ***, typename execution_space::array_layout, Kokkos::HostSpace > workload_view_host
Workload host view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:67
std::array< int, num_space_dim > ownedCellsPerDimension(MPI_Comm cart_comm) const
Get the cell number in each dimension owned by the current MPI rank.
Definition Cabana_Grid_SparseDimPartitioner.hpp:218
SparseDimPartitioner(MPI_Comm comm, float max_workload_coeff, int workload_num, int num_step_rebalance, const std::array< int, num_space_dim > &ranks_per_dim, const std::array< int, num_space_dim > &global_cells_per_dim, int max_optimize_iteration=10)
Constructor - user-defined ranks_per_dim communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:124
Kokkos::View< int *[num_space_dim], memory_space > partition_view
Partition device view.
Definition Cabana_Grid_SparseDimPartitioner.hpp:65
static constexpr unsigned long long cell_bits_per_tile_dim
Number of bits (per dimension) needed to index the cells inside a tile.
Definition Cabana_Grid_SparseDimPartitioner.hpp:76
std::array< int, num_space_dim > ranksPerDimension(MPI_Comm comm)
Compute the number of MPI ranks in each dimension of the grid from the given MPI communicator.
Definition Cabana_Grid_SparseDimPartitioner.hpp:150
int optimizePartition(const ParticlePosViewType &view, int particle_num, const ArrayType &global_lower_corner, const CellUnit dx, MPI_Comm comm)
iteratively optimize the partition
Definition Cabana_Grid_SparseDimPartitioner.hpp:510
Core: particle data structures and algorithms.
Definition Cabana_AoSoA.hpp:36
functor to compute the sub workload in a given region (from the prefix sum)
Definition Cabana_Grid_SparseDimPartitioner.hpp:827
PartitionView rec_partition
Rectilinear partition.
Definition Cabana_Grid_SparseDimPartitioner.hpp:829
KOKKOS_INLINE_FUNCTION int operator()(int dim_i, int i_start, int i_end, int dim_j, int j, int dim_k, int k) const
Definition Cabana_Grid_SparseDimPartitioner.hpp:844
SubWorkloadFunctor(PartitionView rec_par, WorkloadView pre_sum)
Constructor.
Definition Cabana_Grid_SparseDimPartitioner.hpp:834
WorkloadView workload_prefix_sum
Workload prefix sum matrix.
Definition Cabana_Grid_SparseDimPartitioner.hpp:831