Cabana_CommunicationPlan.hpp
/****************************************************************************
 * Copyright (c) 2018-2023 by the Cabana authors                           *
 * All rights reserved.                                                    *
 *                                                                          *
 * This file is part of the Cabana library. Cabana is distributed under a  *
 * BSD 3-clause license. For the licensing terms see the LICENSE file in   *
 * the top-level directory.                                                 *
 *                                                                          *
 * SPDX-License-Identifier: BSD-3-Clause                                    *
 ****************************************************************************/

#ifndef CABANA_COMMUNICATIONPLAN_HPP
#define CABANA_COMMUNICATIONPLAN_HPP

#include <Cabana_Utils.hpp>

#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

#include <mpi.h>

#include <algorithm>
#include <exception>
#include <memory>
#include <numeric>
#include <type_traits>
#include <vector>

namespace Cabana
{
namespace Impl
{
//---------------------------------------------------------------------------//
// Count sends and create steering algorithm tags.
struct CountSendsAndCreateSteeringDuplicated
{
};
struct CountSendsAndCreateSteeringAtomic
{
};

//---------------------------------------------------------------------------//
// Count sends and create steering algorithm selector.
template <class ExecutionSpace>
struct CountSendsAndCreateSteeringAlgorithm;

// Device backends (CUDA, HIP, SYCL, OpenMPTarget) use atomics.
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct CountSendsAndCreateSteeringAlgorithm<Kokkos::Cuda>
{
    using type = CountSendsAndCreateSteeringAtomic;
};
#endif // end KOKKOS_ENABLE_CUDA
#ifdef KOKKOS_ENABLE_HIP
template <>
struct CountSendsAndCreateSteeringAlgorithm<Kokkos::Experimental::HIP>
{
    using type = CountSendsAndCreateSteeringAtomic;
};
#endif // end KOKKOS_ENABLE_HIP
#ifdef KOKKOS_ENABLE_SYCL
template <>
struct CountSendsAndCreateSteeringAlgorithm<Kokkos::Experimental::SYCL>
{
    using type = CountSendsAndCreateSteeringAtomic;
};
#endif // end KOKKOS_ENABLE_SYCL
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <>
struct CountSendsAndCreateSteeringAlgorithm<Kokkos::Experimental::OpenMPTarget>
{
    using type = CountSendsAndCreateSteeringAtomic;
};
#endif // end KOKKOS_ENABLE_OPENMPTARGET

// The default is to use duplication.
template <class ExecutionSpace>
struct CountSendsAndCreateSteeringAlgorithm
{
    using type = CountSendsAndCreateSteeringDuplicated;
};

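//---------------------------------------------------------------------------//
// Illustrative sketch (not part of the library): the selector above enables
// tag dispatch to the two overloads below. On a host backend the primary
// template applies, while (for example) with Kokkos::Cuda the atomic
// specialization is chosen.
//
//   using algo = typename Impl::CountSendsAndCreateSteeringAlgorithm<
//       Kokkos::Serial>::type;
//   static_assert(
//       std::is_same<algo,
//                    Impl::CountSendsAndCreateSteeringDuplicated>::value,
//       "host backends default to the duplicated algorithm" );
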
//---------------------------------------------------------------------------//
// Count sends and generate the steering vector. Atomic version.
template <class ExecutionSpace, class ExportRankView>
auto countSendsAndCreateSteering( ExecutionSpace,
                                  const ExportRankView element_export_ranks,
                                  const int comm_size,
                                  CountSendsAndCreateSteeringAtomic )
    -> std::pair<Kokkos::View<int*, typename ExportRankView::memory_space>,
                 Kokkos::View<typename ExportRankView::size_type*,
                              typename ExportRankView::memory_space>>
{
    using memory_space = typename ExportRankView::memory_space;
    using size_type = typename ExportRankView::size_type;

    // Create views.
    Kokkos::View<int*, memory_space> neighbor_counts( "neighbor_counts",
                                                      comm_size );
    Kokkos::View<size_type*, memory_space> neighbor_ids(
        Kokkos::ViewAllocateWithoutInitializing( "neighbor_ids" ),
        element_export_ranks.size() );

    // Count the sends and create the steering vector.

    // For smaller values of comm_size, use the optimized scratch memory path.
    // The threshold of 64 is somewhat arbitrary, but it should be at least 27
    // to cover 3d halos.
    if ( comm_size <= 64 )
    {
        constexpr int team_size = 256;
        Kokkos::TeamPolicy<ExecutionSpace> team_policy(
            ( element_export_ranks.size() + team_size - 1 ) / team_size,
            team_size );
        team_policy = team_policy.set_scratch_size(
            0,
            Kokkos::PerTeam( sizeof( int ) * ( team_size + 2 * comm_size ) ) );
        Kokkos::parallel_for(
            "Cabana::CommunicationPlan::countSendsAndCreateSteeringShared",
            team_policy,
            KOKKOS_LAMBDA(
                const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type&
                    team ) {
                // NOTE: `get_shmem` returns shared memory pointers *aligned
                // to 8 bytes*, so if `comm_size` is odd we can get
                // erroneously padded offsets if we call `get_shmem`
                // separately for each shared memory intermediary. Acquiring
                // all the needed scratch memory at once and then computing
                // pointer offsets by hand avoids this issue.
                int* scratch = (int*)team.team_shmem().get_shmem(
                    ( team.team_size() + 2 * comm_size ) * sizeof( int ), 0 );

                // Local neighbor ids give the local offset relative to the
                // calculated global offset. Size team.team_size() *
                // sizeof(int).
                int* local_neighbor_ids = scratch;
                // Local histogram, `comm_size` in size.
                int* histo = local_neighbor_ids + team.team_size();
                // Offset into the global array, `comm_size` in size.
                int* global_offset = histo + comm_size;
                // Overall element id `tid`.
                const auto tid =
                    team.team_rank() + team.league_rank() * team.team_size();
                // Local element id.
                const int local_id = team.team_rank();
                // Total number of elements, for convenience.
                const int num_elements = element_export_ranks.size();
                // My element export rank.
                const int my_element_export_rank =
                    ( tid < num_elements ? element_export_ranks( tid ) : -1 );

                // We cannot simply return early for out-of-bounds threads
                // because all threads share the team-wide work below; instead
                // flag whether this thread holds a valid element.
                const bool in_bounds =
                    tid < num_elements && my_element_export_rank >= 0;

                Kokkos::parallel_for(
                    Kokkos::TeamThreadRange( team, comm_size ),
                    [&]( const int i ) { histo[i] = 0; } );

                // Synchronize zeroing.
                team.team_barrier();

                // Build the local histogram. The num_elements check is
                // encoded in in_bounds.
                if ( in_bounds )
                {
                    // Shared memory atomic add, accumulate into the local
                    // offset.
                    local_neighbor_ids[local_id] = Kokkos::atomic_fetch_add(
                        &histo[my_element_export_rank], 1 );
                }

                // Synchronize the local histogram build.
                team.team_barrier();

                // Reserve space in the global array via a loop over the
                // neighbor counts.
                Kokkos::parallel_for(
                    Kokkos::TeamThreadRange( team, comm_size ),
                    [&]( const int i )
                    {
                        // Global memory atomic add, reserves space.
                        global_offset[i] = Kokkos::atomic_fetch_add(
                            &neighbor_counts( i ), histo[i] );
                    } );

                // Synchronize the block-stride loop.
                team.team_barrier();

                // Now store to my location.
                if ( in_bounds )
                {
                    neighbor_ids( tid ) =
                        global_offset[my_element_export_rank] +
                        local_neighbor_ids[local_id];
                }
            } );
    }
    else
    {
        // For larger numbers of export ranks (e.g., migration) we use the
        // global memory version, though this is a point of future
        // optimization.
        Kokkos::parallel_for(
            "Cabana::CommunicationPlan::countSendsAndCreateSteering",
            Kokkos::RangePolicy<ExecutionSpace>( 0,
                                                 element_export_ranks.size() ),
            KOKKOS_LAMBDA( const size_type i ) {
                if ( element_export_ranks( i ) >= 0 )
                    neighbor_ids( i ) = Kokkos::atomic_fetch_add(
                        &neighbor_counts( element_export_ranks( i ) ), 1 );
            } );
    }
    Kokkos::fence();

    // Return the counts and ids.
    return std::make_pair( neighbor_counts, neighbor_ids );
}
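
// Worked example (sketch, not part of the library): with comm_size = 3 and
// element_export_ranks = {2, 0, 2, -1, 2}, the function above returns
// neighbor_counts = {1, 0, 3} (a negative rank means "do not send", so
// element 3 is skipped) and neighbor_ids holding each element's slot within
// its destination rank's group, e.g. {0, 0, 1, <uninitialized>, 2}. The
// order among elements bound for the same rank is nondeterministic because
// it is decided by atomics, and the entry for a skipped element is left
// uninitialized.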

//---------------------------------------------------------------------------//
// Count sends and generate the steering vector. Duplicated version.
template <class ExecutionSpace, class ExportRankView>
auto countSendsAndCreateSteering( ExecutionSpace,
                                  const ExportRankView element_export_ranks,
                                  const int comm_size,
                                  CountSendsAndCreateSteeringDuplicated )
    -> std::pair<Kokkos::View<int*, typename ExportRankView::memory_space>,
                 Kokkos::View<typename ExportRankView::size_type*,
                              typename ExportRankView::memory_space>>
{
    using memory_space = typename ExportRankView::memory_space;
    using size_type = typename ExportRankView::size_type;

    // Create a unique thread token.
    Kokkos::Experimental::UniqueToken<
        ExecutionSpace, Kokkos::Experimental::UniqueTokenScope::Global>
        unique_token;

    // Create views.
    Kokkos::View<int*, memory_space> neighbor_counts(
        Kokkos::ViewAllocateWithoutInitializing( "neighbor_counts" ),
        comm_size );
    Kokkos::View<size_type*, memory_space> neighbor_ids(
        Kokkos::ViewAllocateWithoutInitializing( "neighbor_ids" ),
        element_export_ranks.size() );
    Kokkos::View<int**, memory_space> neighbor_counts_dup(
        "neighbor_counts", unique_token.size(), comm_size );
    Kokkos::View<size_type**, memory_space> neighbor_ids_dup(
        "neighbor_ids", unique_token.size(), element_export_ranks.size() );

    // Compute initial duplicated sends and steering.
    Kokkos::parallel_for(
        "Cabana::CommunicationPlan::initialCount",
        Kokkos::RangePolicy<ExecutionSpace>( 0, element_export_ranks.size() ),
        KOKKOS_LAMBDA( const size_type i ) {
            if ( element_export_ranks( i ) >= 0 )
            {
                // Get the thread id.
                auto thread_id = unique_token.acquire();

                // Do the duplicated fetch-add: increment the send count for
                // this rank and store the incremented count as the
                // thread-local neighbor id. Because we use the prefix
                // increment, the stored id is too big by one, but this
                // guarantees a non-zero value, which we later use to find
                // the thread on which this element was located. We subtract
                // the extra one at the end.
                neighbor_ids_dup( thread_id, i ) = ++neighbor_counts_dup(
                    thread_id, element_export_ranks( i ) );

                // Release the thread id.
                unique_token.release( thread_id );
            }
        } );
    Kokkos::fence();

    // Team policy.
    using team_policy =
        Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>>;
    using index_type = typename team_policy::index_type;

    // Compute the send counts for each neighbor rank by reducing across the
    // thread duplicates.
    Kokkos::parallel_for(
        "Cabana::CommunicationPlan::finalCount",
        team_policy( neighbor_counts.extent( 0 ), Kokkos::AUTO ),
        KOKKOS_LAMBDA( const typename team_policy::member_type& team ) {
            // Get the neighbor rank id.
            auto i = team.league_rank();

            // Add the thread results.
            int thread_counts = 0;
            Kokkos::parallel_reduce(
                Kokkos::TeamThreadRange( team,
                                         neighbor_counts_dup.extent( 0 ) ),
                [&]( const index_type thread_id, int& result )
                { result += neighbor_counts_dup( thread_id, i ); },
                thread_counts );
            neighbor_counts( i ) = thread_counts;
        } );
    Kokkos::fence();

    // Compute the location of each export element in the send buffer of its
    // destination rank.
    Kokkos::parallel_for(
        "Cabana::CommunicationPlan::createSteering",
        team_policy( element_export_ranks.size(), Kokkos::AUTO ),
        KOKKOS_LAMBDA( const typename team_policy::member_type& team ) {
            // Get the element id.
            auto i = team.league_rank();

            // Only operate on valid elements.
            if ( element_export_ranks( i ) >= 0 )
            {
                // Compute the id of the thread on which we located the
                // element during the count phase. Only that thread has a
                // non-zero entry for this element, so only it contributes to
                // the reduction.
                index_type dup_thread = 0;
                Kokkos::parallel_reduce(
                    Kokkos::TeamThreadRange( team,
                                             neighbor_ids_dup.extent( 0 ) ),
                    [&]( const index_type thread_id, index_type& result )
                    {
                        if ( neighbor_ids_dup( thread_id, i ) > 0 )
                            result += thread_id;
                    },
                    dup_thread );

                // Compute the offset of this element in the steering vector
                // for its destination rank. Loop through the threads up to
                // the thread that found this element in the count stage. All
                // thread counts prior to that thread contribute to the
                // offset.
                size_type thread_offset = 0;
                Kokkos::parallel_reduce(
                    Kokkos::TeamThreadRange( team, dup_thread ),
                    [&]( const index_type thread_id, size_type& result ) {
                        result += neighbor_counts_dup(
                            thread_id, element_export_ranks( i ) );
                    },
                    thread_offset );

                // Add the thread-local value to the offset, subtracting the
                // one that was added artificially during the count phase.
                neighbor_ids( i ) =
                    thread_offset + neighbor_ids_dup( dup_thread, i ) - 1;
            }
        } );
    Kokkos::fence();

    // Return the counts and ids.
    return std::make_pair( neighbor_counts, neighbor_ids );
}
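
// Worked example (sketch, not part of the library): suppose threads with
// ids A = 0 and B = 1 counted exports to rank 2, thread A handling elements
// {0, 4} and thread B handling element {7}. After the count phase,
// neighbor_counts_dup(A, 2) = 2, neighbor_counts_dup(B, 2) = 1, and
// neighbor_ids_dup(B, 7) = 1 (one past its slot, guaranteed non-zero). The
// steering phase finds element 7 on thread B, sums the rank-2 counts of all
// lower-id threads (2 from thread A), and sets
// neighbor_ids(7) = 2 + 1 - 1 = 2: element 7 lands after thread A's two
// elements in rank 2's send group.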

//---------------------------------------------------------------------------//
} // end namespace Impl

//---------------------------------------------------------------------------//
//! Return unique neighbor ranks, with the current rank first.
inline std::vector<int> getUniqueTopology( MPI_Comm comm,
                                           std::vector<int> topology )
{
    auto remove_end = std::remove( topology.begin(), topology.end(), -1 );
    std::sort( topology.begin(), remove_end );
    auto unique_end = std::unique( topology.begin(), remove_end );
    topology.resize( std::distance( topology.begin(), unique_end ) );

    // Put this rank first.
    int my_rank = -1;
    MPI_Comm_rank( comm, &my_rank );
    for ( auto& n : topology )
    {
        if ( n == my_rank )
        {
            std::swap( n, topology[0] );
            break;
        }
    }
    return topology;
}

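// Worked example (sketch, not part of the library): on MPI rank 1, an input
// topology of {3, -1, 0, 3, 1} is stripped of the -1 "no neighbor" entries,
// sorted, and deduplicated to {0, 1, 3}; rank 1 is then swapped to the
// front, returning {1, 0, 3}.
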
//---------------------------------------------------------------------------//
/*!
  \brief Communication plan base class.

  The communication plan computes and stores how this rank will communicate
  with its neighbors; it is the building block for communication patterns
  such as the Halo and the Distributor.
*/
template <class MemorySpace>
class CommunicationPlan
{
  public:
    // FIXME: extracting the self type for backwards compatibility with the
    // previous template on DeviceType. Should simply be MemorySpace after
    // the next release.
    //! Memory space.
    using memory_space = typename MemorySpace::memory_space;
    // FIXME: replace the warning with a memory space assert after the next
    // release.
    static_assert( Impl::deprecated( Kokkos::is_device<MemorySpace>() ) );

    //! Device type (deprecated).
    using device_type [[deprecated]] = typename memory_space::device_type;

    //! Default execution space.
    using execution_space = typename memory_space::execution_space;

    // FIXME: extracting the self type for backwards compatibility with the
    // previous template on DeviceType. Should simply be
    // memory_space::size_type after the next release.
    //! Size type.
    using size_type = typename memory_space::memory_space::size_type;

    /*!
      \brief Constructor.
      \param comm The MPI communicator over which the plan is defined. It is
      duplicated internally.
    */
    CommunicationPlan( MPI_Comm comm )
    {
        _comm_ptr.reset(
            // Duplicate the communicator and store it in a std::shared_ptr
            // so that all copies point to the same object.
            [comm]()
            {
                auto p = std::make_unique<MPI_Comm>();
                MPI_Comm_dup( comm, p.get() );
                return p.release();
            }(),
            // Custom deleter to mark the communicator for deallocation.
            []( MPI_Comm* p )
            {
                MPI_Comm_free( p );
                delete p;
            } );
    }

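    // Usage sketch (not part of the library): because the constructor
    // duplicates the communicator, the plan's MPI traffic cannot collide
    // with messages posted on the caller's communicator.
    //
    //   Cabana::CommunicationPlan<Kokkos::HostSpace> plan( MPI_COMM_WORLD );
    //   MPI_Comm plan_comm = plan.comm(); // the internal duplicate
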
    //! Get the MPI communicator.
    MPI_Comm comm() const { return *_comm_ptr; }

    //! Get the number of neighbor ranks that this rank will communicate
    //! with.
    int numNeighbor() const { return _neighbors.size(); }

    //! Given a local neighbor id get its rank in the MPI communicator.
    int neighborRank( const int neighbor ) const
    {
        return _neighbors[neighbor];
    }

    //! Get the number of elements this rank will export to a given neighbor.
    std::size_t numExport( const int neighbor ) const
    {
        return _num_export[neighbor];
    }

    //! Get the total number of exports this rank will do.
    std::size_t totalNumExport() const { return _total_num_export; }

    //! Get the number of elements this rank will import from a given
    //! neighbor.
    std::size_t numImport( const int neighbor ) const
    {
        return _num_import[neighbor];
    }

    //! Get the total number of imports this rank will do.
    std::size_t totalNumImport() const { return _total_num_import; }

    //! Get the number of export elements.
    std::size_t exportSize() const { return _num_export_element; }

    //! Get the steering vector for the exports.
    Kokkos::View<std::size_t*, memory_space> getExportSteering() const
    {
        return _export_steering;
    }

    // The functions in the public block below would normally be protected,
    // but we make them public to allow using private class data in CUDA
    // kernels with lambda functions.
  public:
    /*!
      \brief Neighbor and export rank creator. Use this when you already know
      which ranks neighbor each other (i.e. every rank already knows who it
      will send to and receive from).

      \param exec_space The execution space in which to operate.
      \param element_export_ranks The destination rank for each local
      element. A negative rank means the element is not sent.
      \param neighbor_ranks The ranks this rank will communicate with.
      \return For each export element, its slot among all elements bound for
      the same destination rank.
    */
    template <class ExecutionSpace, class ViewType>
    Kokkos::View<size_type*, memory_space>
    createFromExportsAndTopology( ExecutionSpace exec_space,
                                  const ViewType& element_export_ranks,
                                  const std::vector<int>& neighbor_ranks )
    {
        // Store the number of export elements.
        _num_export_element = element_export_ranks.size();

        // Store the unique neighbors (this rank first).
        _neighbors = getUniqueTopology( comm(), neighbor_ranks );
        int num_n = _neighbors.size();

        // Get the size of this communicator.
        int comm_size = -1;
        MPI_Comm_size( comm(), &comm_size );

        // Get the MPI rank we are currently on.
        int my_rank = -1;
        MPI_Comm_rank( comm(), &my_rank );

        // Pick an MPI tag for the communication. This object has its own
        // communication space so any tag will do.
        const int mpi_tag = 1221;

        // Initialize the import/export sizes.
        _num_export.assign( num_n, 0 );
        _num_import.assign( num_n, 0 );

        // Count the number of sends this rank will do to other ranks. Keep
        // track of which slot we get in our neighbor's send buffer.
        auto counts_and_ids = Impl::countSendsAndCreateSteering(
            exec_space, element_export_ranks, comm_size,
            typename Impl::CountSendsAndCreateSteeringAlgorithm<
                ExecutionSpace>::type() );

        // Copy the counts to the host.
        auto neighbor_counts_host = Kokkos::create_mirror_view_and_copy(
            Kokkos::HostSpace(), counts_and_ids.first );

        // Get the export counts.
        for ( int n = 0; n < num_n; ++n )
            _num_export[n] = neighbor_counts_host( _neighbors[n] );

        // Post receives for the number of imports we will get.
        std::vector<MPI_Request> requests;
        requests.reserve( num_n );
        for ( int n = 0; n < num_n; ++n )
            if ( my_rank != _neighbors[n] )
            {
                requests.push_back( MPI_Request() );
                MPI_Irecv( &_num_import[n], 1, MPI_UNSIGNED_LONG,
                           _neighbors[n], mpi_tag, comm(),
                           &( requests.back() ) );
            }
            else
                _num_import[n] = _num_export[n];

        // Send the number of exports to each of our neighbors.
        for ( int n = 0; n < num_n; ++n )
            if ( my_rank != _neighbors[n] )
                MPI_Send( &_num_export[n], 1, MPI_UNSIGNED_LONG, _neighbors[n],
                          mpi_tag, comm() );

        // Wait on the receives.
        std::vector<MPI_Status> status( requests.size() );
        const int ec =
            MPI_Waitall( requests.size(), requests.data(), status.data() );
        if ( MPI_SUCCESS != ec )
            throw std::logic_error(
                "Cabana::CommunicationPlan::createFromExportsAndTopology: "
                "Failed MPI Communication" );

        // Get the total number of imports/exports.
        _total_num_export = std::accumulate(
            _num_export.begin(), _num_export.end(), std::size_t{ 0u } );
        _total_num_import = std::accumulate(
            _num_import.begin(), _num_import.end(), std::size_t{ 0u } );

        // Barrier before continuing to ensure synchronization.
        MPI_Barrier( comm() );

        // Return the neighbor ids.
        return counts_and_ids.second;
    }

    /*!
      \brief Neighbor and export rank creator using the default execution
      space. Use this when you already know which ranks neighbor each other.
    */
    template <class ViewType>
    Kokkos::View<size_type*, memory_space>
    createFromExportsAndTopology( const ViewType& element_export_ranks,
                                  const std::vector<int>& neighbor_ranks )
    {
        // Use the default execution space.
        return createFromExportsAndTopology(
            execution_space{}, element_export_ranks, neighbor_ranks );
    }

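    // Usage sketch (not part of the library; the names below are
    // hypothetical). Each entry of export_ranks names the destination rank
    // of the corresponding local element (-1 means "do not send") and
    // topology lists every rank we may communicate with:
    //
    //   Kokkos::View<int*, memory_space> export_ranks( "export_ranks", n );
    //   // ... fill export_ranks ...
    //   std::vector<int> topology = { my_rank, left_rank, right_rank };
    //   auto neighbor_ids =
    //       plan.createFromExportsAndTopology( export_ranks, topology );
    //   // neighbor_ids( i ) is element i's slot within the group of
    //   // elements headed to the same destination rank.
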
    /*!
      \brief Export rank creator. Use this when you do not know who you will
      receive from - only who you are sending to.

      \param exec_space The execution space in which to operate.
      \param element_export_ranks The destination rank for each local
      element. A negative rank means the element is not sent.
      \return For each export element, its slot among all elements bound for
      the same destination rank.
    */
    template <class ExecutionSpace, class ViewType>
    Kokkos::View<size_type*, memory_space>
    createFromExportsOnly( ExecutionSpace exec_space,
                           const ViewType& element_export_ranks )
    {
        // Store the number of export elements.
        _num_export_element = element_export_ranks.size();

        // Get the size of this communicator.
        int comm_size = -1;
        MPI_Comm_size( comm(), &comm_size );

        // Get the MPI rank we are currently on.
        int my_rank = -1;
        MPI_Comm_rank( comm(), &my_rank );

        // Pick an MPI tag for the communication. This object has its own
        // communication space so any tag will do.
        const int mpi_tag = 1221;

        // Count the number of sends this rank will do to other ranks. Keep
        // track of which slot we get in our neighbor's send buffer.
        auto counts_and_ids = Impl::countSendsAndCreateSteering(
            exec_space, element_export_ranks, comm_size,
            typename Impl::CountSendsAndCreateSteeringAlgorithm<
                ExecutionSpace>::type() );

        // Copy the counts to the host.
        auto neighbor_counts_host = Kokkos::create_mirror_view_and_copy(
            Kokkos::HostSpace(), counts_and_ids.first );

        // Extract the export ranks and the number of exports, and then flag
        // the send ranks.
        _neighbors.clear();
        _num_export.clear();
        _total_num_export = 0;
        for ( int r = 0; r < comm_size; ++r )
            if ( neighbor_counts_host( r ) > 0 )
            {
                _neighbors.push_back( r );
                _num_export.push_back( neighbor_counts_host( r ) );
                _total_num_export += neighbor_counts_host( r );
                neighbor_counts_host( r ) = 1;
            }

        // Get the number of export ranks and initially allocate the import
        // sizes.
        int num_export_rank = _neighbors.size();
        _num_import.assign( num_export_rank, 0 );

        // If we are sending to ourselves, put that entry first in the
        // neighbor list and assign the number of imports to be the number of
        // exports.
        bool self_send = false;
        for ( int n = 0; n < num_export_rank; ++n )
            if ( _neighbors[n] == my_rank )
            {
                std::swap( _neighbors[n], _neighbors[0] );
                std::swap( _num_export[n], _num_export[0] );
                _num_import[0] = _num_export[0];
                self_send = true;
                break;
            }

        // Determine how many ranks this rank will import from: the 0/1 flags
        // set above are summed across all ranks with a reduce-scatter.
        int num_import_rank = -1;
        std::vector<int> recv_counts( comm_size, 1 );
        MPI_Reduce_scatter( neighbor_counts_host.data(), &num_import_rank,
                            recv_counts.data(), MPI_INT, MPI_SUM, comm() );
        if ( self_send )
            --num_import_rank;

        // Post the expected number of receives and indicate that they might
        // come from any rank.
        std::vector<std::size_t> import_sizes( num_import_rank );
        std::vector<MPI_Request> requests( num_import_rank );
        for ( int n = 0; n < num_import_rank; ++n )
            MPI_Irecv( &import_sizes[n], 1, MPI_UNSIGNED_LONG, MPI_ANY_SOURCE,
                       mpi_tag, comm(), &requests[n] );

        // Do blocking sends. Don't do any self sends.
        int self_offset = ( self_send ) ? 1 : 0;
        for ( int n = self_offset; n < num_export_rank; ++n )
            MPI_Send( &_num_export[n], 1, MPI_UNSIGNED_LONG, _neighbors[n],
                      mpi_tag, comm() );

        // Wait on the non-blocking receives.
        std::vector<MPI_Status> status( requests.size() );
        const int ec =
            MPI_Waitall( requests.size(), requests.data(), status.data() );
        if ( MPI_SUCCESS != ec )
            throw std::logic_error(
                "Cabana::CommunicationPlan::createFromExportsOnly: Failed MPI "
                "Communication" );

        // Compute the total number of imports.
        _total_num_import =
            std::accumulate( import_sizes.begin(), import_sizes.end(),
                             ( self_send ) ? _num_import[0] : 0 );

        // Extract the imports. If we did self sends we already know what we
        // imported from ourselves.
        for ( int i = 0; i < num_import_rank; ++i )
        {
            // Get the message source.
            const auto source = status[i].MPI_SOURCE;

            // See if the neighbor we received from was someone we also sent
            // to.
            auto found_neighbor =
                std::find( _neighbors.begin(), _neighbors.end(), source );

            // If this is a new neighbor (i.e. someone we didn't send
            // anything to), record it.
            if ( found_neighbor == std::end( _neighbors ) )
            {
                _neighbors.push_back( source );
                _num_import.push_back( import_sizes[i] );
                _num_export.push_back( 0 );
            }

            // Otherwise, we already sent something to this neighbor and
            // therefore already have a neighbor/export entry. Just assign
            // the import entry for that neighbor.
            else
            {
                auto n = std::distance( _neighbors.begin(), found_neighbor );
                _num_import[n] = import_sizes[i];
            }
        }

        // Barrier before continuing to ensure synchronization.
        MPI_Barrier( comm() );

        // Return the neighbor ids.
        return counts_and_ids.second;
    }

    /*!
      \brief Export rank creator using the default execution space. Use this
      when you do not know who you will receive from - only who you are
      sending to.
    */
    template <class ViewType>
    Kokkos::View<size_type*, memory_space>
    createFromExportsOnly( const ViewType& element_export_ranks )
    {
        // Use the default execution space.
        return createFromExportsOnly( execution_space{},
                                      element_export_ranks );
    }

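    // Usage sketch (not part of the library): same as the topology version
    // but without a neighbor list - the plan discovers its importers. For
    // example, with four ranks where rank 0 sends only to {1, 3}, every rank
    // contributes a 0/1 flag per destination; the MPI_Reduce_scatter above
    // sums the flags so that, e.g., rank 3 learns how many ranks will send
    // to it without knowing which ones, and the MPI_ANY_SOURCE receives then
    // identify the actual sources.
    //
    //   Kokkos::View<int*, memory_space> export_ranks( "export_ranks", n );
    //   // ... fill export_ranks with destination ranks (or -1 to skip) ...
    //   auto neighbor_ids = plan.createFromExportsOnly( export_ranks );
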
    /*!
      \brief Create the export steering vector, packing elements in their
      local id order.
    */
    template <class PackViewType, class RankViewType>
    void createExportSteering( const PackViewType& neighbor_ids,
                               const RankViewType& element_export_ranks )
    {
        // Pass element_export_ranks as a dummy argument for the unused
        // element ids.
        createSteering( true, neighbor_ids, element_export_ranks,
                        element_export_ranks );
    }

    /*!
      \brief Create the export steering vector using explicit local element
      ids.
    */
    template <class PackViewType, class RankViewType, class IdViewType>
    void createExportSteering( const PackViewType& neighbor_ids,
                               const RankViewType& element_export_ranks,
                               const IdViewType& element_export_ids )
    {
        createSteering( false, neighbor_ids, element_export_ranks,
                        element_export_ids );
    }

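    // Sketch (not part of the library): the two overloads differ only in the
    // value written into the steering vector. The two-argument form packs
    // element i from local index i; the three-argument form packs it from
    // element_export_ids( i ), which is useful when the exported elements
    // are a scattered subset of a larger container:
    //
    //   plan.createExportSteering( neighbor_ids, export_ranks );
    //   plan.createExportSteering( neighbor_ids, export_ranks, export_ids );
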
    // Create the export steering vector.
    template <class ExecutionSpace, class PackViewType, class RankViewType,
              class IdViewType>
    void createSteering( ExecutionSpace, const bool use_iota,
                         const PackViewType& neighbor_ids,
                         const RankViewType& element_export_ranks,
                         const IdViewType& element_export_ids )
    {
        if ( !use_iota &&
             ( element_export_ids.size() != element_export_ranks.size() ) )
            throw std::runtime_error(
                "Cabana::CommunicationPlan::createSteering: Export ids and "
                "ranks are different sizes!" );

        // Get the size of this communicator.
        int comm_size = -1;
        MPI_Comm_size( *_comm_ptr, &comm_size );

        // Calculate the steering offsets via an exclusive prefix sum over
        // the export counts.
        int num_n = _neighbors.size();
        std::vector<std::size_t> offsets( num_n, 0 );
        for ( int n = 1; n < num_n; ++n )
            offsets[n] = offsets[n - 1] + _num_export[n - 1];

        // Map the offsets to the device.
        Kokkos::View<std::size_t*, Kokkos::HostSpace> rank_offsets_host(
            Kokkos::ViewAllocateWithoutInitializing( "rank_map" ), comm_size );
        for ( int n = 0; n < num_n; ++n )
            rank_offsets_host( _neighbors[n] ) = offsets[n];
        auto rank_offsets = Kokkos::create_mirror_view_and_copy(
            memory_space(), rank_offsets_host );

        // Create the export steering vector for writing local elements into
        // the send buffer. Note that we create a local, shallow copy - this
        // is a CUDA workaround for handling class private data in a lambda.
        _export_steering = Kokkos::View<std::size_t*, memory_space>(
            Kokkos::ViewAllocateWithoutInitializing( "export_steering" ),
            _total_num_export );
        auto steer_vec = _export_steering;
        Kokkos::parallel_for(
            "Cabana::CommunicationPlan::createSteering",
            Kokkos::RangePolicy<ExecutionSpace>( 0, _num_export_element ),
            KOKKOS_LAMBDA( const int i ) {
                if ( element_export_ranks( i ) >= 0 )
                    steer_vec( rank_offsets( element_export_ranks( i ) ) +
                               neighbor_ids( i ) ) =
                        ( use_iota ) ? i : element_export_ids( i );
            } );
        Kokkos::fence();
    }

    template <class PackViewType, class RankViewType, class IdViewType>
    void createSteering( const bool use_iota, const PackViewType& neighbor_ids,
                         const RankViewType& element_export_ranks,
                         const IdViewType& element_export_ids )
    {
        // Use the default execution space.
        createSteering( execution_space{}, use_iota, neighbor_ids,
                        element_export_ranks, element_export_ids );
    }
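
    // Worked example (sketch, not part of the library): with neighbors
    // {1, 0, 3} and _num_export = {4, 2, 3}, the exclusive prefix sum gives
    // offsets {0, 4, 6}: rank 1's elements occupy steering slots [0, 4),
    // rank 0's slots [4, 6), and rank 3's slots [6, 9). An element bound for
    // rank 3 with neighbor_ids( i ) = 2 is therefore written to slot
    // 6 + 2 = 8 of the steering vector.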

  private:
    std::shared_ptr<MPI_Comm> _comm_ptr;
    std::vector<int> _neighbors;
    std::size_t _total_num_export;
    std::size_t _total_num_import;
    std::vector<std::size_t> _num_export;
    std::vector<std::size_t> _num_import;
    std::size_t _num_export_element;
    Kokkos::View<std::size_t*, memory_space> _export_steering;
};

//---------------------------------------------------------------------------//
//! Store AoSoA send/receive buffers and particle data for communication.
template <class AoSoAType>
struct CommunicationDataAoSoA
{
    static_assert( is_aosoa<AoSoAType>::value, "" );

    //! Particle data type.
    using particle_data_type = AoSoAType;
    //! Kokkos memory space.
    using memory_space = typename particle_data_type::memory_space;
    //! Communication data type.
    using data_type = typename particle_data_type::tuple_type;
    //! Communication buffer type.
    using buffer_type = typename Kokkos::View<data_type*, memory_space>;

    //! Constructor.
    CommunicationDataAoSoA( particle_data_type particles )
        : _particles( particles )
    {
        _send_buffer = buffer_type(
            Kokkos::ViewAllocateWithoutInitializing( "send_buffer" ), 0 );
        _recv_buffer = buffer_type(
            Kokkos::ViewAllocateWithoutInitializing( "recv_buffer" ), 0 );
    }

    //! Resize the send buffer.
    void reallocateSend( const std::size_t num_send )
    {
        Kokkos::realloc( _send_buffer, num_send );
    }

    //! Resize the receive buffer.
    void reallocateReceive( const std::size_t num_recv )
    {
        Kokkos::realloc( _recv_buffer, num_recv );
    }

    //! Send buffer.
    buffer_type _send_buffer;
    //! Receive buffer.
    buffer_type _recv_buffer;
    //! Particle AoSoA.
    particle_data_type _particles;
    //! Slice components.
    std::size_t _num_comp = 0;
};

//! Store slice send/receive buffers and particle data for communication.
template <class SliceType>
struct CommunicationDataSlice
{
    static_assert( is_slice<SliceType>::value, "" );

    //! Particle data type.
    using particle_data_type = SliceType;
    //! Kokkos memory space.
    using memory_space = typename particle_data_type::memory_space;
    //! Communication data type.
    using data_type = typename particle_data_type::value_type;
    //! Communication buffer type.
    using buffer_type =
        typename Kokkos::View<data_type**, Kokkos::LayoutRight, memory_space>;

    //! Constructor.
    CommunicationDataSlice( particle_data_type particles )
        : _particles( particles )
    {
        setSliceComponents();

        _send_buffer = buffer_type(
            Kokkos::ViewAllocateWithoutInitializing( "send_buffer" ), 0, 0 );
        _recv_buffer = buffer_type(
            Kokkos::ViewAllocateWithoutInitializing( "recv_buffer" ), 0, 0 );
    }

    //! Resize the send buffer.
    void reallocateSend( const std::size_t num_send )
    {
        Kokkos::realloc( _send_buffer, num_send, _num_comp );
    }

    //! Resize the receive buffer.
    void reallocateReceive( const std::size_t num_recv )
    {
        Kokkos::realloc( _recv_buffer, num_recv, _num_comp );
    }

    //! Store the total number of components in the slice.
    void setSliceComponents()
    {
        _num_comp = 1;
        for ( std::size_t d = 2; d < _particles.viewRank(); ++d )
            _num_comp *= _particles.extent( d );
    }

    //! Send buffer.
    buffer_type _send_buffer;
    //! Receive buffer.
    buffer_type _recv_buffer;
    //! Particle slice.
    particle_data_type _particles;
    //! Slice components.
    std::size_t _num_comp;
};
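
// Worked example (sketch, not part of the library): for a slice whose
// underlying view has rank 4 with per-particle dimensions 3 x 3 (e.g., a
// tensor member), setSliceComponents() computes
// _num_comp = extent( 2 ) * extent( 3 ) = 9, so the communication buffers
// are reallocated as ( num_elements, 9 ) two-dimensional views.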
//---------------------------------------------------------------------------//

/*!
  \brief Store the communication plan and buffers and perform the
  communication (migrate, gather, scatter).
*/
template <class CommPlanType, class CommDataType>
class CommunicationData
{
  public:
    //! Communication plan type (Halo, Distributor).
    using plan_type = CommPlanType;
    //! Kokkos execution space.
    using execution_space = typename plan_type::execution_space;
    //! Kokkos execution policy.
    using policy_type = Kokkos::RangePolicy<execution_space>;
    //! Communication data type.
    using comm_data_type = CommDataType;
    //! Particle data type.
    using particle_data_type = typename comm_data_type::particle_data_type;
    //! Kokkos memory space.
    using memory_space = typename comm_data_type::memory_space;
    //! Communication data type.
    using data_type = typename comm_data_type::data_type;
    //! Communication buffer type.
    using buffer_type = typename comm_data_type::buffer_type;

    /*!
      \param comm_plan The communication plan.
      \param particles The particle data (either an AoSoA or a slice).
      \param overallocation An optional factor to keep extra allocated space
      in the communication buffers.
    */
    CommunicationData( const CommPlanType& comm_plan,
                       const particle_data_type& particles,
                       const double overallocation = 1.0 )
        : _comm_plan( comm_plan )
        , _comm_data( CommDataType( particles ) )
        , _overallocation( overallocation )
    {
    }

    //! Get the communication send buffer.
    buffer_type getSendBuffer() const { return _comm_data._send_buffer; }
    //! Get the communication receive buffer.
    buffer_type getReceiveBuffer() const { return _comm_data._recv_buffer; }

    //! Get the particles to communicate.
    particle_data_type getData() const { return _comm_data._particles; }
    //! Update the particles to communicate.
    void setData( const particle_data_type& particles )
    {
        _comm_data._particles = particles;
    }

    //! Current send buffer size.
    auto sendSize() { return _send_size; }
    //! Current receive buffer size.
    auto receiveSize() { return _recv_size; }
    //! Current allocated send buffer space.
    auto sendCapacity() { return _comm_data._send_buffer.extent( 0 ); }
    //! Current allocated receive buffer space.
    auto receiveCapacity() { return _comm_data._recv_buffer.extent( 0 ); }
    /*!
      \brief Reduce the communication buffers to the current send/receive
      sizes, optionally keeping the overallocation factor.
    */
    void shrinkToFit( const bool use_overallocation = false )
    {
        auto shrunk_send_size = _send_size;
        auto shrunk_recv_size = _recv_size;
        if ( use_overallocation )
        {
            shrunk_send_size *= _overallocation;
            shrunk_recv_size *= _overallocation;
        }
        _comm_data.reallocateSend( shrunk_send_size );
        _comm_data.reallocateReceive( shrunk_recv_size );
    }

    //! Perform the communication (migrate, gather, scatter).
    virtual void apply() = 0;

    //! Perform the communication (migrate, gather, scatter) in the given
    //! execution space.
    template <class ExecutionSpace>
    void apply( ExecutionSpace );

    //! Reserve buffer space, setting a new overallocation factor.
    void reserveImpl( const CommPlanType& comm_plan,
                      const particle_data_type particles,
                      const std::size_t total_send,
                      const std::size_t total_recv,
                      const double overallocation )
    {
        if ( overallocation < 1.0 )
            throw std::runtime_error( "Cabana::CommunicationPlan: "
                                      "Cannot allocate buffers with less "
                                      "space than the data to communicate!" );
        _overallocation = overallocation;

        reserveImpl( comm_plan, particles, total_send, total_recv );
    }
    //! Reserve buffer space using the current overallocation factor.
    void reserveImpl( const CommPlanType& comm_plan,
                      const particle_data_type particles,
                      const std::size_t total_send,
                      const std::size_t total_recv )
    {
        _comm_plan = comm_plan;
        setData( particles );

        auto send_capacity = sendCapacity();
        auto new_send_size = static_cast<std::size_t>(
            static_cast<double>( total_send ) * _overallocation );
        if ( new_send_size > send_capacity )
            _comm_data.reallocateSend( new_send_size );

        auto recv_capacity = receiveCapacity();
        auto new_recv_size = static_cast<std::size_t>(
            static_cast<double>( total_recv ) * _overallocation );
        if ( new_recv_size > recv_capacity )
            _comm_data.reallocateReceive( new_recv_size );

        _send_size = total_send;
        _recv_size = total_recv;
    }
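
    // Worked example (sketch, not part of the library): with
    // _overallocation = 1.5 and total_send = 100, reserveImpl() grows the
    // send buffer to a capacity of 150 if it is currently smaller, while
    // sendSize() still reports 100. A later call with total_send = 90 needs
    // 90 * 1.5 = 135 <= 150 and therefore reallocates nothing; shrinkToFit()
    // would drop the capacity back to 90 (or 135 with use_overallocation).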

  protected:
    //! Get the total number of components in the slice.
    auto getSliceComponents() { return _comm_data._num_comp; }

    //! Communication plan.
    plan_type _comm_plan;
    //! Communication data.
    comm_data_type _comm_data;
    //! Overallocation factor.
    double _overallocation;
    //! Send size.
    std::size_t _send_size;
    //! Receive size.
    std::size_t _recv_size;
};

} // end namespace Cabana

#endif // end CABANA_COMMUNICATIONPLAN_HPP