/**
 * @file mpi.c
 * @brief MPI Support Module
 *
 * The ROme OpTimistic Simulator 2.0.0
 * A General-Purpose Multithreaded Parallel/Distributed Simulation Platform
 */
#ifdef HAVE_MPI

#include <stdbool.h>

#include <communication/mpi.h>
#include <communication/wnd.h>
#include <communication/gvt.h>
#include <communication/communication.h>	// MSG_FINI, MSG_PADDING, MSG_META_SIZE, SLAB_MSG_SIZE
#include <queues/queues.h>
#include <core/core.h>
#include <arch/atomic.h>
#include <statistics/statistics.h>

/// Flag telling whether the MPI runtime supports multithreading
bool mpi_support_multithread;

/// Global lock used to serialize MPI calls when multithread support is not available from MPI
spinlock_t mpi_lock;

/// A guard to ensure isolation in the message receiving routine
static spinlock_t msgs_lock;

/// Number of kernel instances that have reached the termination condition
static unsigned int terminated = 0;

/// MPI Requests to handle termination detection collection asynchronously
static MPI_Request *termination_reqs;

/// A guard to ensure isolation in collect_termination()
static spinlock_t msgs_fini;

/// MPI Operation to reduce statistics
static MPI_Op reduce_stats_op;

/// MPI Datatype to describe the content of a struct stat_t
static MPI_Datatype stats_mpi_t;

/// MPI Communicator for event/control messages
static MPI_Comm msg_comm;

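/**
 * @brief Check if there are pending messages
 *
 * Probes MPI_COMM_WORLD for an incoming message carrying the given tag,
 * without actually receiving it.
 */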
bool pending_msgs(int tag)
{
    int flag = 0;
    lock_mpi();
    MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
    unlock_mpi();
    return (bool)flag;
}

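/**
 * @brief Check if an MPI request has been completed
 *
 * The request is tested through MPI_Test, wrapped in lock_mpi()/unlock_mpi()
 * so that it is safe also when the MPI library offers no multithread support.
 */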
bool is_request_completed(MPI_Request *req)
{
    int flag = 0;
    lock_mpi();
    MPI_Test(req, &flag, MPI_STATUS_IGNORE);
    unlock_mpi();
    return (bool)flag;
}

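/**
 * @brief Send a message to a remote LP
 *
 * The message is stamped with the sender's current phase colour, registered
 * for GVT accounting, and shipped asynchronously with MPI_Isend over the
 * dedicated msg_comm communicator. The outgoing node is kept in the outgoing
 * queue until the delivery is completed.
 */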
void send_remote_msg(msg_t *msg)
{
    outgoing_msg *out_msg = allocate_outgoing_msg();
    out_msg->msg = msg;
    out_msg->msg->colour = threads_phase_colour[local_tid];
    unsigned int dest = find_kernel_by_gid(msg->receiver);

    validate_msg(msg);

    register_outgoing_msg(out_msg->msg);

    lock_mpi();
    MPI_Isend(((char *)out_msg->msg) + MSG_PADDING, MSG_META_SIZE + msg->size, MPI_BYTE, dest, msg->receiver.to_int, msg_comm, &out_msg->req);
    unlock_mpi();

    // Keep the message in the outgoing queue until it is delivered
    store_outgoing_msg(out_msg, dest);
}

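/**
 * @brief Receive remote messages
 *
 * Matched probe/receive loop: each incoming message is first matched with
 * MPI_Improbe, a buffer is taken from the destination LP's slab (or from the
 * heap for oversized payloads), and the matched message is then extracted
 * with MPI_Mrecv and handed to the bottom-half queue.
 */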
void receive_remote_msgs(void)
{
    int size;
    msg_t *msg;
    MPI_Status status;
    MPI_Message mpi_msg;
    int pending;
    struct lp_struct *lp;
    GID_t gid;

    // TODO: given the latest changes in the platform, this *might*
    // be removed.
    if (!spin_trylock(&msgs_lock))
        return;

    while (true) {
        lock_mpi();
        MPI_Improbe(MPI_ANY_SOURCE, MPI_ANY_TAG, msg_comm, &pending, &mpi_msg, &status);
        unlock_mpi();

        if (!pending)
            goto out;

        MPI_Get_count(&status, MPI_BYTE, &size);

        if (likely(MSG_PADDING + size <= SLAB_MSG_SIZE)) {
            set_gid(gid, status.MPI_TAG);
            lp = find_lp_by_gid(gid);
            msg = get_msg_from_slab(lp);
        } else {
            msg = rsalloc(MSG_PADDING + size);
            bzero(msg, MSG_PADDING);
        }

        // Receive the message. Use MPI_Mrecv to be sure that the very same
        // message which was matched by the previous MPI_Improbe is extracted.
        lock_mpi();
        MPI_Mrecv(((char *)msg) + MSG_PADDING, size, MPI_BYTE, &mpi_msg, MPI_STATUS_IGNORE);
        unlock_mpi();

        validate_msg(msg);
        insert_bottom_half(msg);
    }
 out:
    spin_unlock(&msgs_lock);
}

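/**
 * @brief Check if all kernels have reached the termination condition
 *
 * @return true if every simulation kernel instance has notified termination
 */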
bool all_kernels_terminated(void)
{
    return (terminated == n_ker);
}

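/**
 * @brief Check if other kernels have reached the termination condition
 *
 * Drains pending MSG_FINI notifications and increments the count of
 * terminated kernel instances accordingly.
 */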
void collect_termination(void)
{
    int res;
    unsigned int tdata;

    if (terminated == 0 || !spin_trylock(&msgs_fini))
        return;

    while (pending_msgs(MSG_FINI)) {
        lock_mpi();
        res = MPI_Recv(&tdata, 1, MPI_UNSIGNED, MPI_ANY_SOURCE, MSG_FINI, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        unlock_mpi();
        if (unlikely(res != 0)) {
            rootsim_error(true, "MPI_Recv did not complete correctly");
            return;
        }
        terminated++;
    }
    spin_unlock(&msgs_fini);
}

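/**
 * @brief Notify all the kernels about local termination
 *
 * A MSG_FINI control message is sent asynchronously to every other kernel
 * instance; the matching requests are collected in termination_reqs.
 */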
void broadcast_termination(void)
{
    unsigned int i;
    lock_mpi();
    for (i = 0; i < n_ker; i++) {
        if (i == kid)
            continue;
        MPI_Isend(&i, 1, MPI_UNSIGNED, i, MSG_FINI, MPI_COMM_WORLD, &termination_reqs[i]);
    }
    terminated++;
    unlock_mpi();
}

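/**
 * @brief Reduce operation for statistics
 *
 * User-defined MPI reduction for struct stat_t: accumulated fields are
 * summed, except for the GVT round time bounds which are combined with
 * fmin()/fmax().
 */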
static void reduce_stat_vector(struct stat_t *in, struct stat_t *inout, int *len, MPI_Datatype *dptr)
{
    (void)dptr;
    int i = 0;

    for (i = 0; i < *len; ++i) {
        inout[i].vec += in[i].vec;
        inout[i].gvt_round_time += in[i].gvt_round_time;
        inout[i].gvt_round_time_min = fmin(inout[i].gvt_round_time_min, in[i].gvt_round_time_min);
        inout[i].gvt_round_time_max = fmax(inout[i].gvt_round_time_max, in[i].gvt_round_time_max);
        inout[i].max_resident_set += in[i].max_resident_set;
    }
}

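/// Number of fields in struct stat_t (assumed to contain only doubles), used to lay out the custom MPI datatype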
#define MPI_TYPE_STAT_LEN (sizeof(struct stat_t)/sizeof(double))

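/**
 * @brief Initialize MPI Datatype and Operation for statistics reduction
 */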
static void stats_reduction_init(void)
{
    // This is a compile-time fail-safe
    static_assert(offsetof(struct stat_t, gvt_round_time_max) == (sizeof(double) * 19), "The packing assumptions on struct stat_t are wrong or its definition has been modified");

    unsigned i;

    // Boilerplate to create a new MPI data type
    MPI_Datatype type[MPI_TYPE_STAT_LEN];
    MPI_Aint disp[MPI_TYPE_STAT_LEN];
    int block_lengths[MPI_TYPE_STAT_LEN];

    // Initialize those arrays (we assume that struct stat_t is packed tightly)
    for (i = 0; i < MPI_TYPE_STAT_LEN; ++i) {
        type[i] = MPI_DOUBLE;
        disp[i] = i * sizeof(double);
        block_lengths[i] = 1;
    }

    // Create the custom type and commit the changes
    MPI_Type_create_struct(MPI_TYPE_STAT_LEN, block_lengths, disp, type, &stats_mpi_t);
    MPI_Type_commit(&stats_mpi_t);

    // Create the MPI Operation used to reduce stats
    if (master_thread()) {
        MPI_Op_create((MPI_User_function *)reduce_stat_vector, true, &reduce_stats_op);
    }
}

#undef MPI_TYPE_STAT_LEN

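/**
 * @brief Invoke statistics reduction
 *
 * The local statistics of every kernel instance are reduced into @p global
 * on rank 0, using the custom datatype and operation built in
 * stats_reduction_init().
 */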
void mpi_reduce_statistics(struct stat_t *global, struct stat_t *local)
{
    MPI_Reduce(local, global, 1, stats_mpi_t, reduce_stats_op, 0, MPI_COMM_WORLD);
}

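/**
 * @brief Setup the distributed termination subsystem
 */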
void dist_termination_init(void)
{
    /* init for collective termination */
    termination_reqs = rsalloc(n_ker * sizeof(MPI_Request));
    unsigned int i;
    for (i = 0; i < n_ker; i++) {
        termination_reqs[i] = MPI_REQUEST_NULL;
    }
    spinlock_init(&msgs_fini);
}

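/**
 * @brief Cleanup routine of the distributed termination subsystem
 *
 * Waits for the completion of all outstanding termination notifications.
 */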
void dist_termination_finalize(void)
{
    MPI_Waitall(n_ker, termination_reqs, MPI_STATUSES_IGNORE);
}

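/**
 * @brief Synchronize all the kernels
 *
 * The master thread of each kernel instance enters a barrier on a private
 * duplicate of MPI_COMM_WORLD; all local worker threads then meet on the
 * thread barrier.
 */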
void syncronize_all(void)
{
    if (master_thread()) {
        MPI_Comm comm;
        MPI_Comm_dup(MPI_COMM_WORLD, &comm);
        MPI_Barrier(comm);
        MPI_Comm_free(&comm);
    }
    thread_barrier(&all_thread_barrier);
}

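/**
 * @brief Initialize MPI subsystem
 *
 * MPI is initialized requesting MPI_THREAD_MULTIPLE; if the library provides
 * a lower thread level, MPI calls are serialized through mpi_lock. A dedicated
 * communicator for event messages is also created here.
 */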
void mpi_init(int *argc, char ***argv)
{
    int mpi_thread_lvl_provided = 0;
    MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &mpi_thread_lvl_provided);

    mpi_support_multithread = true;
    if (mpi_thread_lvl_provided < MPI_THREAD_MULTIPLE) {
        // MPI does not support thread-safe API calls
        if (mpi_thread_lvl_provided < MPI_THREAD_SERIALIZED) {
            // MPI does not even support serialized calls: we cannot continue
            rootsim_error(true, "The MPI implementation does not support threads [current thread level support: %d]\n", mpi_thread_lvl_provided);
        }
        mpi_support_multithread = false;
    }

    spinlock_init(&mpi_lock);

    MPI_Comm_size(MPI_COMM_WORLD, (int *)&n_ker);
    MPI_Comm_rank(MPI_COMM_WORLD, (int *)&kid);

    // Create a separate communicator which we use to send event messages
    MPI_Comm_dup(MPI_COMM_WORLD, &msg_comm);
}

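/**
 * @brief Initialize inter-kernel communication
 */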
void inter_kernel_comm_init(void)
{
    spinlock_init(&msgs_lock);

    outgoing_window_init();
    gvt_comm_init();
    dist_termination_init();
    stats_reduction_init();
}

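/**
 * @brief Finalize inter-kernel communication
 */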
void inter_kernel_comm_finalize(void)
{
    dist_termination_finalize();
    //outgoing_window_finalize();
    gvt_comm_finalize();
}

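/**
 * @brief Finalize MPI
 *
 * Must be invoked by the master thread only, after all ranks have completed.
 */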
void mpi_finalize(void)
{
    if (master_thread()) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Comm_free(&msg_comm);
        MPI_Finalize();
    } else {
        rootsim_error(true, "MPI finalize has been invoked by a non master thread: T%u\n", local_tid);
    }
}

#endif /* HAVE_MPI */