Source

mpi3-fortran / ompi / mca / coll / sm / coll_sm_module.c

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2009-2012 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2012 Los Alamos National Security, LLC.  
 *                         All rights reserved. 
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file
 *
 * Warning: this is not for the faint of heart -- don't even bother
 * reading this source code if you don't have a strong understanding
 * of nested data structures and pointer math (remember that
 * associativity and order of C operations is *critical* in terms of
 * pointer math!).
 */

#include "ompi_config.h"

#include <stdio.h>
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_SCHED_H
#include <sched.h>
#endif
#include <sys/types.h>
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif  /* HAVE_SYS_MMAN_H */
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */

#include "mpi.h"
#include "opal_stdint.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/os_path.h"

#include "ompi/communicator/communicator.h"
#include "ompi/group/group.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/proc/proc.h"
#include "coll_sm.h"


/*
 * Global variables
 */
uint32_t mca_coll_sm_one = 1;


/*
 * Local functions
 */
static int sm_module_enable(mca_coll_base_module_t *module,
                          struct ompi_communicator_t *comm);
static bool have_local_peers(ompi_group_t *group, size_t size);
static int bootstrap_comm(ompi_communicator_t *comm,
                          mca_coll_sm_module_t *module);

/*
 * Module constructor
 */
static void mca_coll_sm_module_construct(mca_coll_sm_module_t *module)
{
    module->enabled = false;
    module->sm_comm_data = NULL;
    module->previous_reduce = NULL;
    module->previous_reduce_module = NULL;
}

/*
 * Module destructor
 */
static void mca_coll_sm_module_destruct(mca_coll_sm_module_t *module)
{
    mca_coll_sm_comm_t *c = module->sm_comm_data;

    if (NULL != c) {
        /* Munmap the per-communicator shmem data segment */
        if (NULL != c->sm_bootstrap_meta) {
            /* Ignore any errors -- what are we going to do about
               them? */
            mca_common_sm_fini(c->sm_bootstrap_meta);
        }
        free(c);
    }

    /* It should always be non-NULL, but just in case */
    if (NULL != module->previous_reduce_module) {
        OBJ_RELEASE(module->previous_reduce_module);
    }

    module->enabled = false;
}


OBJ_CLASS_INSTANCE(mca_coll_sm_module_t,
                   mca_coll_base_module_t,
                   mca_coll_sm_module_construct,
                   mca_coll_sm_module_destruct);

/*
 * Initial query function that is invoked during MPI_INIT, allowing
 * this component to disqualify itself if it doesn't support the
 * required level of thread support.  This function is invoked exactly
 * once.
 */
int mca_coll_sm_init_query(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    /* if no session directory was created, then we cannot be used */
    if (NULL == ompi_process_info.job_session_dir) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* Don't do much here because we don't really want to allocate any
       shared memory until this component is selected to be used. */
    opal_output_verbose(10, mca_coll_base_output,
                        "coll:sm:init_query: pick me! pick me!");
    return OMPI_SUCCESS;
}


/*
 * Invoked when there's a new communicator that has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return.
 */
mca_coll_base_module_t *
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority)
{
    mca_coll_sm_module_t *sm_module;

    /* If we're intercomm, or if there's only one process in the
       communicator, or if not all the processes in the communicator
       are not on this node, then we don't want to run */
    if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) ||
        !have_local_peers(comm->c_local_group, ompi_comm_size(comm))) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name);
	return NULL;
    }

    /* Get the priority level attached to this module. If priority is less
     * than or equal to 0, then the module is unavailable. */
    *priority = mca_coll_sm_component.sm_priority;
    if (mca_coll_sm_component.sm_priority <= 0) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:comm_query (%d/%s): priority too low; disqualifying myself", comm->c_contextid, comm->c_name);
	return NULL;
    }

    sm_module = OBJ_NEW(mca_coll_sm_module_t);
    if (NULL == sm_module) {
        return NULL;
    }

    /* All is good -- return a module */
    sm_module->super.coll_module_enable = sm_module_enable;
    sm_module->super.ft_event        = mca_coll_sm_ft_event;
    sm_module->super.coll_allgather  = NULL;
    sm_module->super.coll_allgatherv = NULL;
    sm_module->super.coll_allreduce  = mca_coll_sm_allreduce_intra;
    sm_module->super.coll_alltoall   = NULL;
    sm_module->super.coll_alltoallv  = NULL;
    sm_module->super.coll_alltoallw  = NULL;
    sm_module->super.coll_barrier    = mca_coll_sm_barrier_intra;
    sm_module->super.coll_bcast      = mca_coll_sm_bcast_intra;
    sm_module->super.coll_exscan     = NULL;
    sm_module->super.coll_gather     = NULL;
    sm_module->super.coll_gatherv    = NULL;
    sm_module->super.coll_reduce     = mca_coll_sm_reduce_intra;
    sm_module->super.coll_reduce_scatter = NULL;
    sm_module->super.coll_scan       = NULL;
    sm_module->super.coll_scatter    = NULL;
    sm_module->super.coll_scatterv   = NULL;

    opal_output_verbose(10, mca_coll_base_output,
                        "coll:sm:comm_query (%d/%s): pick me! pick me!", 
                        comm->c_contextid, comm->c_name);
    return &(sm_module->super);
}


/*
 * Init module on the communicator
 */
static int sm_module_enable(mca_coll_base_module_t *module,
                            struct ompi_communicator_t *comm)
{
    if (NULL == comm->c_coll.coll_reduce ||
        NULL == comm->c_coll.coll_reduce_module) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable (%d/%s): no underlying reduce; disqualifying myself",
                            comm->c_contextid, comm->c_name);
        return OMPI_ERROR;
    }

    /* We do everything lazily in ompi_coll_sm_enable() */
    return OMPI_SUCCESS;
}

int ompi_coll_sm_lazy_enable(mca_coll_base_module_t *module,
                             struct ompi_communicator_t *comm)
{
    int i, j, root, ret;
    int rank = ompi_comm_rank(comm);
    int size = ompi_comm_size(comm);
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
    mca_coll_sm_comm_t *data = NULL;
    size_t control_size, frag_size;
    mca_coll_sm_component_t *c = &mca_coll_sm_component;
#if OPAL_HAVE_HWLOC
    opal_hwloc_base_memory_segment_t *maffinity;
#endif
    int parent, min_child, max_child, num_children;
    unsigned char *base = NULL;
    const int num_barrier_buffers = 2;

    /* Just make sure we haven't been here already */
    if (sm_module->enabled) {
        return OMPI_SUCCESS;
    }
    sm_module->enabled = true;

#if OPAL_HAVE_HWLOC
    /* Get some space to setup memory affinity (just easier to try to
       alloc here to handle the error case) */
    maffinity = (opal_hwloc_base_memory_segment_t*)
        malloc(sizeof(opal_hwloc_base_memory_segment_t) * 
               c->sm_comm_num_segments * 3);
    if (NULL == maffinity) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable (%d/%s): malloc failed (1)", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
#endif

    /* Allocate data to hang off the communicator.  The memory we
       alloc will be laid out as follows:

       1. mca_coll_base_comm_t
       2. array of num_segments mca_coll_base_mpool_index_t instances
          (pointed to by the array in 2)
       3. array of ompi_comm_size(comm) mca_coll_sm_tree_node_t
          instances
       4. array of sm_tree_degree pointers to other tree nodes (i.e.,
          this nodes' children) for each instance of
          mca_coll_sm_tree_node_t
    */
    sm_module->sm_comm_data = data = (mca_coll_sm_comm_t*)
        malloc(sizeof(mca_coll_sm_comm_t) + 
               (c->sm_comm_num_segments * 
                sizeof(mca_coll_sm_data_index_t)) +
               (size * 
                (sizeof(mca_coll_sm_tree_node_t) +
                 (sizeof(mca_coll_sm_tree_node_t*) * c->sm_tree_degree))));
    if (NULL == data) {
#if OPAL_HAVE_HWLOC
        free(maffinity);
#endif
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable (%d/%s): malloc failed (2)", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }
    data->mcb_operation_count = 0;

    /* Setup #2: set the array to point immediately beyond the
       mca_coll_base_comm_t */
    data->mcb_data_index = (mca_coll_sm_data_index_t*) (data + 1);
    /* Setup array of pointers for #3 */
    data->mcb_tree = (mca_coll_sm_tree_node_t*)
        (data->mcb_data_index + c->sm_comm_num_segments);
    /* Finally, setup the array of children pointers in the instances
       in #5 to point to their corresponding arrays in #6 */
    data->mcb_tree[0].mcstn_children = (mca_coll_sm_tree_node_t**)
        (data->mcb_tree + size);
    for (i = 1; i < size; ++i) {
        data->mcb_tree[i].mcstn_children = 
            data->mcb_tree[i - 1].mcstn_children + c->sm_tree_degree;
    }

    /* Pre-compute a tree for a given number of processes and degree.
       We'll re-use this tree for all possible values of root (i.e.,
       shift everyone's process to be the "0"/root in this tree. */
    for (root = 0; root < size; ++root) {
        parent = (root - 1) / mca_coll_sm_component.sm_tree_degree;
        num_children = mca_coll_sm_component.sm_tree_degree;
    
        /* Do we have children?  If so, how many? */
        
        if ((root * num_children) + 1 >= size) {
            /* Leaves */
            min_child = -1;
            max_child = -1;
            num_children = 0;
        } else {
            /* Interior nodes */
            min_child = root * num_children + 1;
            max_child = root * num_children + num_children;
            if (max_child >= size) {
                max_child = size - 1;
            }
            num_children = max_child - min_child + 1;
        }

        /* Save the values */
        data->mcb_tree[root].mcstn_id = root;
        if (root == 0 && parent == 0) {
            data->mcb_tree[root].mcstn_parent = NULL;
        } else {
            data->mcb_tree[root].mcstn_parent = &data->mcb_tree[parent];
        }
        data->mcb_tree[root].mcstn_num_children = num_children;
        for (i = 0; i < c->sm_tree_degree; ++i) {
            data->mcb_tree[root].mcstn_children[i] = 
                (i < num_children) ?
                &data->mcb_tree[min_child + i] : NULL;
        }
    }

    /* Attach to this communicator's shmem data segment */
    if (OMPI_SUCCESS != (ret = bootstrap_comm(comm, sm_module))) {
        free(data);
#if OPAL_HAVE_HWLOC
        free(maffinity);
#endif
        sm_module->sm_comm_data = NULL;
        return ret;
    }

    /* Once the communicator is bootstrapped, setup the pointers into
       the per-communicator shmem data segment.  First, setup the
       barrier buffers.  There are 2 sets of barrier buffers (because
       there can never be more than one outstanding barrier occuring
       at any timie).  Setup pointers to my control buffers, my
       parents, and [the beginning of] my children (note that the
       children are contiguous, so having the first pointer and the
       num_children from the mcb_tree data is sufficient). */
    control_size = c->sm_control_size;
    base = data->sm_bootstrap_meta->module_data_addr;
    data->mcb_barrier_control_me = (uint32_t*)
        (base + (rank * control_size * num_barrier_buffers * 2));
    if (data->mcb_tree[rank].mcstn_parent) {
        data->mcb_barrier_control_parent = (uint32_t*)
            (base +
             (data->mcb_tree[rank].mcstn_parent->mcstn_id * control_size * 
              num_barrier_buffers * 2));
    } else {
        data->mcb_barrier_control_parent = NULL;
    }
    if (data->mcb_tree[rank].mcstn_num_children > 0) {
        data->mcb_barrier_control_children = (uint32_t*)
            (base +
             (data->mcb_tree[rank].mcstn_children[0]->mcstn_id * control_size *
              num_barrier_buffers * 2));
    } else {
        data->mcb_barrier_control_children = NULL;
    }
    data->mcb_barrier_count = 0;

    /* Next, setup the pointer to the in-use flags.  The number of
       segments will be an even multiple of the number of in-use
       flags. */
    base += (c->sm_control_size * size * num_barrier_buffers * 2);
    data->mcb_in_use_flags = (mca_coll_sm_in_use_flag_t*) base;

    /* All things being equal, if we're rank 0, then make the in-use
       flags be local (memory affinity).  Then zero them all out so
       that they're marked as unused. */
    j = 0;
    if (0 == rank) {
#if OPAL_HAVE_HWLOC
        maffinity[j].mbs_start_addr = base;
        maffinity[j].mbs_len = c->sm_control_size * 
            c->sm_comm_num_in_use_flags;
#endif
        /* Set the op counts to 1 (actually any nonzero value will do)
           so that the first time children/leaf processes come
           through, they don't see a value of 0 and think that the
           root/parent has already set the count to their op number
           (i.e., 0 is the first op count value). */
        for (i = 0; i < mca_coll_sm_component.sm_comm_num_in_use_flags; ++i) {
            ((mca_coll_sm_in_use_flag_t *)base)[i].mcsiuf_operation_count = 1;
            ((mca_coll_sm_in_use_flag_t *)base)[i].mcsiuf_num_procs_using = 0;
        }
        ++j;
    }

    /* Next, setup pointers to the control and data portions of the
       segments, as well as to the relevant in-use flags. */
    base += (c->sm_comm_num_in_use_flags * c->sm_control_size);
    control_size = size * c->sm_control_size;
    frag_size = size * c->sm_fragment_size;
    for (i = 0; i < c->sm_comm_num_segments; ++i) {
        data->mcb_data_index[i].mcbmi_control = (uint32_t*)
            (base + (i * (control_size + frag_size)));
        data->mcb_data_index[i].mcbmi_data = 
            (((char*) data->mcb_data_index[i].mcbmi_control) + 
             control_size);

#if OPAL_HAVE_HWLOC
        /* Memory affinity: control */

        maffinity[j].mbs_len = c->sm_control_size;
        maffinity[j].mbs_start_addr = (void *)
            (data->mcb_data_index[i].mcbmi_control +
             (rank * c->sm_control_size));
        ++j;

        /* Memory affinity: data */

        maffinity[j].mbs_len = c->sm_fragment_size;
        maffinity[j].mbs_start_addr = 
            data->mcb_data_index[i].mcbmi_data +
            (rank * c->sm_control_size);
        ++j;
#endif
    }

#if OPAL_HAVE_HWLOC
    /* Setup memory affinity so that the pages that belong to this
       process are local to this process */
    opal_hwloc_base_memory_set(maffinity, j);
    free(maffinity);
#endif

    /* Zero out the control structures that belong to this process */
    memset(data->mcb_barrier_control_me, 0, 
           num_barrier_buffers * 2 * c->sm_control_size);
    for (i = 0; i < c->sm_comm_num_segments; ++i) {
        memset((void *) data->mcb_data_index[i].mcbmi_control, 0,
               c->sm_control_size);
    }

    /* Save previous component's reduce information */
    sm_module->previous_reduce = comm->c_coll.coll_reduce;
    sm_module->previous_reduce_module = comm->c_coll.coll_reduce_module;
    OBJ_RETAIN(sm_module->previous_reduce_module);

    /* Indicate that we have successfully attached and setup */
    opal_atomic_add(&(data->sm_bootstrap_meta->module_seg->seg_inited), 1);

    /* Wait for everyone in this communicator to attach and setup */
    opal_output_verbose(10, mca_coll_base_output,
                        "coll:sm:enable (%d/%s): waiting for peers to attach",
                        comm->c_contextid, comm->c_name);
    SPIN_CONDITION(size == data->sm_bootstrap_meta->module_seg->seg_inited, seg_init_exit);

    /* Once we're all here, remove the mmap file; it's not needed anymore */
    if (0 == rank) {
        unlink(data->sm_bootstrap_meta->shmem_ds.seg_name);
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable (%d/%s): removed mmap file %s", 
                            comm->c_contextid, comm->c_name,
                            data->sm_bootstrap_meta->shmem_ds.seg_name);
    }

    /* All done */

    opal_output_verbose(10, mca_coll_base_output,
                        "coll:sm:enable (%d/%s): success!", 
                        comm->c_contextid, comm->c_name);
    return OMPI_SUCCESS;
}


static bool have_local_peers(ompi_group_t *group, size_t size)
{
    size_t i;
    ompi_proc_t *proc;

    for (i = 0; i < size; ++i) {
        proc = ompi_group_peer_lookup(group,i);
        if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
            return false;
        }
    }

    return true;
}


static int bootstrap_comm(ompi_communicator_t *comm,
                          mca_coll_sm_module_t *module)
{
    int i;
    char *shortpath, *fullpath;
    mca_coll_sm_component_t *c = &mca_coll_sm_component;
    mca_coll_sm_comm_t *data = module->sm_comm_data;
    int comm_size = ompi_comm_size(comm);
    int num_segments = c->sm_comm_num_segments;
    int num_in_use = c->sm_comm_num_in_use_flags;
    int frag_size = c->sm_fragment_size;
    int control_size = c->sm_control_size;
    ompi_process_name_t *lowest_name = NULL;
    size_t size;
    ompi_proc_t *proc;

    /* Make the rendezvous filename for this communicators shmem data
       segment.  The CID is not guaranteed to be unique among all
       procs on this node, so also pair it with the PID of the proc
       with the lowest ORTE name to form a unique filename. */
    proc = ompi_group_peer_lookup(comm->c_local_group, 0);
    lowest_name = &(proc->proc_name);
    for (i = 1; i < comm_size; ++i) {
        proc = ompi_group_peer_lookup(comm->c_local_group, i);
        if (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, 
                                          &(proc->proc_name),
                                          lowest_name) < 0) {
            lowest_name = &(proc->proc_name);
        }
    }
    asprintf(&shortpath, "coll-sm-cid-%d-name-%s.mmap", comm->c_contextid,
             OMPI_NAME_PRINT(lowest_name));
    if (NULL == shortpath) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): asprintf failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    fullpath = opal_os_path(false, ompi_process_info.job_session_dir,
                            shortpath, NULL);
    free(shortpath);
    if (NULL == fullpath) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): opal_os_path failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Calculate how much space we need in the per-communicator shmem
       data segment.  There are several values to add:

       - size of the barrier data (2 of these):
           - fan-in data (num_procs * control_size)
           - fan-out data (num_procs * control_size)
       - size of the "in use" buffers:
           - num_in_use_buffers * control_size
       - size of the message fragment area (one for each segment):
           - control (num_procs * control_size)
           - fragment data (num_procs * (frag_size))

       So it's:

           barrier: 2 * control_size + 2 * control_size
           in use:  num_in_use * control_size
           control: num_segments * (num_procs * control_size * 2 +
                                    num_procs * control_size)
           message: num_segments * (num_procs * frag_size)
     */

    size = 4 * control_size +
        (num_in_use * control_size) +
        (num_segments * (comm_size * control_size * 2)) +
        (num_segments * (comm_size * frag_size));
    opal_output_verbose(10, mca_coll_base_output,
                        "coll:sm:enable:bootstrap comm (%d/%s): attaching to %" PRIsize_t " byte mmap: %s",
                        comm->c_contextid, comm->c_name, size, fullpath);
    data->sm_bootstrap_meta =
        mca_common_sm_init_group(comm->c_local_group, size, fullpath,
                                 sizeof(mca_common_sm_seg_header_t),
                                 sizeof(void*));
    if (NULL == data->sm_bootstrap_meta) {
        opal_output_verbose(10, mca_coll_base_output,
                            "coll:sm:enable:bootstrap comm (%d/%s): mca_common_sm_init_group failed", 
                            comm->c_contextid, comm->c_name);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* All done */
    return OMPI_SUCCESS;
}


int mca_coll_sm_ft_event(int state) {
    if(OPAL_CRS_CHECKPOINT == state) {
        ;
    }
    else if(OPAL_CRS_CONTINUE == state) {
        ;
    }
    else if(OPAL_CRS_RESTART == state) {
        ;
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    return OMPI_SUCCESS;
}