Commits

Rolf vandeVaart committed 29f3255

CUDA RDMA support in new smcuda BTL

Files changed (30)

ompi/mca/btl/btl.h

 /* btl can support failover if enabled */
 #define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200
 
+#define MCA_BTL_FLAGS_CUDA_PUT        0x0400
+#define MCA_BTL_FLAGS_CUDA_GET        0x0800
+#define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH     (64*1024) /* internal loopback */
 #define MCA_BTL_EXCLUSIVITY_DEFAULT  1024      /* GM/IB/etc. */
         uint32_t  key32[4];
         uint64_t  key64[2];
         uint8_t   key8[16];
+#if OMPI_CUDA_SUPPORT
+        uint64_t cudakey[16]; /* 64 bytes for CUDA mem handle, 64 bytes for CUDA event handle */
+#endif /* OMPI_CUDA_SUPPORT */
     } seg_key;
+#if OMPI_CUDA_SUPPORT
+    /** Address of the entire memory handle */
+    ompi_ptr_t memh_seg_addr;        
+    /** Length in bytes of the entire memory handle */
+    uint32_t   memh_seg_len;
+#endif /* OMPI_CUDA_SUPPORT */
 };
 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;
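
The new flag bits compose like the other BTL capability flags: a module ORs them into its btl_flags and upper layers test them before selecting a CUDA IPC protocol, while the enlarged seg_key and the memh_seg_addr/memh_seg_len fields carry the exported memory handle to the remote side. A minimal sketch of the flag usage (illustrative only, not part of this commit; the function name is hypothetical):

    /* Illustrative sketch, not part of this commit. */
    #include "ompi/mca/btl/btl.h"

    static void example_cuda_rdma_flags(mca_btl_base_module_t *btl)
    {
        /* A BTL that supports both directions advertises the combined flag. */
        btl->btl_flags |= MCA_BTL_FLAGS_CUDA_RDMA;  /* CUDA_GET | CUDA_PUT */

        /* Upper layers check the individual bits before choosing a protocol. */
        if (btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
            /* the receiver may pull data straight out of the exported GPU buffer */
        }
        if (btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
            /* the sender may push data straight into the exported GPU buffer */
        }
    }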
 

ompi/mca/btl/smcuda/Makefile.am

+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2009 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, 
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+
+dist_pkgdata_DATA = help-mpi-btl-smcuda.txt
+
+libmca_btl_smcuda_la_sources = \
+    btl_smcuda.c \
+    btl_smcuda.h \
+    btl_smcuda_component.c \
+    btl_smcuda_endpoint.h \
+    btl_smcuda_fifo.h \
+    btl_smcuda_frag.c \
+    btl_smcuda_frag.h 
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_ompi_btl_smcuda_DSO
+component_noinst =
+component_install = mca_btl_smcuda.la
+else
+component_noinst = libmca_btl_smcuda.la
+component_install =
+endif
+
+# See ompi/mca/common/sm/Makefile.am for an explanation of
+# libmca_common_sm.la.
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
+mca_btl_smcuda_la_LIBADD = \
+    $(top_ompi_builddir)/ompi/mca/common/sm/libmca_common_sm.la
+mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
+if MCA_ompi_cuda_support
+mca_btl_smcuda_la_LIBADD += \
+    $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
+endif
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
+libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)

ompi/mca/btl/smcuda/btl_smcuda.c

+/*
+ * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010      Los Alamos National Security, LLC.  
+ *                         All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif  /* HAVE_FCNTL_H */
+#include <errno.h>
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif  /* HAVE_SYS_MMAN_H */
+
+#include "opal/sys/atomic.h"
+#include "opal/class/opal_bitmap.h"
+#include "opal/util/output.h"
+#include "opal/util/printf.h"
+#include "opal/mca/carto/carto.h"
+#include "opal/mca/carto/base/base.h"
+#include "opal/mca/paffinity/base/base.h"
+#include "opal/mca/maffinity/base/base.h"
+#include "orte/util/proc_info.h"
+#include "opal/datatype/opal_convertor.h"
+#include "ompi/class/ompi_free_list.h"
+#include "ompi/mca/btl/btl.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/mpool/sm/mpool_sm.h"
+
+#if OPAL_ENABLE_FT_CR    == 1
+#include "opal/mca/crs/base/base.h"
+#include "opal/util/basename.h"
+#include "orte/mca/sstore/sstore.h"
+#include "ompi/runtime/ompi_cr.h"
+#endif
+
+#include "btl_smcuda.h"
+#include "btl_smcuda_endpoint.h"
+#include "btl_smcuda_frag.h"
+#include "btl_smcuda_fifo.h"
+#include "ompi/proc/proc.h"
+
+mca_btl_smcuda_t mca_btl_smcuda = {
+    {
+        &mca_btl_smcuda_component.super,
+        0, /* btl_eager_limit */
+        0, /* btl_rndv_eager_limit */
+        0, /* btl_max_send_size */
+        0, /* btl_rdma_pipeline_send_length */
+        0, /* btl_rdma_pipeline_frag_size */
+        0, /* btl_min_rdma_pipeline_size */
+        0, /* btl_exclusivity */
+        0, /* btl_latency */
+        0, /* btl_bandwidth */
+        0, /* btl flags */
+        mca_btl_smcuda_add_procs,
+        mca_btl_smcuda_del_procs,
+        NULL,
+        mca_btl_smcuda_finalize,
+        mca_btl_smcuda_alloc,
+        mca_btl_smcuda_free,
+        mca_btl_smcuda_prepare_src,
+#if OMPI_CUDA_SUPPORT
+        mca_btl_smcuda_prepare_dst,
+#else
+        NULL,
+#endif /* OMPI_CUDA_SUPPORT */
+        mca_btl_smcuda_send,
+        mca_btl_smcuda_sendi,
+        NULL,  /* put */
+        NULL,  /* get -- optionally filled during initialization */
+        mca_btl_base_dump,
+        NULL, /* mpool */
+        mca_btl_smcuda_register_error_cb, /* register error */
+        mca_btl_smcuda_ft_event
+    }
+};
+
+/*
+ * calculate offset of an address from the beginning of a shared memory segment
+ */
+#define ADDR2OFFSET(ADDR, BASE) ((char*)(ADDR) - (char*)(BASE))
+
+/*
+ * calculate an absolute address in a local address space given an offset and
+ * a base address of a shared memory segment
+ */
+#define OFFSET2ADDR(OFFSET, BASE) ((ptrdiff_t)(OFFSET) + (char*)(BASE))
+
+
+static void *mpool_calloc(size_t nmemb, size_t size)
+{
+    void *buf;
+    size_t bsize = nmemb * size;
+    mca_mpool_base_module_t *mpool = mca_btl_smcuda_component.sm_mpool;
+
+    buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0, NULL);
+
+    if (NULL == buf)
+        return NULL;
+
+    memset(buf, 0, bsize);
+    return buf;
+}
+
+static void init_maffinity(int *my_mem_node, int *max_mem_node)
+{
+    opal_carto_graph_t *topo;
+    opal_value_array_t dists;
+    int i, num_core, socket;
+    opal_paffinity_base_cpu_set_t cpus;
+    char *myslot = NULL;
+    opal_carto_node_distance_t *dist;
+    opal_carto_base_node_t *slot_node;
+
+    *my_mem_node = 0;
+    *max_mem_node = 1;
+
+    if (OMPI_SUCCESS != opal_carto_base_get_host_graph(&topo, "Memory")) {
+        return;
+    }
+
+     OBJ_CONSTRUCT(&dists, opal_value_array_t);
+     opal_value_array_init(&dists, sizeof(opal_carto_node_distance_t));
+
+    if (OMPI_SUCCESS != opal_paffinity_base_get_processor_info(&num_core))  {
+        num_core = 100;  /* set something large */
+    }
+
+     OPAL_PAFFINITY_CPU_ZERO(cpus);
+     opal_paffinity_base_get(&cpus);
+
+     /* find core we are running on */
+     for (i = 0; i < num_core; i++) {
+         if (OPAL_PAFFINITY_CPU_ISSET(i, cpus)) {
+             break;
+         }
+     }
+
+    if (OMPI_SUCCESS != opal_paffinity_base_get_map_to_socket_core(i, &socket, &i)) {
+        /* no topology info available */
+        goto out;
+    }
+    
+     asprintf(&myslot, "slot%d", socket);
+
+     slot_node = opal_carto_base_find_node(topo, myslot);
+
+     if(NULL == slot_node) {
+         goto out;
+     }
+
+     opal_carto_base_get_nodes_distance(topo, slot_node, "Memory", &dists);
+     if((*max_mem_node = opal_value_array_get_size(&dists)) < 2) {
+         goto out;
+     }
+
+     dist = (opal_carto_node_distance_t *) opal_value_array_get_item(&dists, 0);
+     opal_maffinity_base_node_name_to_id(dist->node->node_name, my_mem_node);
+out:
+     if (myslot) {
+         free(myslot);
+     }
+     OBJ_DESTRUCT(&dists);
+     opal_carto_base_free_graph(topo);
+}
+
+static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int n)
+{
+    size_t size, length, length_payload;
+    char *sm_ctl_file;
+    sm_fifo_t *my_fifos;
+    int my_mem_node=-1, num_mem_nodes=-1, i;
+    ompi_proc_t **procs;
+    size_t num_procs;
+
+    init_maffinity(&my_mem_node, &num_mem_nodes);
+    mca_btl_smcuda_component.mem_node = my_mem_node;
+    mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes;
+
+    /* lookup shared memory pool */
+    mca_btl_smcuda_component.sm_mpools = (mca_mpool_base_module_t **) calloc(num_mem_nodes,
+                                            sizeof(mca_mpool_base_module_t*));
+
+    /* create mpool for each memory node */
+    for(i = 0; i < num_mem_nodes; i++) {
+        mca_mpool_base_resources_t res;
+        mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;
+
+        /* disable memory binding if there is only one memory node */
+        res.mem_node = (num_mem_nodes == 1) ? -1 : i;
+
+        /* determine how much memory to create */
+        /*
+         * This heuristic formula mostly says that we request memory for:
+         * - nfifos FIFOs, each comprising:
+         *   . a sm_fifo_t structure
+         *   . many pointers (fifo_size of them per FIFO)
+         * - eager fragments (2*n of them, allocated in sm_free_list_inc chunks)
+         * - max fragments (sm_free_list_num of them)
+         *
+         * On top of all that, we sprinkle in some number of "opal_cache_line_size"
+         * additions to account for some padding and edge effects that may lie
+         * in the allocator.
+         */
+        res.size =
+            FIFO_MAP_NUM(n) * ( sizeof(sm_fifo_t) + sizeof(void *) * m->fifo_size + 4 * opal_cache_line_size )
+            + ( 2 * n + m->sm_free_list_inc ) * ( m->eager_limit   + 2 * opal_cache_line_size )
+            +           m->sm_free_list_num   * ( m->max_frag_size + 2 * opal_cache_line_size );
+
+        /* before we multiply by n, make sure the result won't overflow */
+        /* Stick that little pad in, particularly since we'll eventually
+         * need a little extra space.  E.g., in mca_mpool_sm_init() in
+         * mpool_sm_component.c when sizeof(mca_common_sm_module_t) is
+         * added.
+         */
+        if ( ((double) res.size) * n > LONG_MAX - 4096 )
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        res.size *= n;
+
+        /* now, create it */
+        mca_btl_smcuda_component.sm_mpools[i] =
+            mca_mpool_base_module_create(mca_btl_smcuda_component.sm_mpool_name,
+                                         smcuda_btl, &res);
+        /* Sanity check to ensure that we found it */
+        if(NULL == mca_btl_smcuda_component.sm_mpools[i])
+            return OMPI_ERR_OUT_OF_RESOURCE;
+
+        if(i == my_mem_node)
+            mca_btl_smcuda_component.sm_mpool = mca_btl_smcuda_component.sm_mpools[i];
+#if OMPI_CUDA_SUPPORT
+        /* Create a local memory pool that sends handles to the remote side */
+        mca_common_cuda_init_cuda_mpool((mca_btl_base_module_t *)smcuda_btl);
+        if (NULL == smcuda_btl->super.btl_mpool) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
+    }
+
+
+    mca_btl_smcuda_component.sm_mpool_base =
+        mca_btl_smcuda_component.sm_mpools[0]->mpool_base(mca_btl_smcuda_component.sm_mpools[0]);
+
+    /* create a list of peers */
+    mca_btl_smcuda_component.sm_peers = (struct mca_btl_base_endpoint_t**)
+        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
+    if(NULL == mca_btl_smcuda_component.sm_peers)
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* Allocate Shared Memory BTL process coordination
+     * data structure.  This will reside in shared memory */
+
+    /* set file name */
+    if(asprintf(&sm_ctl_file, "%s"OPAL_PATH_SEP"shared_mem_btl_module.%s",
+                orte_process_info.job_session_dir,
+                orte_process_info.nodename) < 0)
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* Pass in a data segment alignment of 0 to get no data
+       segment (only the shared control structure) */
+    size = sizeof(mca_common_sm_seg_header_t) +
+        n * (sizeof(sm_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + opal_cache_line_size;
+    procs = ompi_proc_world(&num_procs);
+    if (!(mca_btl_smcuda_component.sm_seg =
+          mca_common_sm_init(procs, num_procs, size, sm_ctl_file,
+                             sizeof(mca_common_sm_seg_header_t),
+                             opal_cache_line_size))) {
+        opal_output(0, "mca_btl_smcuda_add_procs: unable to create shared memory "
+                    "BTL coordinating structure :: size %lu \n",
+                    (unsigned long)size);
+        free(procs);
+        free(sm_ctl_file);
+        return OMPI_ERROR;
+    }
+    free(procs);
+    free(sm_ctl_file);
+
+    /* check to make sure number of local procs is within the
+     * specified limits */
+    if(mca_btl_smcuda_component.sm_max_procs > 0 &&
+       mca_btl_smcuda_component.num_smp_procs + n >
+       mca_btl_smcuda_component.sm_max_procs) {
+        return OMPI_ERROR;
+    }
+
+    mca_btl_smcuda_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_smcuda_component.sm_seg->module_data_addr;
+    mca_btl_smcuda_component.shm_bases = (char**)(mca_btl_smcuda_component.shm_fifo + n);
+    mca_btl_smcuda_component.shm_mem_nodes = (uint16_t*)(mca_btl_smcuda_component.shm_bases + n);
+
+    /* set the base of the shared memory segment */
+    mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank] =
+        (char*)mca_btl_smcuda_component.sm_mpool_base;
+    mca_btl_smcuda_component.shm_mem_nodes[mca_btl_smcuda_component.my_smp_rank] =
+        (uint16_t)my_mem_node;
+
+    /* initialize the array of fifo's "owned" by this process */
+    if(NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    mca_btl_smcuda_component.shm_fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;
+
+    /* cache the pointer to the 2d fifo array.  These addresses
+     * are valid in the current process space */
+    mca_btl_smcuda_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
+
+    if(NULL == mca_btl_smcuda_component.fifo)
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    mca_btl_smcuda_component.fifo[mca_btl_smcuda_component.my_smp_rank] = my_fifos;
+
+    mca_btl_smcuda_component.mem_nodes = (uint16_t *) malloc(sizeof(uint16_t) * n);
+    if(NULL == mca_btl_smcuda_component.mem_nodes)
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* initialize fragment descriptor free lists */
+
+    /* allocation will be for the fragment descriptor and payload buffer */
+    length = sizeof(mca_btl_smcuda_frag1_t);
+    length_payload =
+        sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit;
+    i = ompi_free_list_init_new(&mca_btl_smcuda_component.sm_frags_eager, length,
+                                opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag1_t),
+                                length_payload, opal_cache_line_size,
+                                mca_btl_smcuda_component.sm_free_list_num,
+                                mca_btl_smcuda_component.sm_free_list_max,
+                                mca_btl_smcuda_component.sm_free_list_inc,
+                                mca_btl_smcuda_component.sm_mpool);
+    if ( OMPI_SUCCESS != i )
+        return i;
+
+    length = sizeof(mca_btl_smcuda_frag2_t);
+    length_payload =
+        sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.max_frag_size;
+    i = ompi_free_list_init_new(&mca_btl_smcuda_component.sm_frags_max, length,
+                                opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_frag2_t),
+                                length_payload, opal_cache_line_size,
+                                mca_btl_smcuda_component.sm_free_list_num,
+                                mca_btl_smcuda_component.sm_free_list_max,
+                                mca_btl_smcuda_component.sm_free_list_inc,
+                                mca_btl_smcuda_component.sm_mpool);
+    if ( OMPI_SUCCESS != i )
+        return i;
+
+    i = ompi_free_list_init_new(&mca_btl_smcuda_component.sm_frags_user, 
+		    sizeof(mca_btl_smcuda_user_t),
+		    opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
+		    sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
+		    mca_btl_smcuda_component.sm_free_list_num,
+		    mca_btl_smcuda_component.sm_free_list_max,
+		    mca_btl_smcuda_component.sm_free_list_inc,
+		    mca_btl_smcuda_component.sm_mpool);
+    if ( OMPI_SUCCESS != i )
+	    return i;   
+
+    mca_btl_smcuda_component.num_outstanding_frags = 0;
+
+    mca_btl_smcuda_component.num_pending_sends = 0;
+    i = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
+                            sizeof(btl_smcuda_pending_send_item_t),
+                            OBJ_CLASS(opal_free_list_item_t),
+                            16, -1, 32);
+    if ( OMPI_SUCCESS != i )
+        return i;
+
+    /* set flag indicating btl has been inited */
+    smcuda_btl->btl_inited = true;
+
+    return OMPI_SUCCESS;
+}
+
+static struct mca_btl_base_endpoint_t *
+create_sm_endpoint(int local_proc, struct ompi_proc_t *proc)
+{
+    struct mca_btl_base_endpoint_t *ep;
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    char path[PATH_MAX];
+#endif
+
+    ep = (struct mca_btl_base_endpoint_t*)
+        malloc(sizeof(struct mca_btl_base_endpoint_t));
+    if(NULL == ep)
+        return NULL;
+    ep->peer_smp_rank = local_proc + mca_btl_smcuda_component.num_smp_procs;
+
+    OBJ_CONSTRUCT(&ep->pending_sends, opal_list_t);
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    sprintf(path, "%s"OPAL_PATH_SEP"sm_fifo.%lu",
+            orte_process_info.job_session_dir,
+            (unsigned long)proc->proc_name.vpid);
+    ep->fifo_fd = open(path, O_WRONLY);
+    if(ep->fifo_fd < 0) {
+        opal_output(0, "mca_btl_smcuda_add_procs: open(%s) failed with errno=%d\n",
+                    path, errno);
+        free(ep);
+        return NULL;
+    }
+#endif
+#if OMPI_CUDA_SUPPORT
+	mca_common_cuda_init_remote_cuda_mpool(&ep->mpool);
+#endif /* OMPI_CUDA_SUPPORT */
+    return ep;
+}
+
+static void calc_sm_max_procs(int n)
+{
+    /* see if we need to allocate space for extra procs */
+    if(0 > mca_btl_smcuda_component.sm_max_procs) {
+        /* no limit */
+        if(0 <= mca_btl_smcuda_component.sm_extra_procs) {
+            /* limit */
+            mca_btl_smcuda_component.sm_max_procs =
+                n + mca_btl_smcuda_component.sm_extra_procs;
+        } else {
+            /* no limit */
+            mca_btl_smcuda_component.sm_max_procs = 2 * n;
+        }
+    }
+}
+
+int mca_btl_smcuda_add_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t **peers,
+    opal_bitmap_t* reachability)
+{
+    int return_code = OMPI_SUCCESS;
+    int32_t n_local_procs = 0, proc, j, my_smp_rank = -1;
+    ompi_proc_t* my_proc; /* pointer to caller's proc structure */
+    mca_btl_smcuda_t *smcuda_btl;
+    bool have_connected_peer = false;
+    char **bases;
+    /* initialization */
+
+    smcuda_btl = (mca_btl_smcuda_t *)btl;
+
+    /* get pointer to my proc structure */
+    if(NULL == (my_proc = ompi_proc_local()))
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* Get unique host identifier for each process in the list,
+     * and identify procs that are on this host.  Add procs on this
+     * host to the shared memory reachability list.  Also, get the number
+     * of local procs in the procs list. */
+    for(proc = 0; proc < (int32_t)nprocs; proc++) {
+        /* check to see if this proc can be reached via shmem (i.e.,
+           if they're on my local host and in my job) */
+        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
+            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
+            peers[proc] = NULL;
+            continue;
+        }
+
+        /* check to see if this is me */
+        if(my_proc == procs[proc]) {
+            my_smp_rank = mca_btl_smcuda_component.my_smp_rank = n_local_procs++;
+            continue;
+        }
+
+        /* we have someone to talk to */
+        have_connected_peer = true;
+
+        if(!(peers[proc] = create_sm_endpoint(n_local_procs, procs[proc]))) {
+            return_code = OMPI_ERROR;
+            goto CLEANUP;
+        }
+        n_local_procs++;
+
+        /* add this proc to shared memory accessibility list */
+        return_code = opal_bitmap_set_bit(reachability, proc);
+        if(OMPI_SUCCESS != return_code)
+            goto CLEANUP;
+    }
+
+    /* jump out if there's not someone we can talk to */
+    if (!have_connected_peer)
+        goto CLEANUP;
+
+    /* make sure that my_smp_rank has been defined */
+    if(-1 == my_smp_rank) {
+        return_code = OMPI_ERROR;
+        goto CLEANUP;
+    }
+
+    calc_sm_max_procs(n_local_procs);
+
+    if (!smcuda_btl->btl_inited) {
+        return_code =
+            smcuda_btl_first_time_init(smcuda_btl, mca_btl_smcuda_component.sm_max_procs);
+        if(return_code != OMPI_SUCCESS)
+            goto CLEANUP;
+    }
+
+    /* set local proc's smp rank in the peers structure for
+     * rapid access and calculate reachability */
+    for(proc = 0; proc < (int32_t)nprocs; proc++) {
+        if(NULL == peers[proc])
+            continue;
+        mca_btl_smcuda_component.sm_peers[peers[proc]->peer_smp_rank] = peers[proc];
+        peers[proc]->my_smp_rank = my_smp_rank;
+    }
+
+    bases = mca_btl_smcuda_component.shm_bases;
+
+    /* initialize own FIFOs */
+    /*
+     * The receiver initializes all its FIFOs.  All components will
+     * be allocated near the receiver.  Nothing will be local to
+     * "the sender" since there will be many senders.
+     */
+    for(j = mca_btl_smcuda_component.num_smp_procs;
+        j < mca_btl_smcuda_component.num_smp_procs + FIFO_MAP_NUM(n_local_procs); j++) {
+
+        return_code = sm_fifo_init( mca_btl_smcuda_component.fifo_size,
+                                    mca_btl_smcuda_component.sm_mpool,
+                                   &mca_btl_smcuda_component.fifo[my_smp_rank][j],
+                                    mca_btl_smcuda_component.fifo_lazy_free);
+        if(return_code != OMPI_SUCCESS)
+            goto CLEANUP;
+    }
+
+    opal_atomic_wmb();
+
+    /* Sync with other local procs. Force the FIFO initialization to always
+     * happen before the readers access it.
+     */
+    opal_atomic_add_32( &mca_btl_smcuda_component.sm_seg->module_seg->seg_inited, 1);
+    while( n_local_procs >
+           mca_btl_smcuda_component.sm_seg->module_seg->seg_inited) {
+        opal_progress();
+        opal_atomic_rmb();
+    }
+
+    /* coordinate with other processes */
+    for(j = mca_btl_smcuda_component.num_smp_procs;
+        j < mca_btl_smcuda_component.num_smp_procs + n_local_procs; j++) {
+        ptrdiff_t diff;
+
+        /* spin until this element is allocated */
+        /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
+        opal_atomic_rmb();
+        while(NULL == mca_btl_smcuda_component.shm_fifo[j]) {
+            opal_progress();
+            opal_atomic_rmb();
+        }
+
+        /* Calculate the difference as (my_base - their_base) */
+        diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);
+
+        /* store local address of remote fifos */
+        mca_btl_smcuda_component.fifo[j] =
+            (sm_fifo_t*)OFFSET2ADDR(diff, mca_btl_smcuda_component.shm_fifo[j]);
+
+        /* cache local copy of peer memory node number */
+        mca_btl_smcuda_component.mem_nodes[j] = mca_btl_smcuda_component.shm_mem_nodes[j];
+    }
+
+    /* update the local smp process count */
+    mca_btl_smcuda_component.num_smp_procs += n_local_procs;
+
+    /* make sure we have enough eager fragments for each process */
+    return_code = ompi_free_list_resize(&mca_btl_smcuda_component.sm_frags_eager,
+                                        mca_btl_smcuda_component.num_smp_procs * 2);
+    if (OMPI_SUCCESS != return_code)
+        goto CLEANUP;
+
+CLEANUP:
+    return return_code;
+}
+
+int mca_btl_smcuda_del_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t **peers)
+{
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * MCA->BTL Clean up any resources held by BTL module
+ * before the module is unloaded.
+ *
+ * @param btl (IN)   BTL module.
+ *
+ * Prior to unloading a BTL module, the MCA framework will call
+ * the BTL finalize method of the module. Any resources held by
+ * the BTL should be released and if required the memory corresponding
+ * to the BTL module freed.
+ *
+ */
+
+int mca_btl_smcuda_finalize(struct mca_btl_base_module_t* btl)
+{
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * Register callback function for error handling.
+ */
+int mca_btl_smcuda_register_error_cb(
+        struct mca_btl_base_module_t* btl,
+        mca_btl_base_module_error_cb_fn_t cbfunc)
+{
+    mca_btl_smcuda_t *smcuda_btl = (mca_btl_smcuda_t *)btl;
+    smcuda_btl->error_cb = cbfunc;
+    return OMPI_SUCCESS;
+}
+
+/**
+ * Allocate a segment.
+ *
+ * @param btl (IN)      BTL module
+ * @param size (IN)     Request segment size.
+ */
+extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    uint8_t order,
+    size_t size,
+    uint32_t flags)
+{
+    mca_btl_smcuda_frag_t* frag = NULL;
+    int rc;
+    if(size <= mca_btl_smcuda_component.eager_limit) {
+        MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag,rc);
+    } else if (size <= mca_btl_smcuda_component.max_frag_size) {
+        MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag,rc);
+    }
+
+    if (OPAL_LIKELY(frag != NULL)) {
+        frag->segment.seg_len = size;
+        frag->base.des_flags = flags;
+    }
+    return (mca_btl_base_descriptor_t*)frag;
+}
+
+/**
+ * Return a segment allocated by this BTL.
+ *
+ * @param btl (IN)      BTL module
+ * @param segment (IN)  Allocated segment.
+ */
+extern int mca_btl_smcuda_free(
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_descriptor_t* des)
+{
+    mca_btl_smcuda_frag_t* frag = (mca_btl_smcuda_frag_t*)des;
+#if OMPI_CUDA_SUPPORT
+#if 0
+    if(frag->registration != NULL) {
+        btl->btl_rcuda_mpool->mpool_deregister(btl->btl_rcuda_mpool,
+                                               (mca_mpool_base_registration_t*)frag->registration);
+        frag->registration = NULL;
+    }
+#endif
+#endif /* OMPI_CUDA_SUPPORT */
+    MCA_BTL_SMCUDA_FRAG_RETURN(frag);
+
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * Pack data
+ *
+ * @param btl (IN)      BTL module
+ */
+struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    mca_mpool_base_registration_t* registration,
+    struct opal_convertor_t* convertor,
+    uint8_t order,
+    size_t reserve,
+    size_t* size,
+    uint32_t flags)
+{
+    mca_btl_smcuda_frag_t* frag;
+    struct iovec iov;
+    uint32_t iov_count = 1;
+    size_t max_data = *size;
+    int rc;
+#if OMPI_CUDA_SUPPORT
+    if (0 != reserve) {
+#endif /* OMPI_CUDA_SUPPORT */
+        if ( reserve + max_data <= mca_btl_smcuda_component.eager_limit ) {
+            MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag,rc);
+        } else {
+            MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag, rc);
+        }
+        if( OPAL_UNLIKELY(NULL == frag) ) {
+            return NULL;
+        }
+
+        if( OPAL_UNLIKELY(reserve + max_data > frag->size) ) {
+            max_data = frag->size - reserve;
+        }
+        iov.iov_len = max_data;
+        iov.iov_base =
+            (IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) +
+                            reserve);
+
+        rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
+        if( OPAL_UNLIKELY(rc < 0) ) {
+            MCA_BTL_SMCUDA_FRAG_RETURN(frag);
+            return NULL;
+        }
+        frag->segment.seg_len = reserve + max_data;
+#if OMPI_CUDA_SUPPORT
+    } else {
+        /* Normally, we are here because we have a GPU buffer and we are preparing
+         * to send it.  However, we can also end up here after receiving a
+         * PUT message while trying to send a host buffer.  Therefore, we need
+         * to check again that the buffer really is a GPU buffer and return
+         * NULL if it is not.  We can just check the convertor since we have it. */
+        if (!(convertor->flags & CONVERTOR_CUDA)) {
+            return NULL;
+        }
+
+        MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag, rc);
+        if( OPAL_UNLIKELY(NULL == frag) ) {
+            return NULL;
+        }
+        iov.iov_len = max_data;
+        iov.iov_base = NULL;
+        rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
+        if( OPAL_UNLIKELY(rc < 0) ) {
+           MCA_BTL_SMCUDA_FRAG_RETURN(frag);
+            return NULL;
+        }
+        frag->segment.seg_addr.pval = iov.iov_base;
+        frag->segment.seg_len = max_data;
+        memcpy(frag->segment.seg_key.cudakey, ((mca_mpool_rcuda_reg_t *)registration)->memHandle,
+               sizeof(((mca_mpool_rcuda_reg_t *)registration)->memHandle) + 
+			   sizeof(((mca_mpool_rcuda_reg_t *)registration)->evtHandle));
+		frag->segment.memh_seg_addr.pval = registration->base;
+		frag->segment.memh_seg_len = registration->bound - registration->base + 1;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+    frag->base.des_src = &(frag->segment);
+    frag->base.des_src_cnt = 1;
+    frag->base.order = MCA_BTL_NO_ORDER;
+    frag->base.des_dst = NULL;
+    frag->base.des_dst_cnt = 0;
+    frag->base.des_flags = flags;
+    *size = max_data;
+    return &frag->base;
+}
+
+#if 0
+#define MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)          \
+    do {                                                                \
+        char* _memory = (char*)(sm_frag)->segment.seg_addr.pval +       \
+            (sm_frag)->segment.seg_len;                                 \
+        int* _intmem;                                                   \
+        size_t align = (intptr_t)_memory & 0xFUL;                       \
+        switch( align & 0x3 ) {                                         \
+        case 3: *_memory = 0; _memory++;                                \
+        case 2: *_memory = 0; _memory++;                                \
+        case 1: *_memory = 0; _memory++;                                \
+        }                                                               \
+        align >>= 2;                                                    \
+        _intmem = (int*)_memory;                                        \
+        switch( align ) {                                               \
+        case 3: *_intmem = 0; _intmem++;                                \
+        case 2: *_intmem = 0; _intmem++;                                \
+        case 1: *_intmem = 0; _intmem++;                                \
+        }                                                               \
+    } while(0)
+#else
+#define MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag)
+#endif
+
+#if 0
+        if( OPAL_LIKELY(align > 0) ) {                                  \
+            align = 0xFUL - align;                                      \
+            memset( _memory, 0, align );                                \
+        }                                                               \
+
+#endif
+
+/**
+ * Initiate an inline send to the peer. If failure then return a descriptor.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
+                      struct mca_btl_base_endpoint_t* endpoint,
+                      struct opal_convertor_t* convertor,
+                      void* header,
+                      size_t header_size,
+                      size_t payload_size,
+                      uint8_t order,
+                      uint32_t flags,
+                      mca_btl_base_tag_t tag,
+                      mca_btl_base_descriptor_t** descriptor )
+{
+    size_t length = (header_size + payload_size);
+    mca_btl_smcuda_frag_t* frag;
+    int rc;
+
+    if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
+        mca_btl_smcuda_component_progress();
+    }
+
+    /* this check should be unnecessary... turn into an assertion? */
+    if( length < mca_btl_smcuda_component.eager_limit ) {
+
+        /* allocate a fragment, giving up if we can't get one */
+        /* note that frag==NULL is equivalent to rc returning an error code */
+        MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag, rc);
+        if( OPAL_UNLIKELY(NULL == frag) ) {
+            *descriptor = NULL;
+            return rc;
+        }
+
+        /* fill in fragment fields */
+        frag->segment.seg_len = length;
+        frag->hdr->len        = length;
+        assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) );
+        frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;   /* why do any flags matter here other than OWNERSHIP? */
+        frag->hdr->tag = tag;
+        frag->endpoint = endpoint;
+
+        /* write the match header (with MPI comm/tag/etc. info) */
+        memcpy( frag->segment.seg_addr.pval, header, header_size );
+
+        /* write the message data if there is any */
+        /*
+          We can add MEMCHECKER calls before and after the packing.
+        */
+        if( payload_size ) {
+            size_t max_data;
+            struct iovec iov;
+            uint32_t iov_count;
+            /* pack the data into the supplied buffer */
+            iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size);
+            iov.iov_len  = max_data = payload_size;
+            iov_count    = 1;
+
+            (void)opal_convertor_pack( convertor, &iov, &iov_count, &max_data);
+
+            assert(max_data == payload_size);
+        }
+
+        MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
+
+        /* write the fragment pointer to the FIFO */
+        /*
+         * Note that we don't care what the FIFO-write return code is.  Even if
+         * the return code indicates failure, the write has still "completed" from
+         * our point of view:  it has been posted to a "pending send" queue.
+         */
+        OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
+        MCA_BTL_SMCUDA_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
+                              endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
+        return OMPI_SUCCESS;
+    }
+
+    /* presumably, this code path will never get executed */
+    *descriptor = mca_btl_smcuda_alloc( btl, endpoint, order,
+                                    payload_size + header_size, flags);
+    return OMPI_ERR_RESOURCE_BUSY;
+}
+
+/**
+ * Initiate a send to the peer.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+int mca_btl_smcuda_send( struct mca_btl_base_module_t* btl,
+                     struct mca_btl_base_endpoint_t* endpoint,
+                     struct mca_btl_base_descriptor_t* descriptor,
+                     mca_btl_base_tag_t tag )
+{
+    mca_btl_smcuda_frag_t* frag = (mca_btl_smcuda_frag_t*)descriptor;
+    int rc;
+
+    if ( mca_btl_smcuda_component.num_outstanding_frags * 2 > (int) mca_btl_smcuda_component.fifo_size ) {
+        mca_btl_smcuda_component_progress();
+    }
+
+    /* available header space */
+    frag->hdr->len = frag->segment.seg_len;
+    /* type of message, pt-2-pt, one-sided, etc */
+    frag->hdr->tag = tag;
+
+    MCA_BTL_SMCUDA_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag);
+
+    frag->endpoint = endpoint;
+
+    /*
+     * post the descriptor in the queue - post with the relative
+     * address
+     */
+    OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, +1);
+    MCA_BTL_SMCUDA_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
+                          endpoint->peer_smp_rank, (void *) VIRTUAL2RELATIVE(frag->hdr), false, true, rc);
+    if( OPAL_LIKELY(0 == rc) ) {
+        return 1;  /* the data is completely gone */
+    }
+    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+    /* not yet gone, but pending. Let the upper level know that
+     * the callback will be triggered when the data has been sent.
+     */
+    return 0;
+}
+#if OMPI_CUDA_SUPPORT
+struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst( 
+        struct mca_btl_base_module_t* btl,
+        struct mca_btl_base_endpoint_t* endpoint,
+        struct mca_mpool_base_registration_t* registration,
+        struct opal_convertor_t* convertor,
+        uint8_t order,
+        size_t reserve,
+        size_t* size,
+        uint32_t flags)
+{
+    int rc;
+    mca_btl_smcuda_frag_t* frag;
+
+	/* Only support GPU buffers */
+	if (!(convertor->flags & CONVERTOR_CUDA)) {
+		return NULL;
+	}
+
+    MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag, rc);
+    if(OPAL_UNLIKELY(NULL == frag)) {
+        return NULL;
+    }
+    
+    frag->segment.seg_len = *size;
+    opal_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
+
+    frag->base.des_src = NULL;
+    frag->base.des_src_cnt = 0;
+    frag->base.des_dst = &frag->segment;
+    frag->base.des_dst_cnt = 1;
+    frag->base.des_flags = flags;
+    return &frag->base;
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
+
+#if OMPI_CUDA_SUPPORT
+int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
+                        struct mca_btl_base_endpoint_t* ep,
+                        struct mca_btl_base_descriptor_t* descriptor)
+{
+    mca_mpool_rcuda_reg_t rget_reg;
+    mca_mpool_rcuda_reg_t *reg_ptr = &rget_reg;
+    int btl_ownership;
+    int rc, done;
+    void *remote_memory_address;
+    size_t offset;
+    mca_btl_smcuda_frag_t* frag = (mca_btl_smcuda_frag_t*)descriptor;
+ 
+    /* Set to 0 for debugging since it is a list item but I am not
+     * initializing it properly and it is annoying to see all the
+     * garbage in the debugger.  */
+    memset(&rget_reg, 0, sizeof(rget_reg));
+    memcpy(&rget_reg.memHandle, descriptor->des_src->seg_key.cudakey,
+           sizeof(descriptor->des_src->seg_key.cudakey));
+
+	/* Open the memory handle to the remote memory.  If it is cached, then
+	 * we just retrieve it from cache and avoid a call to open the handle.  That
+	 * is taken care of in the memory pool.  Note that we are searching for the
+	 * memory based on the base address and size of the memory handle, not the
+	 * remote memory which may lie somewhere in the middle. This is taken care of
+	 * a few lines down. */
+    rc = ep->mpool->mpool_register(ep->mpool, descriptor->des_src->memh_seg_addr.pval,
+                                   descriptor->des_src->memh_seg_len, ep->peer_smp_rank,
+                                   (mca_mpool_base_registration_t **)&reg_ptr);
+
+    if (OMPI_SUCCESS != rc) {
+        opal_output(0, "Failed to register remote memory, rc=%d", rc);
+        return rc;
+    }
+    frag->registration = (mca_mpool_base_registration_t *)reg_ptr;
+    frag->endpoint = ep;
+
+    /* The registration has given us back the memory block that this
+     * address lives in.  However, the base address of the block may
+     * not equal the address that was used to retrieve the block.
+     * Therefore, compute the offset and add it to the address of the
+     * memory handle. */
+    offset = (unsigned char *)descriptor->des_src->seg_addr.pval - reg_ptr->base.base;
+    remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset;
+    if (0 != offset) {
+		opal_output(0, "OFFSET=%d", (int)offset);
+    }
+
+	/* The remote side posted an IPC event to make sure we do not start our
+	 * copy until IPC event completes.  This is to ensure that the data being sent
+	 * is available in the sender's GPU buffer.  Therefore, do a stream synchronize
+	 * on the IPC event that we received.  Note that we pull it from 
+	 * rget_reg, not reg_ptr, as we do not cache the event. */
+    mca_common_wait_stream_synchronize(&rget_reg);
+
+    rc = mca_common_cuda_memcpy(descriptor->des_dst->seg_addr.pval, remote_memory_address,
+                                descriptor->des_dst->seg_len, "mca_btl_smcuda_get",
+                                (mca_btl_base_descriptor_t *)frag, &done);
+    if (OMPI_SUCCESS != rc) {
+        /* Out of resources can be handled by upper layers. */
+        if (OMPI_ERR_OUT_OF_RESOURCE != rc) {
+			opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc);
+		}
+        return rc;
+    }
+
+    if (OPAL_UNLIKELY(1 == done)) {
+        /* This should only be true when experimenting with synchronous copies. */
+        btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+        if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
+            frag->base.des_cbfunc(&mca_btl_smcuda.super, 
+                                  frag->endpoint, &frag->base, 
+                                  OMPI_SUCCESS);
+        }
+
+        if (btl_ownership) {
+            mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag);
+        }
+    }
+
+    return OMPI_SUCCESS;
+
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
+#if OPAL_ENABLE_FT_CR    == 0
+int mca_btl_smcuda_ft_event(int state) {
+    return OMPI_SUCCESS;
+}
+#else
+int mca_btl_smcuda_ft_event(int state) {
+    /* Notify mpool */
+    if( NULL != mca_btl_smcuda_component.sm_mpool &&
+        NULL != mca_btl_smcuda_component.sm_mpool->mpool_ft_event) {
+        mca_btl_smcuda_component.sm_mpool->mpool_ft_event(state);
+    }
+
+    if(OPAL_CRS_CHECKPOINT == state) {
+        if( NULL != mca_btl_smcuda_component.sm_seg ) {
+            /* On restart we need the old file names to exist (not necessarily
+             * contain content) so the CRS component does not fail when searching
+             * for these old file handles. The restart procedure will make sure
+             * these files get cleaned up appropriately.
+             */
+            orte_sstore.set_attr(orte_sstore_handle_current,
+                                 SSTORE_METADATA_LOCAL_TOUCH,
+                                 mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
+        }
+    }
+    else if(OPAL_CRS_CONTINUE == state) {
+        if( orte_cr_continue_like_restart ) {
+            if( NULL != mca_btl_smcuda_component.sm_seg ) {
+                /* Add shared memory file */
+                opal_crs_base_cleanup_append(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name, false);
+            }
+
+            /* Clear this so we force the module to re-init the sm files */
+            mca_btl_smcuda_component.sm_mpool = NULL;
+        }
+    }
+    else if(OPAL_CRS_RESTART == state ||
+            OPAL_CRS_RESTART_PRE == state) {
+        if( NULL != mca_btl_smcuda_component.sm_seg ) {
+            /* Add shared memory file */
+            opal_crs_base_cleanup_append(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name, false);
+        }
+
+        /* Clear this so we force the module to re-init the sm files */
+        mca_btl_smcuda_component.sm_mpool = NULL;
+    }
+    else if(OPAL_CRS_TERM == state ) {
+        ;
+    }
+    else {
+        ;
+    }
+
+    return OMPI_SUCCESS;
+}
+#endif /* OPAL_ENABLE_FT_CR */
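
The handle/offset logic in mca_btl_smcuda_get_cuda pairs with prepare_src above: the sender packs its CUDA memory and event handles into seg_key.cudakey along with the base and length of the registration, and the receiver opens the memory handle and adds the offset of the message buffer, which may lie somewhere in the middle of the exported allocation. A standalone sketch of that base-plus-offset pattern (illustrative only, not part of this commit; it uses the raw CUDA runtime IPC calls instead of the common_cuda wrappers and their handle cache):

    /* Illustrative sketch, not part of this commit. */
    #include <cuda_runtime.h>

    /* Exporter: create a handle for the allocation containing the buffer. */
    static void export_allocation(void *alloc_base, cudaIpcMemHandle_t *handle)
    {
        cudaIpcGetMemHandle(handle, alloc_base);
    }

    /* Importer: open the handle, then recover the buffer address with the
     * same base-plus-offset arithmetic used in mca_btl_smcuda_get_cuda. */
    static void *import_buffer(cudaIpcMemHandle_t handle,
                               char *remote_alloc_base, char *remote_buf)
    {
        void *local_base = NULL;
        size_t offset = (size_t)(remote_buf - remote_alloc_base);

        cudaIpcOpenMemHandle(&local_base, handle, cudaIpcMemLazyEnablePeerAccess);
        return (char *)local_base + offset;
    }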

ompi/mca/btl/smcuda/btl_smcuda.h

+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010      Los Alamos National Security, LLC.  
+ *                         All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_BTL_SMCUDA_H
+#define MCA_BTL_SMCUDA_H
+
+#include "ompi_config.h"
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif  /* HAVE_STDINT_H */
+#ifdef HAVE_SCHED_H
+#include <sched.h>
+#endif  /* HAVE_SCHED_H */
+
+#include "opal/util/bit_ops.h"
+#include "opal/class/opal_free_list.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/common/sm/common_sm.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Shared Memory FIFOs
+ *
+ * The FIFO is implemented as a circular queue with head and tail pointers
+ * (integer indices).  For efficient wraparound indexing, the size of the
+ * queue is constrained to be a power of two and we "&" indices with a "mask".
+ *
+ * More than one process can write to the FIFO head.  Therefore, there is a head
+ * lock.  One cannot write until the head slot is empty, indicated by the special
+ * queue entry SM_FIFO_FREE.
+ *
+ * Only the receiver can read the FIFO tail.  Therefore, the tail lock is
+ * required only in multithreaded applications.  If a tail read returns the
+ * SM_FIFO_FREE value, that means the FIFO is empty.  Once a non-FREE value
+ * has been read, the queue slot is *not* automatically reset to SM_FIFO_FREE.
+ * Rather, read tail slots are reset "lazily" (see "lazy_free" and "num_to_clear")
+ * to reduce the number of memory barriers and improve performance.
+ *
+ * Since the FIFO lives in shared memory that is mapped differently into
+ * each address space, the "queue" pointer is relative (each process must
+ * add its own offset) and the queue_recv pointer is meaningful only in the
+ * receiver's address space.
+ *
+ * Since multiple processes access different parts of the FIFO structure in
+ * different ways, we introduce padding to keep different parts on different
+ * cachelines.
+ */
+
+#define SM_FIFO_FREE  (void *) (-2)
+/* We can't use opal_cache_line_size here because we need a
+   compile-time constant for padding the struct.  We can't really have
+   a compile-time constant that is portable, either (e.g., compile on
+   one machine and run on another).  So just use a big enough cache
+   line that should hopefully be good in most places. */
+#define SM_CACHE_LINE_PAD 128
+
+struct sm_fifo_t {
+    /* This queue pointer is used only by the heads. */
+    volatile void **queue;           
+    char pad0[SM_CACHE_LINE_PAD - sizeof(void **)];
+    /* This lock is used by the heads. */
+    opal_atomic_lock_t head_lock;    
+    char pad1[SM_CACHE_LINE_PAD - sizeof(opal_atomic_lock_t)];
+    /* This index is used by the head holding the head lock. */
+    volatile int head;               
+    char pad2[SM_CACHE_LINE_PAD - sizeof(int)];
+    /* This mask is used "read only" by all processes. */
+    unsigned int mask;               
+    char pad3[SM_CACHE_LINE_PAD - sizeof(int)];
+    /* The following are used only by the tail. */
+    volatile void **queue_recv;
+    opal_atomic_lock_t tail_lock;
+    volatile int tail;
+    int num_to_clear;
+    int lazy_free;                   
+    char pad4[SM_CACHE_LINE_PAD - sizeof(void **) -
+              sizeof(opal_atomic_lock_t) -
+              sizeof(int) * 3];
+};
+typedef struct sm_fifo_t sm_fifo_t;
+
+/*
+ * Shared Memory resource management
+ */
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+#define DATA (char)0
+#define DONE (char)1
+#endif
+
+typedef struct mca_btl_smcuda_mem_node_t {
+    mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
+} mca_btl_smcuda_mem_node_t;
+
+/**
+ * Shared Memory (SM) BTL module.
+ */
+struct mca_btl_smcuda_component_t {
+    mca_btl_base_component_2_0_0_t super;  /**< base BTL component */
+    int sm_free_list_num;              /**< initial size of free lists */
+    int sm_free_list_max;              /**< maximum size of free lists */
+    int sm_free_list_inc;              /**< number of elements to alloc when growing free lists */
+    int32_t sm_max_procs;              /**< upper limit on the number of processes using the shared memory pool */
+    int sm_extra_procs;                /**< number of extra procs to allow */
+    char* sm_mpool_name;               /**< name of shared memory pool module */
+    mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node) */
+    mca_mpool_base_module_t *sm_mpool; /**< mpool on local node */
+    void* sm_mpool_base;               /**< base address of shared memory pool */
+    size_t eager_limit;                /**< first fragment size */
+    size_t max_frag_size;              /**< maximum (second and beyond) fragment size */
+    opal_mutex_t sm_lock;
+    mca_common_sm_module_t *sm_seg;   /**< description of shared memory segment */
+    volatile sm_fifo_t **shm_fifo;     /**< pointer to fifo 2D array in shared memory */
+    char **shm_bases;                  /**< pointer to base pointers in shared memory */
+    uint16_t *shm_mem_nodes;           /**< pointer to mem nodes in shared memory */
+    sm_fifo_t **fifo;                  /**< cached copy of the pointer to the 2D
+                                          fifo array.  The address in the shared
+                                          memory segment sm_ctl_header is relative,
+                                          but this one, in process-private memory, is
+                                          a real virtual address */
+    uint16_t *mem_nodes;               /**< cached copy of mem nodes of each local rank */
+    size_t fifo_size;                  /**< number of FIFO queue entries */
+    size_t fifo_lazy_free;             /**< number of reads before lazy fifo free is triggered */
+    int nfifos;                        /**< number of FIFOs per receiver */
+    int32_t num_smp_procs;             /**< current number of smp procs on this host */
+    int32_t my_smp_rank;               /**< My SMP process rank.  Used for accessing
+                                        *   SMP specific data structures. */
+    ompi_free_list_t sm_frags_eager;   /**< free list of sm first (eager) fragments */
+    ompi_free_list_t sm_frags_max;     /**< free list of sm second (max-size) fragments */
+    ompi_free_list_t sm_frags_user;
+    ompi_free_list_t sm_first_frags_to_progress;  /**< list of first
+                                                    fragments that are
+                                                    awaiting resources */
+    struct mca_btl_base_endpoint_t **sm_peers;
+
+    opal_free_list_t pending_send_fl;
+    int num_outstanding_frags;         /**< number of fragments sent but not yet returned to free list */
+    int num_pending_sends;             /**< total number on all of my pending-send queues */
+    int mem_node;
+    int num_mem_nodes;
+    
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    char sm_fifo_path[PATH_MAX];   /**< path to fifo used to signal this process */
+    int  sm_fifo_fd;               /**< file descriptor corresponding to opened fifo */
+    opal_thread_t sm_fifo_thread;
+#endif
+    struct mca_btl_smcuda_t      **sm_btls;
+    struct mca_btl_smcuda_frag_t **table;
+    size_t sm_num_btls;
+    size_t sm_max_btls;
+
+
+    /** MCA: should we be using knem or not?  neg=try but continue if
+        not available, 0=don't try, 1=try and fail if not available */
+    int use_knem;
+
+    /** MCA: minimal message size (bytes) to offload on DMA engine
+        when using knem */
+    uint32_t knem_dma_min;
+
+    /** MCA: how many simultaneous ongoing knem operations to
+        support */
+    int knem_max_simultaneous;
+
+    /** If we want DMA and DMA is supported, this will be loaded with
+        KNEM_FLAG_DMA.  Otherwise, it'll be 0. */
+    int knem_dma_flag;
+#if OMPI_CUDA_SUPPORT
+    /** MCA: should we be using CUDA RDMA or not?  neg=try but continue if
+        not available, 0=don't try, 1=try and fail if not available */
+    int use_cuda_rdma;
+#endif /* OMPI_CUDA_SUPPORT */
+};
+typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
+OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
+
+/**
+ * SM BTL Interface
+ */
+struct mca_btl_smcuda_t {
+    mca_btl_base_module_t  super;       /**< base BTL interface */
+    bool btl_inited;  /**< flag indicating if btl has been inited */
+    mca_btl_base_module_error_cb_fn_t error_cb;
+
+};
+typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
+OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;
+
+
+
+
+
+struct btl_smcuda_pending_send_item_t
+{
+    opal_free_list_item_t super;
+    void *data;
+};
+typedef struct btl_smcuda_pending_send_item_t btl_smcuda_pending_send_item_t;
+
+/***
+ * FIFO support for sm BTL.
+ */
+
+/***
+ * Some of the FIFO fields are pointers that must be
+ * accessed by multiple processes.  Since the shared region may
+ * be mmapped differently into each process's address space,
+ * these pointers will be relative to some base address.  Here,
+ * we define macros to translate between relative addresses and
+ * virtual addresses.
+ */
+#define VIRTUAL2RELATIVE(VADDR ) ((long)(VADDR)  - (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
+#define RELATIVE2VIRTUAL(OFFSET) ((long)(OFFSET) + (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
+
+static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
+                               sm_fifo_t *fifo, int lazy_free)
+{
+    int i, qsize;
+
+    /* figure out the queue size (a power of two that is at least 1) */
+    qsize = opal_next_poweroftwo_inclusive (fifo_size);
+
+    /* allocate the queue in the receiver's address space */
+    fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
+            mpool, sizeof(void *) * qsize, opal_cache_line_size, 0, NULL);
+    if(NULL == fifo->queue_recv) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* initialize the queue */
+    for ( i = 0; i < qsize; i++ )
+        fifo->queue_recv[i] = SM_FIFO_FREE;
+
+    /* shift queue address to be relative */
+    fifo->queue = (volatile void **) VIRTUAL2RELATIVE(fifo->queue_recv);
+
+    /* initialize the locks */
+    opal_atomic_init(&(fifo->head_lock), OPAL_ATOMIC_UNLOCKED);
+    opal_atomic_init(&(fifo->tail_lock), OPAL_ATOMIC_UNLOCKED);
+    opal_atomic_unlock(&(fifo->head_lock));  /* should be unnecessary */
+    opal_atomic_unlock(&(fifo->tail_lock));  /* should be unnecessary */
+
+    /* other initializations */
+    fifo->head = 0;
+    fifo->mask = qsize - 1;
+    fifo->tail = 0;
+    fifo->num_to_clear = 0;
+    fifo->lazy_free = lazy_free;
+
+    return OMPI_SUCCESS;
+}
+
+
+static inline int sm_fifo_write(void *value, sm_fifo_t *fifo)
+{
+    volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo->queue);
+
+    /* if there is no free slot to write, report exhausted resource */
+    opal_atomic_rmb();
+    if ( SM_FIFO_FREE != q[fifo->head] )
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* otherwise, write to the slot and advance the head index */
+    q[fifo->head] = value;
+    opal_atomic_wmb();
+    fifo->head = (fifo->head + 1) & fifo->mask;
+    return OMPI_SUCCESS;
+}
+
+
+static inline void *sm_fifo_read(sm_fifo_t *fifo)
+{
+    void *value;
+
+    /* read the next queue entry */
+    value = (void *) fifo->queue_recv[fifo->tail];
+
+    opal_atomic_rmb();
+
+    /* if a non-empty slot was read, advance the tail index */
+    if ( SM_FIFO_FREE != value ) {
+
+        fifo->tail = ( fifo->tail + 1 ) & fifo->mask;
+        fifo->num_to_clear += 1;
+
+        /* check if it's time to free slots, which we do lazily */
+        if ( fifo->num_to_clear >= fifo->lazy_free ) {
+            int i = (fifo->tail - fifo->num_to_clear ) & fifo->mask;
+
+            while ( fifo->num_to_clear > 0 ) {
+                fifo->queue_recv[i] = SM_FIFO_FREE;
+                i = (i+1) & fifo->mask;
+                fifo->num_to_clear -= 1;
+            }
+            opal_atomic_wmb();
+        }
+    }
+
+    return value;
+}
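+
+/*
+ * Usage sketch (illustrative only; the real callers pass fragment
+ * pointers and handle a full FIFO by queueing the send for a later
+ * retry from component progress):
+ *
+ *   sender:    rc = sm_fifo_write(item, fifo);
+ *              if (OMPI_ERR_OUT_OF_RESOURCE == rc)
+ *                  ... defer the send and retry later ...
+ *
+ *   receiver:  item = sm_fifo_read(fifo);
+ *              if (SM_FIFO_FREE != item)
+ *                  ... process item ...
+ */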
+
+/**
+ * shared memory component progress.
+ */
+extern int mca_btl_smcuda_component_progress(void);
+
+
+
+/**
+ * Register a callback function that is called on error.
+ *
+ * @param btl (IN)     BTL module
+ * @param cbfunc (IN)  Error callback function
+ * @return             Status indicating if registration was successful
+ */
+
+int mca_btl_smcuda_register_error_cb(
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_module_error_cb_fn_t cbfunc
+);
+
+/**
+ * Cleanup any resources held by the BTL.
+ *
+ * @param btl  BTL instance.
+ * @return     OMPI_SUCCESS or error status on failure.
+ */
+
+extern int mca_btl_smcuda_finalize(
+    struct mca_btl_base_module_t* btl
+);
+
+
+/**
+ * PML->BTL notification of a change in the process list.
+ * Sets up shared-memory connectivity to the on-node peers, taking into
+ * account that the shared segment may be mapped at a different virtual
+ * address in each process.
+ *
+ * @param btl (IN)           BTL module
+ * @param nprocs (IN)        Number of processes
+ * @param procs (IN)         Array of peer processes
+ * @param peers (OUT)        Array of peer endpoint structures
+ * @param reachability (OUT) Bitmap marking which peers are reachable
+ * @return                   OMPI_SUCCESS or error status on failure.
+ *
+ */
+
+extern int mca_btl_smcuda_add_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t** peers,
+    struct opal_bitmap_t* reachability
+);
+
+
+/**
+ * PML->BTL notification of change in the process list.
+ *
+ * @param btl (IN)     BTL instance
+ * @param nprocs (IN)  Number of processes
+ * @param procs (IN)   Array of peer processes
+ * @param peers (IN)   Array of peer addressing information.
+ * @return             Status indicating if cleanup was successful
+ *
+ */
+extern int mca_btl_smcuda_del_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t **peers
+);
+
+
+/**
+ * Allocate a segment.
+ *
+ * @param btl (IN)      BTL module
+ * @param size (IN)     Requested segment size.
+ */
+extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    uint8_t order,
+    size_t size,
+    uint32_t flags
+);
+
+/**
+ * Return a segment allocated by this BTL.
+ *
+ * @param btl (IN)      BTL module
+ * @param segment (IN)  Allocated segment.
+ */
+extern int mca_btl_smcuda_free(
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_descriptor_t* segment
+);
+
+
+/**
+ * Pack data
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    mca_mpool_base_registration_t* registration,
+    struct opal_convertor_t* convertor,
+    uint8_t order,
+    size_t reserve,
+    size_t* size,
+    uint32_t flags
+);
+
+
+/**
+ * Initiate an inlined send to the peer or return a descriptor.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+extern int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
+                             struct mca_btl_base_endpoint_t* endpoint,
+                             struct opal_convertor_t* convertor,
+                             void* header,
+                             size_t header_size,
+                             size_t payload_size,
+                             uint8_t order,
+                             uint32_t flags,
+                             mca_btl_base_tag_t tag,
+                             mca_btl_base_descriptor_t** descriptor );
+
+/**
+ * Initiate a send to the peer.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+extern int mca_btl_smcuda_send(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    struct mca_btl_base_descriptor_t* descriptor,
+    mca_btl_base_tag_t tag
+);
+
+#if OMPI_CUDA_SUPPORT
+/**
+ * Remote get using device memory.
+ */
+extern int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
+                                   struct mca_btl_base_endpoint_t* ep,
+                                   struct mca_btl_base_descriptor_t* descriptor);
+
+extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    struct mca_mpool_base_registration_t* registration,
+    struct opal_convertor_t* convertor,
+    uint8_t order,
+    size_t reserve,
+    size_t* size,
+    uint32_t flags);
+#endif /* OMPI_CUDA_SUPPORT */
+
+/**
+ * Fault Tolerance Event Notification Function
+ * @param state Checkpoint State
+ * @return OMPI_SUCCESS or failure status
+ */
+int mca_btl_smcuda_ft_event(int state);
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+void mca_btl_smcuda_component_event_thread(opal_object_t*);
+#endif
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer) \
+{ \
+    unsigned char cmd = DATA; \
+    if(write(peer->fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { \
+        opal_output(0, "mca_btl_smcuda_send: write fifo failed: errno=%d\n", errno); \
+    } \
+}
+#else
+#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
+#endif
+
+END_C_DECLS
+
+#endif
+
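The relative/virtual translation performed by the VIRTUAL2RELATIVE and RELATIVE2VIRTUAL macros in this header can be illustrated with a small standalone program; the names my_base, to_relative, and to_virtual below are hypothetical and only sketch the idea of storing segment-internal pointers as offsets from each process's own mapping base.

#include <stdio.h>

static char *my_base;   /* address at which this process mapped the shared segment */

static long  to_relative(void *vaddr) { return (char *) vaddr - my_base; }
static void *to_virtual(long offset)  { return my_base + offset; }

int main(void)
{
    char segment[64];               /* stand-in for the mmapped shared region */
    void *slot;
    long  rel;

    my_base = segment;
    slot = &segment[16];            /* a pointer into the shared region */
    rel  = to_relative(slot);       /* the offset is what gets stored in shared memory */

    printf("offset %ld maps back to %p (original %p)\n", rel, to_virtual(rel), slot);
    return 0;
}

Each process computes its own my_base, so the same stored offset resolves to the correct virtual address everywhere, regardless of where the segment was mapped.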

ompi/mca/btl/smcuda/btl_smcuda_component.c

+/*
+ * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif  /* HAVE_STRING_H */
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif  /* HAVE_FCNTL_H */
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif  /* HAVE_SYS_TYPES_H */
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif  /* HAVE_SYS_MMAN_H */
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>  /* for mkfifo */
+#endif  /* HAVE_SYS_STAT_H */
+
+#include "ompi/constants.h"
+#include "opal/mca/event/event.h"
+#include "opal/util/bit_ops.h"
+#include "opal/util/output.h"
+#include "orte/util/proc_info.h"
+#include "orte/util/show_help.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "opal/mca/base/mca_base_param.h"
+#include "ompi/mca/mpool/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/runtime/params.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
+#include "ompi/mca/common/sm/common_sm.h"
+#include "ompi/mca/btl/base/btl_base_error.h"
+
+#if OPAL_ENABLE_FT_CR    == 1
+#include "opal/runtime/opal_cr.h"
+#endif
+
+#include "btl_smcuda.h"
+#include "btl_smcuda_frag.h"
+#include "btl_smcuda_fifo.h"
+
+static int mca_btl_smcuda_component_open(void);
+static int mca_btl_smcuda_component_close(void);
+static int smcuda_register(void);
+static mca_btl_base_module_t** mca_btl_smcuda_component_init(
+    int *num_btls,
+    bool enable_progress_threads,
+    bool enable_mpi_threads
+);
+
+
+/*
+ * Shared memory CUDA (smcuda) component instance.
+ */
+mca_btl_smcuda_component_t mca_btl_smcuda_component = {
+    {  /* super is being filled in */
+        /* First, the mca_base_component_t struct containing meta information
+          about the component itself */
+        {
+            MCA_BTL_BASE_VERSION_2_0_0,
+
+            "smcuda", /* MCA component name */
+            OMPI_MAJOR_VERSION,  /* MCA component major version */
+            OMPI_MINOR_VERSION,  /* MCA component minor version */
+            OMPI_RELEASE_VERSION,  /* MCA component release version */
+            mca_btl_smcuda_component_open,  /* component open */
+            mca_btl_smcuda_component_close,  /* component close */
+            NULL,
+            smcuda_register,
+        },
+        {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        mca_btl_smcuda_component_init,
+        mca_btl_smcuda_component_progress,
+    }  /* end super */
+};
+
+
+/*
+ * utility routines for parameter registration
+ */
+
+static inline char* mca_btl_smcuda_param_register_string(
+    const char* param_name,
+    const char* default_value)
+{
+    char *param_value;
+    int id = mca_base_param_register_string("btl","smcuda",param_name,NULL,default_value);
+    mca_base_param_lookup_string(id, &param_value);
+    return param_value;
+}
+
+static inline int mca_btl_smcuda_param_register_int(
+    const char* param_name,
+    int default_value)
+{
+    int id = mca_base_param_register_int("btl","smcuda",param_name,NULL,default_value);
+    int param_value = default_value;
+    mca_base_param_lookup_int(id,&param_value);
+    return param_value;
+}
+
+
+static int smcuda_register(void)
+{
+    int i;
+
+    /* register SM component parameters */
+    mca_btl_smcuda_component.sm_free_list_num =
+        mca_btl_smcuda_param_register_int("free_list_num", 8);
+    mca_btl_smcuda_component.sm_free_list_max =
+        mca_btl_smcuda_param_register_int("free_list_max", -1);
+    mca_btl_smcuda_component.sm_free_list_inc =
+        mca_btl_smcuda_param_register_int("free_list_inc", 64);
+    mca_btl_smcuda_component.sm_max_procs =
+        mca_btl_smcuda_param_register_int("max_procs", -1);
+    mca_btl_smcuda_component.sm_mpool_name =
+        mca_btl_smcuda_param_register_string("mpool", "sm");
+    mca_btl_smcuda_component.fifo_size =
+        mca_btl_smcuda_param_register_int("fifo_size", 4096);
+    mca_btl_smcuda_component.nfifos =
+        mca_btl_smcuda_param_register_int("num_fifos", 1);
+
+    mca_btl_smcuda_component.fifo_lazy_free =
+        mca_btl_smcuda_param_register_int("fifo_lazy_free", 120);
+
+    /* default number of extra procs to allow for future growth */
+    mca_btl_smcuda_component.sm_extra_procs =
+        mca_btl_smcuda_param_register_int("sm_extra_procs", 0);
+
+    mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
+    mca_btl_smcuda.super.btl_eager_limit = 4*1024;
+    mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
+    mca_btl_smcuda.super.btl_max_send_size = 32*1024;
+    mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
+    mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
+    mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
+    mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
+    mca_btl_smcuda.super.btl_bandwidth = 9000;  /* Mbps */
+    mca_btl_smcuda.super.btl_latency   = 1;     /* Microsecs */
+
+    /* Register an MCA param to indicate whether we have CUDA RDMA support
+       or not */
+    mca_base_param_reg_int(&mca_btl_smcuda_component.super.btl_version,
+                           "have_cuda_rdma_support", 
+                           "Whether this component supports CUDA RDMA or not",
+                           false, true, OMPI_CUDA_SUPPORT, NULL);
+
+    if (OMPI_CUDA_SUPPORT) {
+        i = -1;
+    } else {
+        i = 0;
+    }
+    mca_base_param_reg_int(&mca_btl_smcuda_component.super.btl_version,
+                           "use_cuda_rdma",
+                           "Whether CUDA RDMA support is desired or not "
+                           "(negative = try to enable CUDA RDMA support, but continue even "
+                           "if it is not available, 0 = do not enable CUDA RDMA support, "
+                           "positive = try to enable CUDA RDMA support and print message "
+                           "if it is not available)",
+                           false, false, i, &i);
+    if (OMPI_CUDA_SUPPORT) {
+        mca_btl_smcuda_component.use_cuda_rdma = i;
+    } else {
+        if (i > 0) {
+            orte_show_help("help-mpi-btl-smcuda.txt",
+                           "CUDA RDMA requested but not supported", true,
+                           orte_process_info.nodename);
+        }
+        mca_btl_smcuda_component.use_cuda_rdma = 0;
+    }
+
+#if OMPI_CUDA_SUPPORT
+    if (mca_btl_smcuda_component.use_cuda_rdma) {
+        mca_btl_smcuda.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
+    /* Call the BTL base to register its MCA params */
+    mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
+                                &mca_btl_smcuda.super);
+
+    return OMPI_SUCCESS;
+}
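+
+/*
+ * For example (assuming the parameter names registered above), CUDA RDMA
+ * can be forced on or off at run time with
+ *     mpirun --mca btl_smcuda_use_cuda_rdma 1 ...
+ * or  mpirun --mca btl_smcuda_use_cuda_rdma 0 ...
+ * and the registered values can be inspected with
+ *     ompi_info --param btl smcuda
+ */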
+
+/*
+ *  Called by the MCA framework to open the component; finalizes the
+ *  derived limits and constructs the component's internal objects.
+ */
+
+static int mca_btl_smcuda_component_open(void)
+{
+    mca_btl_smcuda_component.sm_max_btls = 1;
+
+    /* make sure the number of fifos is a power of 2 */
+    mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
+
+    /* make sure that queue size and lazy free parameter are compatible */
+    if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
+        mca_btl_smcuda_component.fifo_lazy_free  = (mca_btl_smcuda_component.fifo_size >> 1);
+    if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
+        mca_btl_smcuda_component.fifo_lazy_free  = 1;
+
+    mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
+    mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
+
+    /* initialize objects */
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * Component cleanup: release the shared-memory resources held by the component.
+ */
+
+static int mca_btl_smcuda_component_close(void)
+{
+    int return_value = OMPI_SUCCESS;
+
+    OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
+    /*
+     * We don't have to destroy the fragment lists.  They are allocated
+     * directly into the mmapped file, so they will automatically disappear
+     * when the file gets unmapped.
+     */
+    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_eager);*/
+    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_max);*/
+
+    /* unmap the shared memory control structure */
+    if(mca_btl_smcuda_component.sm_seg != NULL) {
+        return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
+        if( OMPI_SUCCESS != return_value ) {
+            return_value=OMPI_ERROR;
+            opal_output(0," mca_common_sm_fini failed\n");
+            goto CLEANUP;
+        }
+
+        /* unlink file, so that it will be deleted when all references
+         * to it are gone - no error checking, since we want all procs
+         * to call this, so that in an abnormal termination scenario,
+         * this file will still get cleaned up */
+#if OPAL_ENABLE_FT_CR    == 1
+        /* Only unlink the file if we are *not* restarting
+         * If we are restarting the file will be unlinked at a later time.
+         */
+        if(OPAL_CR_STATUS_RESTART_PRE  != opal_cr_checkpointing_state &&
+           OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
+            unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
+        }
+#else
+        unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
+#endif
+        OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
+    }
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    /* close/cleanup fifo create for event notification */
+    if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
+        /* write a done message down the pipe */
+        unsigned char cmd = DONE;
+        if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
+                sizeof(cmd)){
+            opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
+                    errno);