/* Source: mpi3-fortran / ompi / mca / btl / vader / btl_vader_component.c */

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Voltaire. All rights reserved.
 * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
 *                         All rights reserved.
 * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#include "ompi_config.h"

#include "ompi/constants.h"
#include "opal/util/output.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

#include "opal/mca/base/mca_base_param.h"
#include "ompi/mca/btl/base/btl_base_error.h"

#include "btl_vader.h"
#include "btl_vader_frag.h"
#include "btl_vader_fifo.h"
#include "btl_vader_fbox.h"

/*
 * Forward declarations of the component entry points. They are referenced
 * by the mca_btl_vader_component initializer below and defined later in
 * this file.
 */
static int mca_btl_vader_component_progress (void);
static int mca_btl_vader_component_open(void);
static int mca_btl_vader_component_close(void);
static int mca_btl_vader_component_register(void);
static mca_btl_base_module_t** mca_btl_vader_component_init(int *num_btls,
                                                            bool enable_progress_threads,
                                                            bool enable_mpi_threads);

/* limit where we should switch from bcopy to memcpy; this is only the
 * default, the value is re-read from the "memcpy_limit" parameter during
 * component registration */
int mca_btl_vader_memcpy_limit     = 524288;
/* default for the "log_align" parameter; component registration clamps the
 * registered value to the range [12, 25] */
int mca_btl_vader_log_align        = 21; /* 2 MiB */
/* maximum size for using copy-in-copy out semantics for contiguous sends;
 * default for the "max_inline_send" parameter */
int mca_btl_vader_max_inline_send = 256;

/*
 * Shared Memory (VADER) component instance.
 */
/*
 * Only the leading "super" member is initialized explicitly; the remaining
 * members of this static object start out zeroed and are filled in by the
 * register/open/init entry points defined later in this file.
 */
mca_btl_vader_component_t mca_btl_vader_component = {
    {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        {
            MCA_BTL_BASE_VERSION_2_0_0,

            "vader", /* MCA component name */
            OMPI_MAJOR_VERSION,  /* MCA component major version */
            OMPI_MINOR_VERSION,  /* MCA component minor version */
            OMPI_RELEASE_VERSION,  /* MCA component release version */
            mca_btl_vader_component_open,  /* component open */
            mca_btl_vader_component_close,  /* component close */
            NULL, /* NOTE(review): presumably the unused component query hook -- confirm against mca_base_component_t */
            mca_btl_vader_component_register, /* component parameter registration */
        },
        {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        mca_btl_vader_component_init, /* BTL component init */
        mca_btl_vader_component_progress, /* BTL component progress */
    }  /* end super */
};


/*
 * utility routines for parameter registration
 */

/*
 * Register a string parameter for the vader component and return its
 * current value.
 *
 * param_name    - name of the parameter to register
 * default_value - value handed to the registration call as the default
 *
 * Returns the string stored by mca_base_param_reg_string(), or NULL when
 * that call does not write its output argument.  The component passes the
 * "mpool" value obtained through this helper to free() at close time, so
 * the result must never be an indeterminate pointer.
 */
static inline char *mca_btl_vader_param_register_string(const char *param_name,
                                                        const char *default_value)
{
    /* start from NULL so a registration call that leaves the output
       untouched yields a well-defined (and free()-safe) result */
    char *param_value = NULL;

    (void) mca_base_param_reg_string (&mca_btl_vader_component.super.btl_version,
                                      param_name, NULL, false, false, default_value,
                                      &param_value);

    return param_value;
}

/*
 * Register an integer parameter for the vader component.
 *
 * param_name - name of the parameter to register
 * value      - default handed to the registration call
 *
 * Returns whatever the registration call stored as the current value; if
 * it stores nothing, the default is returned unchanged.
 */
static inline int mca_btl_vader_param_register_int(const char *param_name,
                                                   int value)
{
    int current = value;

    (void) mca_base_param_reg_int (&mca_btl_vader_component.super.btl_version,
                                   param_name, NULL, false, false, value, &current);

    return current;
}

static int mca_btl_vader_component_register (void)
{
    /* register VADER component parameters */
    mca_btl_vader_component.vader_free_list_num =
        mca_btl_vader_param_register_int("free_list_num", 8);
    mca_btl_vader_component.vader_free_list_max =
        mca_btl_vader_param_register_int("free_list_max", -1);
    mca_btl_vader_component.vader_free_list_inc =
        mca_btl_vader_param_register_int("free_list_inc", 64);
    mca_btl_vader_component.vader_mpool_name =
        mca_btl_vader_param_register_string("mpool", "sm");
    mca_btl_vader_memcpy_limit =
        mca_btl_vader_param_register_int("memcpy_limit", mca_btl_vader_memcpy_limit);
    mca_btl_vader_log_align =
        mca_btl_vader_param_register_int("log_align", mca_btl_vader_log_align);

    /* limit segment alignment to be between 4k and 16M */
    if (mca_btl_vader_log_align < 12) {
        mca_btl_vader_log_align = 12;
    } else if (mca_btl_vader_log_align > 25) {
        mca_btl_vader_log_align = 25;
    }

    mca_btl_vader_max_inline_send =
        mca_btl_vader_param_register_int("max_inline_send", mca_btl_vader_max_inline_send);

    mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
    mca_btl_vader.super.btl_eager_limit = 64 * 1024;
    mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
    mca_btl_vader.super.btl_max_send_size    = mca_btl_vader.super.btl_eager_limit;
    mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
    mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
    mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
    mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT |
        MCA_BTL_FLAGS_SEND_INPLACE;
    mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);

    mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
    mca_btl_vader.super.btl_latency   = 1;     /* Microsecs */

    /* Call the BTL based to register its MCA params */
    mca_btl_base_param_register(&mca_btl_vader_component.super.btl_version,
                                &mca_btl_vader.super);

    return OMPI_SUCCESS;
}

/*
 *  Called by MCA framework to open the component, registers
 *  component parameters.
 */

static int mca_btl_vader_component_open(void)
{
    mca_btl_vader_component.eager_limit = mca_btl_vader.super.btl_eager_limit;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_vader_component.active_sends, opal_list_t);

    return OMPI_SUCCESS;
}


/*
 * Component cleanup: tear down the shared memory control segment and
 * release the remaining component-owned resources.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when mca_common_sm_fini() fails.
 * A failure to finalize the segment no longer skips the rest of the
 * cleanup: the mpool name is still freed and the active-send list is
 * still destructed.
 */

static int mca_btl_vader_component_close(void)
{
    int return_value = OMPI_SUCCESS;

    /**
     * We don't have to destroy the fragment lists. They are allocated
     * directly into the mmapped file, they will auto-magically disappear
     * when the file gets unmapped.
     */

    /* unmap the shared memory control structure */
    if (NULL != mca_btl_vader_component.vader_seg) {
        if (OMPI_SUCCESS != mca_common_sm_fini (mca_btl_vader_component.vader_seg)) {
            /* as before, the segment is neither unlinked nor released
             * when finalizing it fails; only the error is recorded */
            return_value = OMPI_ERROR;
            opal_output(0," mca_common_sm_fini failed\n");
        } else {
            /* unlink file, so that it will be deleted when all references
             * to it are gone - no error checking, since we want all procs
             * to call this, so that in an abnormal termination scenario,
             * this file will still get cleaned up */
            /* XXX LANL TODO -- remove unlink once the shmem segment uses xpmem */
            unlink(mca_btl_vader_component.vader_seg->shmem_ds.seg_name);
            OBJ_RELEASE(mca_btl_vader_component.vader_seg);
        }
    }

    /* free(NULL) is a no-op; clear the pointer so a stale value can never
     * be freed a second time */
    free(mca_btl_vader_component.vader_mpool_name);
    mca_btl_vader_component.vader_mpool_name = NULL;

    OBJ_DESTRUCT(&mca_btl_vader_component.active_sends);

    return return_value;
}

/*
 *  VADER component initialization.
 *
 *  num_btls                - out: number of modules returned (0 or 1)
 *  enable_progress_threads - not used by this function
 *  enable_mpi_threads      - not used by this function
 *
 *  Returns a heap-allocated one-element array pointing at the vader module,
 *  or NULL when session directories are disabled, the array cannot be
 *  allocated, or the xpmem segment cannot be created.
 */
static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
                                                             bool enable_progress_threads,
                                                             bool enable_mpi_threads)
{
    mca_btl_vader_component_t *comp = &mca_btl_vader_component;
    mca_btl_base_module_t **modules;

    /* report no modules until everything below has succeeded */
    *num_btls = 0;

    /* if no session directory was created, then we cannot be used */
    /* XXX LANL FIXME -- this is not the case. we can use an anonymous segment */
    if (!orte_create_session_dirs) {
        return NULL;
    }

    /* the shared memory pool is looked up/created only when used */
    comp->vader_mpool_base = NULL;
    comp->vader_mpool      = NULL;

    modules = calloc (1, sizeof (*modules));
    if (NULL == modules) {
        return NULL;
    }

    /* create an xpmem segment for the entire memory space */
    comp->my_seg_id = xpmem_make (0, 0xffffffffffffffffll, XPMEM_PERMIT_MODE,
                                  (void *)0666);
    if (-1 == comp->my_seg_id) {
        free (modules);
        return NULL;
    }

    /* hand back the single vader module */
    modules[0] = (mca_btl_base_module_t *) &mca_btl_vader;
    *num_btls  = 1;

    /* no VADER procs are known yet and our local rank is not defined */
    comp->my_smp_rank   = -1;
    comp->num_smp_procs = 0;

    /* the module itself has not been initialized yet */
    mca_btl_vader.btl_inited = false;

    return modules;
}

/*
 * Walk the component's active-send list and complete every fragment whose
 * header has been marked complete.  The successor of each item is fetched
 * before the item may be removed, so removal does not disturb the walk.
 */
static inline void mca_btl_vader_progress_sends (void)
{
    opal_list_t *active = &mca_btl_vader_component.active_sends;
    opal_list_item_t *cur = opal_list_get_first (active);

    while (cur != opal_list_get_end (active)) {
        mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) cur;
        opal_list_item_t *following = opal_list_get_next (cur);

        if (OPAL_LIKELY(frag->hdr->complete)) {
            /* take the fragment off the list before completing it */
            opal_list_remove_item (active, cur);

            mca_btl_vader_frag_complete (frag);
        }

        cur = following;
    }
}


/*
 * Component progress function: completes finished sends, checks the fast
 * boxes, then polls this process's fifo once and, if a header was read,
 * delivers it to the callback registered for its tag.
 *
 * Returns 1 when a fifo message was delivered, 0 when the fifo was empty.
 */
static int mca_btl_vader_component_progress (void)
{
    const int my_smp_rank = mca_btl_vader_component.my_smp_rank;
    vader_fifo_t *my_fifo = mca_btl_vader_component.fifo[my_smp_rank];
    mca_mpool_base_registration_t *xpmem_reg = NULL;
    mca_btl_active_message_callback_t *handler;
    mca_btl_vader_frag_t frag;
    mca_btl_vader_hdr_t *hdr;
    bool single_copy;

    /* check active sends for completion */
    mca_btl_vader_progress_sends ();

    /* check for messages in fast boxes */
    mca_btl_vader_check_fboxes ();

    /* poll the fifo once */
    hdr = vader_fifo_read (my_fifo);
    if (NULL == hdr) {
        return 0;
    }

    handler = &mca_btl_base_active_message_trigger[hdr->tag];

    /* first segment: the bytes that immediately follow the header */
    frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
    frag.segments[0].seg_len       = hdr->len;
    frag.base.des_dst              = frag.segments;

    /* decide once, before the upcall, whether a second segment is attached */
    single_copy = (0 != (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY));

    if (OPAL_UNLIKELY(single_copy)) {
        /* second segment: the sender's buffer described by sc_iov, reached
           through a registration obtained for the sending rank */
        xpmem_reg = vader_get_registation (hdr->my_smp_rank, hdr->sc_iov.iov_base,
                                           hdr->sc_iov.iov_len, 0);

        frag.segments[1].seg_addr.pval = vader_reg_to_ptr (xpmem_reg, hdr->sc_iov.iov_base);
        frag.segments[1].seg_len       = hdr->sc_iov.iov_len;
        frag.base.des_dst_cnt          = 2;
    } else {
        frag.base.des_dst_cnt = 1;
    }

    /* recv upcall */
    handler->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), handler->cbdata);

    if (single_copy) {
        /* the registration is only needed for the duration of the upcall */
        vader_return_registration (xpmem_reg, hdr->my_smp_rank);
    }

    /* return the fragment */
    hdr->complete = true;

    return 1;
}