Commits

Craig Rasmussen  committed fb1a622 Merge

Merge from commit of adding implicit-stmt to mpigen.py

  • Participants
  • Parent commits 9129141, 40b9d5f

Comments (0)

Files changed (25)

 ./orte/mca/rmaps/Makefile.in
 ./orte/mca/rmaps/round_robin/Makefile.in
 ./orte/mca/rmaps/resilient/Makefile.in
+./orte/mca/rmaps/mindist/Makefile.in
 ./orte/mca/rmaps/lama/Makefile.in
 ./orte/mca/rmaps/staged/Makefile.in
 ./orte/mca/rmaps/rank_file/Makefile.in
 
 1.7.2
 -----
-
 - Fix an error that caused epoll to automatically be disabled
-  in libevent
-- Upgrade hwloc to 1.5.2
-- Fix MXM connection establishment flow
-- Lots of VampirTrace upgrades and fixes; upgrade to v5.14.8
-- Fixed some minor memory leaks 
+  in libevent.
+- Upgrade hwloc to 1.5.2.
+- Fix MXM connection establishment flow.
+- Lots of VampirTrace upgrades and fixes; upgrade to v5.14.8.
+- Fixed some minor memory leaks.
+- Add retransmit framework for unreliable networks.
+- Fixed datatype corruption issue when combining datatypes of specific
+  formats.
+- Added Location Aware Mapping Algorithm (LAMA) mapping component.
+- Fixes for MPI_STATUS handling in corner cases.
 
 
 1.7.1

File config/opal_configure_options.m4

     [Whether we want checkpoint/restart enabled debugging functionality or not])
 
 #
-# Check to see if user wants CUDA support in datatype and convertor code.
+# Check to see if user wants CUDA support
 #
 AC_ARG_WITH([cuda],
             [AC_HELP_STRING([--with-cuda(=DIR)],
             [Build cuda support, optionally adding DIR/include])])
 AC_MSG_CHECKING([if --with-cuda is set])
 
-# CUDA support is off by default.  User has to request it.
+# CUDA support is off by default.  User has to request it.  Look for cuda.h file.
 AS_IF([test "$with_cuda" = "no" -o "x$with_cuda" = "x"],
       [opal_check_cuda_happy="no"
        AC_MSG_RESULT([not set (--with-cuda=$with_cuda)])],
                      AC_MSG_ERROR([Cannot continue])],
                     [AC_MSG_RESULT([found])
                      opal_check_cuda_happy="yes"
-                     with_cuda="/usr/local/cuda"])],
+                     with_cuda="/usr/local/cuda/include"])],
              [AS_IF([test ! -d "$with_cuda"],
                     [AC_MSG_RESULT([not found])
                      AC_MSG_WARN([Directory $with_cuda not found])
                      AC_MSG_ERROR([Cannot continue])],
                     [AS_IF([test "x`ls $with_cuda/include/cuda.h 2> /dev/null`" = "x"],
-                           [AC_MSG_RESULT([not found])
-                            AC_MSG_WARN([Expected file $with_cuda/include/cuda.h not found])
-                            AC_MSG_ERROR([Cannot continue])],
+                           [AS_IF([test "x`ls $with_cuda/cuda.h 2> /dev/null`" = "x"],
+                                  [AC_MSG_RESULT([not found])
+                                   AC_MSG_WARN([Could not find cuda.h in $with_cuda/include or $with_cuda])
+                                   AC_MSG_ERROR([Cannot continue])],
+                                  [opal_check_cuda_happy="yes"
+                                   with_cuda="$with_cuda"
+                                   AC_MSG_RESULT([found ($with_cuda/cuda.h)])])],
                            [opal_check_cuda_happy="yes"
-                            AC_MSG_RESULT([found ($with_cuda/include/cuda.h)])])])])])
+                            with_cuda="$with_cuda/include"
+                            AC_MSG_RESULT([found ($with_cuda/cuda.h)])])])])])
 
 # If we have CUDA support, check to see if we have CUDA 4.1 support
 AS_IF([test "$opal_check_cuda_happy"="yes"],
     AC_CHECK_MEMBER([struct CUipcMemHandle_st.reserved], [CUDA_SUPPORT_41=1], [CUDA_SUPPORT_41=0],
-        [#include <$with_cuda/include/cuda.h>]),
+        [#include <$with_cuda/cuda.h>]),
     [])
 
 AC_MSG_CHECKING([if have cuda support])
 if test "$opal_check_cuda_happy" = "yes"; then
-    AC_MSG_RESULT([yes (-I$with_cuda/include)])
+    AC_MSG_RESULT([yes (-I$with_cuda)])
     CUDA_SUPPORT=1
-    opal_datatype_cuda_CPPFLAGS="-I$with_cuda/include"
+    opal_datatype_cuda_CPPFLAGS="-I$with_cuda"
     AC_SUBST([opal_datatype_cuda_CPPFLAGS])
     AC_SUBST([opal_datatype_cuda_LIBS])
 else

File contrib/dist/linux/openmpi.spec

 #
 #############################################################################
 
-Summary: A powerful implementaion of MPI
+Summary: A powerful implementation of MPI
 Name: %{?_name:%{_name}}%{!?_name:openmpi}
 Version: $VERSION
 Release: 1

File ompi/datatype/ompi_datatype_internal.h

 #endif
 
 
-OMPI_DECLSPEC extern union dt_elem_desc ompi_datatype_predefined_elem_desc[2 * OMPI_DATATYPE_MPI_MAX_PREDEFINED];
 extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
 
 /* There are 3 types of predefined data types.
 
 #if OMPI_BUILD_FORTRAN_BINDINGS
 /*
- * For Fortran, we need to pass information, such as ALIGNMENT and SIZE as well
- * Therefore, for initialization at compile-time, pass this data as well.
- *
- * However, there is no underlying OPAL-TYPE, therefore we just pass NAME, SIZE,
- * ALIGN and the FLAGS. Additionally, ONLY for Fortran we need the
- * ompi_datatype_predefined_elem_desc for the additional types.
- */
-#define OMPI_DATATYPE_INIT_DESC_PREDEFINED(TYPE, SIZE)                               \
-    {                                                                                \
-        1 /*length*/, 1 /*used*/,                                                    \
-        &(ompi_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_ ## TYPE ## SIZE]) /*desc*/ \
-    }
-
-/*
  * Fortran types are based on the underlying OPAL types: They share the ID -- however,
  * the alignment is overwritten.
  */
         (ALIGN) /*align*/,                                                           \
         1 /*nbElems*/,                                                               \
         OPAL_DATATYPE_INIT_NAME(TYPE ## SIZE) /*name*/,                              \
-        OMPI_DATATYPE_INIT_DESC_PREDEFINED(TYPE, SIZE) /*desc*/,                     \
-        OMPI_DATATYPE_INIT_DESC_PREDEFINED(TYPE, SIZE) /*opt_desc*/,                 \
+        OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE) /*desc*/,                   \
+        OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE) /*opt_desc*/,               \
         OPAL_DATATYPE_INIT_BTYPES_ARRAY_ ## TYPE ## SIZE /*btypes*/                  \
     }
 

File ompi/datatype/ompi_datatype_module.c

 
 /* by default the debugging is turned off */
 int ompi_datatype_dfd = -1;
-OMPI_DECLSPEC union dt_elem_desc ompi_datatype_predefined_elem_desc[2 * OMPI_DATATYPE_MPI_MAX_PREDEFINED];
 
 /**
  * This is the number of predefined datatypes. It is different than the MAX_PREDEFINED
 {
     int32_t i;
 
-    for( i = 0; i < OMPI_DATATYPE_MPI_MAX_PREDEFINED; i++ ) {
-        ompi_datatype_t* datatype = (ompi_datatype_t*)ompi_datatype_basicDatatypes[i];
-        dt_elem_desc_t* pDesc;
-
-        if( 0 == datatype->super.size ) continue;
-
-        /**
-         * Most of the OMPI datatypes have been initialized with the basic desc of the
-         * OPAL datatypes. Thus don't modify the desc, instead rebase the desc back into
-         * the OMPI predefined_elem_desc and update the fields there.
-         */
-        pDesc = &ompi_datatype_predefined_elem_desc[2 * i];
-        if( pDesc != datatype->super.desc.desc ) {
-            memcpy(pDesc, datatype->super.desc.desc, 2 * sizeof(dt_elem_desc_t));
-            datatype->super.desc.desc = pDesc;
-        } else {
-            datatype->super.desc.desc[0].elem.common.flags = OPAL_DATATYPE_FLAG_PREDEFINED |
-                                                             OPAL_DATATYPE_FLAG_DATA |
-                                                             OPAL_DATATYPE_FLAG_CONTIGUOUS |
-                                                             OPAL_DATATYPE_FLAG_NO_GAPS;
-            datatype->super.desc.desc[0].elem.common.type  = i;
-            datatype->super.desc.desc[0].elem.count        = 1;
-            datatype->super.desc.desc[0].elem.disp         = 0;
-            datatype->super.desc.desc[0].elem.extent       = datatype->super.size;
-
-            datatype->super.desc.desc[1].end_loop.common.flags    = 0;
-            datatype->super.desc.desc[1].end_loop.common.type     = OPAL_DATATYPE_END_LOOP;
-            datatype->super.desc.desc[1].end_loop.items           = 1;
-            datatype->super.desc.desc[1].end_loop.first_elem_disp = datatype->super.desc.desc[0].elem.disp;
-            datatype->super.desc.desc[1].end_loop.size            = datatype->super.size;
-        }
-        /* Check if the data contain gaps */
-        if( (datatype->super.ub - datatype->super.lb) != (OPAL_PTRDIFF_TYPE)datatype->super.size ) {
-            datatype->super.desc.desc[0].elem.common.flags &= ~OPAL_DATATYPE_FLAG_NO_GAPS;
-        }
-    }
-
     /* Create the f2c translation table */
     OBJ_CONSTRUCT(&ompi_datatype_f_to_c_table, opal_pointer_array_t);
     if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_datatype_f_to_c_table,

File ompi/mca/btl/openib/btl_openib_component.c

  *                         reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
  * Copyright (c) 2009-2012 Oracle and/or its affiliates.  All rights reserved.
- * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2012      Oak Ridge National Laboratory.  All rights reserved
  * $COPYRIGHT$
  *
 
         if (OMPI_SUCCESS !=
             (rc = progress_no_credits_pending_frags(ep))) {
-            /* No where to return an error to so have to abort */
-            opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
-            orte_errmgr.abort(-1, NULL);
+            /* This is a fatal issue so call into PML and let it know. */
+            mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
+            openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
+                                 NULL, NULL);
+            return;
         }
     }
 

File opal/mca/hwloc/base/base.h

  */
 OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);
 
+struct orte_rmaps_numa_node_t {
+    opal_list_item_t super;
+    int index;
+    float dist_from_closed;
+};
+typedef struct orte_rmaps_numa_node_t orte_rmaps_numa_node_t;
+OBJ_CLASS_DECLARATION(orte_rmaps_numa_node_t);
+
 /**
  * Enum for what memory allocation policy we want for user allocations.
  * MAP = memory allocation policy.
                                                        hwloc_obj_t obj,
                                                        opal_hwloc_resource_type_t rtype);
 
+OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
+                                    const char* device_name, 
+                                    opal_list_t *sorted_list);
+
 /**
  * Get the number of pu's under a given hwloc object.
  */

File opal/mca/hwloc/base/hwloc_base_dt.c

     for (i=0; i < num_vals; i++) {
         t = tarray[i];
 
-
         /* extract an xml-buffer representation of the tree */
         if (0 != hwloc_topology_export_xmlbuffer(t, &xmlbuffer, &len)) {
             return OPAL_ERROR;
         if (NULL != xmlbuffer) {
             free(xmlbuffer);
         }
+
         /* get the available support - hwloc unfortunately does
          * not include this info in its xml export!
          */
         /* since we are loading this from an external source, we have to
          * explicitly set a flag so hwloc sets things up correctly
          */
-        if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)) {
+        if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) {
             free(xmlbuffer);
             rc = OPAL_ERROR;
             hwloc_topology_destroy(t);
         if (NULL != xmlbuffer) {
             free(xmlbuffer);
         }
+
         /* get the available support - hwloc unfortunately does
          * not include this info in its xml import!
          */
         if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, support->membind, &cnt, OPAL_BYTE))) {
             goto cleanup;
         }
+
         /* pass it back */
         tarray[i] = t;
 
 
 int opal_hwloc_copy(hwloc_topology_t *dest, hwloc_topology_t src, opal_data_type_t type)
 {
+    int i;
     char *xml;
     int len;
     struct hwloc_topology_support *support, *destsupport;
         return OPAL_VALUE2_GREATER;
     }
 
-
     /* do the comparison the "cheat" way - get an xml representation
      * of each tree, and strcmp!
      */

File opal/mca/hwloc/base/hwloc_base_open.c

 {
     ptr->num_objs = 0;
     ptr->rtype = 0;
+    OBJ_CONSTRUCT(&ptr->sorted_by_dist_list, opal_list_t);
+}
+static void sum_dest(opal_hwloc_summary_t *ptr)
+{
+    opal_list_item_t *item;
+    while (NULL != (item = opal_list_remove_first(&ptr->sorted_by_dist_list))) {
+        OBJ_RELEASE(item);
+    }
+    OBJ_DESTRUCT(&ptr->sorted_by_dist_list);
 }
 OBJ_CLASS_INSTANCE(opal_hwloc_summary_t,
                    opal_list_item_t,
-                   sum_const, NULL);
+                   sum_const, sum_dest);
 static void topo_data_const(opal_hwloc_topo_data_t *ptr)
 {
     ptr->available = NULL;
                    opal_object_t,
                    topo_data_const,
                    topo_data_dest);
+
+OBJ_CLASS_INSTANCE(orte_rmaps_numa_node_t,
+        opal_list_item_t,
+        NULL,
+        NULL);
 #endif

File opal/mca/hwloc/base/hwloc_base_util.c

     if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
         0 != hwloc_topology_set_flags(opal_hwloc_topology, 
                                       (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
-                                       HWLOC_TOPOLOGY_FLAG_WHOLE_IO)) ||
+                                       HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) ||
         0 != hwloc_topology_load(opal_hwloc_topology)) {
         return OPAL_ERR_NOT_SUPPORTED;
     }
 
     loc = df_search_min_bound(topo, obj, target, cache_level, &min_bound);
 
-    if (HWLOC_OBJ_CACHE == target) {
-        OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
-                             "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u:%u",
-                             min_bound, hwloc_obj_type_string(target),
-                             cache_level, loc->logical_index));
-    } else {
-        OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
-                             "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u",
-                             min_bound, hwloc_obj_type_string(target), loc->logical_index));
+    if (NULL != loc) {
+        if (HWLOC_OBJ_CACHE == target) {
+            OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
+                        "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u:%u",
+                        min_bound, hwloc_obj_type_string(target),
+                        cache_level, loc->logical_index));
+        } else {
+            OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
+                        "hwloc:base:min_bound_under_obj found min bound of %u on %s:%u",
+                        min_bound, hwloc_obj_type_string(target), loc->logical_index));
+        }
     }
 
     return loc;
 
     return OPAL_SUCCESS;
 }
+
+static int dist_cmp_fn (opal_list_item_t **a, opal_list_item_t **b)
+{
+    orte_rmaps_numa_node_t *aitem = *((orte_rmaps_numa_node_t **) a);
+    orte_rmaps_numa_node_t *bitem = *((orte_rmaps_numa_node_t **) b);
+
+    if (bitem->dist_from_closed > aitem->dist_from_closed) {
+        return 1;
+    } else if( aitem->dist_from_closed == bitem->dist_from_closed ) {
+        return 0;
+    } else {
+        return -1;
+    }
+}
+
+static void sort_by_dist(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
+{
+    hwloc_obj_t device_obj = NULL;
+    hwloc_obj_t obj = NULL, root = NULL;
+    const struct hwloc_distances_s* distances;
+    opal_list_item_t *numa_item;
+    orte_rmaps_numa_node_t *numa_node;
+    int close_node_index;
+    float latency;
+    int j;
+    int depth;
+    unsigned i;
+
+    for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
+        if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS
+                || device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
+            if (!strcmp(device_obj->name, device_name)) {
+                /* find numa node containing this device */
+                obj = device_obj->parent;
+                while ((obj != NULL) && (obj->type != HWLOC_OBJ_NODE)) {
+                    obj = obj->parent;
+                }
+                if (obj == NULL) {
+                    return;
+                } else {
+                    close_node_index = obj->logical_index;
+                }
+
+                /* find distance matrix for all numa nodes */
+                distances = hwloc_get_whole_distance_matrix_by_type(topo, HWLOC_OBJ_NODE);
+                if (NULL ==  distances) {
+                    /* we can try to find distances under group object. This info can be there. */
+                    depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
+                    if (depth < 0) {
+                        return;
+                    }
+                    root = hwloc_get_root_obj(topo);
+                    for (i = 0; i < root->arity; i++) {
+                        obj = root->children[i];
+                        if (obj->distances_count > 0) {
+                            for(j = 0; j < obj->distances_count; j++) {
+                                if (obj->distances[j]->relative_depth + 1 == depth) {
+                                    distances = obj->distances[j];
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                }
+                /* the distance from node j to our close node (logical index close_node_index) is stored at latency[close_node_index + nbobjs * j] */
+                if ((NULL == distances) || (0 == distances->nbobjs)) {
+                    return;
+                }
+                /* fill list of numa nodes */
+                for (j = 0; j < distances->nbobjs; j++) {
+                    latency = distances->latency[close_node_index + distances->nbobjs * j];
+                    numa_node = OBJ_NEW(orte_rmaps_numa_node_t);
+                    numa_node->index = j;
+                    numa_node->dist_from_closed = latency;
+                    opal_list_append(sorted_list, &numa_node->super);
+                }
+                /* sort numa nodes by distance from the closest one to PCI */
+                opal_list_sort(sorted_list, dist_cmp_fn);
+                return;
+            }
+        }
+    }
+}
+
+void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
+{
+    hwloc_obj_t obj;
+    opal_list_item_t *item;
+    opal_hwloc_summary_t *sum;
+    opal_hwloc_topo_data_t *data;
+    orte_rmaps_numa_node_t *numa, *copy_numa;
+
+    obj = hwloc_get_root_obj(topo);
+
+    /* first see if the topology already has this info */
+    /* opal_hwloc_base_get_nbobjs_by_type() is called before this function and fills the summary object, so it should exist */
+    data = (opal_hwloc_topo_data_t*)obj->userdata;
+    if (NULL != data) {
+        for (item = opal_list_get_first(&data->summaries);
+                item != opal_list_get_end(&data->summaries);
+                item = opal_list_get_next(item)) {
+            sum = (opal_hwloc_summary_t*)item;
+            if (HWLOC_OBJ_NODE == sum->type) {
+                if (opal_list_get_size(&sum->sorted_by_dist_list) > 0) { 
+                    OPAL_LIST_FOREACH(numa, &(sum->sorted_by_dist_list), orte_rmaps_numa_node_t) {
+                        copy_numa = OBJ_NEW(orte_rmaps_numa_node_t);
+                        copy_numa->index = numa->index;
+                        copy_numa->dist_from_closed = numa->dist_from_closed;
+                        opal_list_append(sorted_list, &copy_numa->super);
+                    }
+                    return;
+                }else {
+                    /* don't already know it - go get it */
+                    sort_by_dist(topo, device_name, sorted_list);
+                    /* store this info in summary object for later usage */
+                    OPAL_LIST_FOREACH(numa, sorted_list, orte_rmaps_numa_node_t) {
+                        copy_numa = OBJ_NEW(orte_rmaps_numa_node_t);
+                        copy_numa->index = numa->index;
+                        copy_numa->dist_from_closed = numa->dist_from_closed;
+                        opal_list_append(&(sum->sorted_by_dist_list), &copy_numa->super);
+                    }
+                    return;
+                }
+            }
+        }
+    }
+}

File opal/mca/hwloc/hwloc.h

     unsigned cache_level;
     unsigned int num_objs;
     opal_hwloc_resource_type_t rtype;
+    opal_list_t sorted_by_dist_list;
 } opal_hwloc_summary_t;
 OBJ_CLASS_DECLARATION(opal_hwloc_summary_t);
 

File opal/mca/hwloc/hwloc152/README-ompi.txt

 
 https://svn.open-mpi.org/trac/hwloc/changeset/5513
 https://svn.open-mpi.org/trac/hwloc/changeset/5588
+https://svn.open-mpi.org/trac/hwloc/changeset/5592
 
-

File opal/mca/hwloc/hwloc152/hwloc/src/topology.c

 static void
 merge_useless_child(hwloc_topology_t topology, hwloc_obj_t *pparent)
 {
-  hwloc_obj_t parent = *pparent, child, *pchild;
+  hwloc_obj_t parent = *pparent, child, *pchild, ios;
 
   for_each_child_safe(child, parent, pchild)
     merge_useless_child(topology, pchild);
 
   child = parent->first_child;
-  if (!child || child->next_sibling)
-    /* There are no or several children, it's useful to keep them.  */
+  if (!child)
+    /* There is no child, nothing to merge. */
     return;
 
+  if (child->next_sibling && !hwloc_obj_type_is_io(child->next_sibling->type))
+    /* There are several non-I/O children */
+    return;
+
+  /* There is one non-I/O child and possibly some I/O children.
+   * I/O children shouldn't prevent merging because they can be attached
+   * to anything with the same locality.
+   * Move them to the side during merging, and append them back later.
+   * This is easy because I/O children are always last in the list.
+   */
+  ios = child->next_sibling;
+  child->next_sibling = NULL;
+
   /* TODO: have a preference order?  */
   if (topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
     /* Parent can be ignored in favor of the child.  */
     print_object(topology, 0, child);
     parent->first_child = child->first_child;
     hwloc_free_unlinked_object(child);
+
+  }
+
+  if (ios) {
+    /* append I/O children to the list of children of the remaining object */
+    pchild = &((*pparent)->first_child);
+    while (*pchild)
+      pchild = &((*pchild)->next_sibling);
+    *pchild = ios;
   }
 }
 

File orte/mca/rmaps/base/base.h

     /* default mapping directives */
     orte_mapping_policy_t mapping;
     orte_ranking_policy_t ranking;
+    /* device specification for min distance mapping */
+    char *device;
 } orte_rmaps_base_t;
 
 /**

File orte/mca/rmaps/base/rmaps_base_binding.c

 
     /* binding requested */
     /* if the job was mapped by the corresponding target, then
-     * there is nothing more to do - the launch message creator
-     * will see that the binding object is NULL and will simply
-     * use the locale as the place to bind the proc
+     * we bind in place
      *
      * otherwise, we have to bind either up or down the hwloc
      * tree. If we are binding upwards (e.g., mapped to hwthread
      * to core), then we have to do a round-robin assignment of
      * procs to the resources below.
      */
-    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+            
+    if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        int rc = ORTE_SUCCESS;
+        if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "mca:rmaps: bindings for job %s - dist to numa",
+                                ORTE_JOBID_PRINT(jdata->jobid));
+            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
+                ORTE_ERROR_LOG(rc);
+            }
+        } else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+            if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            } else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
+                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
+                    ORTE_ERROR_LOG(rc);
+                } 
+            }
+        }
+        /* if the binding policy is less than numa, then we are unbound - so
+         * just ignore this and return (should have been caught in prior
+         * tests anyway as only options meeting that criteria are "none"
+         * and "board")
+         */
+        return rc;
+    } else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
         int rc;
         if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,

File orte/mca/rmaps/base/rmaps_base_frame.c

 
 #include "orte/mca/rmaps/base/rmaps_private.h"
 #include "orte/mca/rmaps/base/base.h"
-
 /*
  * The following file was created by configure.  It contains extern
  * statements and the definition of an array of pointers to each
     rmaps_base_mapping_policy = NULL;
     var_id = mca_base_var_register("orte", "rmaps", "base", "mapping_policy",
 #if OPAL_HAVE_HWLOC
-                                   "Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
+                                   "Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq | dist], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
 #else
                                    "Mapping Policy [slot (default) | node], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
 #endif
             return ORTE_ERR_SILENT;
         }
         if (2 == opal_argv_count(ck)) {
+            /* if the policy is "dist", then we set the policy to that value
+             * and save the second argument as the device
+             */
+#if OPAL_HAVE_HWLOC
+            if (0 == strncasecmp(ck[0], "dist", len)) {
+                tmp = ORTE_MAPPING_BYDIST;
+                ck2 = opal_argv_split(ck[1], ',');
+                if (ck2[0] != NULL) {
+                    orte_rmaps_base.device = strdup(ck2[0]);
+                    for (i=1; NULL != ck2[i]; i++) {
+                        if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
+                            orte_rmaps_base.mapping |= ORTE_MAPPING_SPAN;
+                        } 
+                    }
+                }
+                opal_argv_free(ck2);
+                goto setpolicy;
+            }
+#endif
             ck2 = opal_argv_split(ck[1], ',');
             for (i=0; NULL != ck2[i]; i++) {
                 if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
             opal_argv_free(ck);
             return ORTE_ERR_SILENT;
         }
+    setpolicy:
         ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, tmp);
         ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
         opal_argv_free(ck);
     }
 
 #if OPAL_HAVE_HWLOC
-    /* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
-     * already been set to something else
-     */
-    if (1 < orte_rmaps_base.cpus_per_rank &&
-        !OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
-        if (opal_hwloc_use_hwthreads_as_cpus) {
-            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
-        } else {
-            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
+    if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
+        /* if MAP BY DIST then we set binding policy to numa UNLESS the binding has
+         * already been set to something else
+         */
+        if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYDIST) {
+            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_NUMA);
+        } else if (1 < orte_rmaps_base.cpus_per_rank) {
+            /* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
+             * already been set to something else
+             */
+            if (opal_hwloc_use_hwthreads_as_cpus) {
+                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
+            } else {
+                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
+            }
         }
     }
 #endif

File orte/mca/rmaps/base/rmaps_base_print_fns.c

     case ORTE_MAPPING_BYUSER:
         map = "BYUSER";
         break;
+    case ORTE_MAPPING_BYDIST:
+        map = "MINDIST";
+        break;
     default:
         if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
             map = "PPR";

File orte/mca/rmaps/mindist/Makefile.am

+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, 
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2013      Los Alamos National Security, LLC.  All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+
+# Help text shipped with the package data; rmaps_mindist_module.c looks up
+# its user-facing messages in this file by name.
+dist_pkgdata_DATA = help-orte-rmaps-md.txt
+
+# Sources shared by the DSO and the static flavor of the component.
+sources = \
+        rmaps_mindist.h \
+        rmaps_mindist_module.c \
+        rmaps_mindist_component.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_orte_rmaps_mindist_DSO
+component_noinst =
+component_install = mca_rmaps_mindist.la
+else
+component_noinst = libmca_rmaps_mindist.la
+component_install =
+endif
+
+# DSO flavor: installed into $(pkglibdir).
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_rmaps_mindist_la_SOURCES = $(sources)
+mca_rmaps_mindist_la_LDFLAGS = -module -avoid-version
+
+# Static flavor: built in place and not installed.
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_rmaps_mindist_la_SOURCES =$(sources)
+libmca_rmaps_mindist_la_LDFLAGS = -module -avoid-version

File orte/mca/rmaps/mindist/configure.m4

+# -*- shell-script -*-
+#
+# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
+#                         All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+# MCA_orte_rmaps_mindist_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+# Registers the component Makefile with configure, then runs
+# action-if-found when OPAL_HAVE_HWLOC is 1 and action-if-not-found
+# otherwise.
+AC_DEFUN([MCA_orte_rmaps_mindist_CONFIG], [
+    AC_CONFIG_FILES([orte/mca/rmaps/mindist/Makefile])
+
+    AS_IF([test "$OPAL_HAVE_HWLOC" = 1],
+          [$1],
+          [$2])
+])

File orte/mca/rmaps/mindist/help-orte-rmaps-md.txt

+# -*- text -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+#
+[multi-apps-and-zero-np]
+Open MPI found multiple applications to be launched, and at least one
+that failed to specify the number of processes to execute.  When
+specifying multiple applications, you must specify how many processes
+of each to launch via the -np argument.
+#
+[orte-rmaps-mindist:no-pci-locality-info]
+No PCI locality information could be found on at least one node:
+
+  Node: %s
+
+Open MPI therefore cannot map the application as specified.

File orte/mca/rmaps/mindist/rmaps_mindist.h

+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2013      Los Alamos National Security, LLC.  All rights reserved.
+ * $COPYRIGHT$
+ * 
+ * Additional copyrights may follow
+ * 
+ * $HEADER$
+ */
+/**
+ * @file
+ *
+ * Resource Mapping: public declarations of the "mindist" rmaps component.
+ *
+ * Declares the component descriptor (mca_rmaps_mindist_component) and the
+ * mapper module (orte_rmaps_mindist_module) so that the component and
+ * module source files can refer to each other.
+ */
+#ifndef ORTE_RMAPS_MINDIST_H
+#define ORTE_RMAPS_MINDIST_H
+
+#include "orte_config.h"
+
+#include "opal/mca/hwloc/hwloc.h"
+#include "opal/class/opal_list.h"
+
+#include "orte/mca/rmaps/rmaps.h"
+
+BEGIN_C_DECLS
+
+ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_mindist_component;
+extern orte_rmaps_base_module_t orte_rmaps_mindist_module;
+
+END_C_DECLS
+
+#endif  /* ORTE_RMAPS_MINDIST_H */

File orte/mca/rmaps/mindist/rmaps_mindist_component.c

+/*
+ * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2013      Los Alamos National Security, LLC.  All rights reserved.
+ * $COPYRIGHT$
+ * 
+ * Additional copyrights may follow
+ * 
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_var.h"
+
+#include "orte/mca/rmaps/base/rmaps_private.h"
+#include "rmaps_mindist.h"
+
+/*
+ * Local functions
+ *
+ * These are the component hooks referenced by the
+ * mca_rmaps_mindist_component initializer.
+ */
+
+static int orte_rmaps_mindist_open(void);
+static int orte_rmaps_mindist_close(void);
+static int orte_rmaps_mindist_query(mca_base_module_t **module, int *priority);
+static int orte_rmaps_mindist_register(void);
+
+static int my_priority = 20;  /* initial value; storage behind the "priority" MCA variable */
+
+orte_rmaps_base_component_t mca_rmaps_mindist_component = {
+    {
+        ORTE_RMAPS_BASE_VERSION_2_0_0,
+        
+        "mindist", /* MCA component name */
+        ORTE_MAJOR_VERSION,  /* MCA component major version */
+        ORTE_MINOR_VERSION,  /* MCA component minor version */
+        ORTE_RELEASE_VERSION,  /* MCA component release version */
+        orte_rmaps_mindist_open,  /* component open  */
+        orte_rmaps_mindist_close, /* component close */
+        orte_rmaps_mindist_query,  /* component query */
+        orte_rmaps_mindist_register  /* component MCA variable registration */
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    }
+};
+
+
+/*
+ * Register the component's MCA variables.
+ *
+ * Exposes a read-only integer variable named "priority" that is backed by
+ * my_priority. The result of the registration call is deliberately
+ * discarded, so this hook always reports ORTE_SUCCESS.
+ */
+static int orte_rmaps_mindist_register(void)
+{
+    mca_base_component_t *self = &mca_rmaps_mindist_component.base_version;
+
+    (void) mca_base_component_var_register(self, "priority",
+                                           "Priority of the mindist rmaps component",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
+                                           &my_priority);
+
+    return ORTE_SUCCESS;
+}
+
+/**
+  * Component open function: there is nothing to set up, so it always
+  * returns ORTE_SUCCESS.
+  */
+static int orte_rmaps_mindist_open(void)
+{
+    return ORTE_SUCCESS;
+}
+
+
+/*
+ * Component query hook.
+ *
+ * Hands back the mindist mapper module through *module and the current
+ * value of my_priority through *priority. Always returns ORTE_SUCCESS.
+ */
+static int orte_rmaps_mindist_query(mca_base_module_t **module, int *priority)
+{
+    /* No HNP check is required here: the RMAPS framework is -only-
+     * opened on HNP's.
+     */
+    *module = (mca_base_module_t *) &orte_rmaps_mindist_module;
+    *priority = my_priority;
+
+    return ORTE_SUCCESS;
+}
+
+/**
+ *  Component close function: there is nothing to tear down, so it always
+ *  returns ORTE_SUCCESS.
+ */
+
+static int orte_rmaps_mindist_close(void)
+{
+    return ORTE_SUCCESS;
+}
+

File orte/mca/rmaps/mindist/rmaps_mindist_module.c

+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ * 
+ * Additional copyrights may follow
+ * 
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif  /* HAVE_STRING_H */
+
+#include "opal/mca/base/mca_base_var.h"
+
+#include "orte/util/show_help.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/util/error_strings.h"
+
+#include "orte/mca/rmaps/base/rmaps_private.h"
+#include "orte/mca/rmaps/base/base.h"
+#include "orte/mca/rmaps/mindist/rmaps_mindist.h"
+
+static int mindist_map(orte_job_t *jdata);  /* defined later in this file */
+
+orte_rmaps_base_module_t orte_rmaps_mindist_module = {
+    mindist_map  /* the only entry this module provides: the job mapper */
+};
+
+/*
+ * Remove and release every item still on a constructed opal_list_t, then
+ * destruct the list object itself.
+ */
+static void mindist_release_list(opal_list_t *list)
+{
+    opal_list_item_t *entry;
+
+    while (NULL != (entry = opal_list_remove_first(list))) {
+        OBJ_RELEASE(entry);
+    }
+    OBJ_DESTRUCT(list);
+}
+
+/*
+ * Create a "mindist" mapping for the job.
+ *
+ * For every app_context the usable nodes are collected and, on each node,
+ * procs are placed on NUMA objects in the order produced by
+ * opal_hwloc_get_sorted_numa_list() for orte_rmaps_base.device (presumably
+ * closest-to-the-device first - confirm against that helper). In the first
+ * pass each NUMA object receives at most npus/cpus_per_rank procs. Procs
+ * that are still unplaced afterwards are spread over the nodes in a second,
+ * oversubscribing pass unless oversubscription has been forbidden.
+ *
+ * @param jdata  the job to map
+ *
+ * @return ORTE_SUCCESS once every app_context has been mapped;
+ *         ORTE_ERR_TAKE_NEXT_OPTION when this mapper is not the one to use
+ *         for the job; ORTE_ERR_SILENT after a help message has been shown;
+ *         otherwise the ORTE error code of the step that failed.
+ *
+ * Every failure after the node list has been constructed leaves through
+ * the "error" label, which releases the node list and, when one is live,
+ * the per-node NUMA list.
+ */
+static int mindist_map(orte_job_t *jdata)
+{
+    orte_app_context_t *app;
+    int i, j;
+    unsigned int k;
+    hwloc_obj_t obj = NULL;
+    opal_list_t node_list;
+    opal_list_t numa_list;
+    opal_list_item_t *item;
+    opal_list_item_t *numa_item;
+    orte_rmaps_numa_node_t *numa;
+    orte_node_t *node;
+    orte_proc_t *proc;
+    int nprocs_mapped;
+    int extra_procs, navg, nextra = 0;
+    orte_std_cntr_t num_nodes, num_slots;
+    unsigned int npus, total_npus, num_procs_to_assign = 0, required;
+    int rc;
+    mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
+    bool initial_map=true;
+    bool bynode = false;
+    bool numa_list_active = false;  /* true while numa_list is constructed */
+
+    /* this mapper can only handle initial launch
+     * when mindist mapping is desired
+     */
+    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
+        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                            "mca:rmaps:mindist: job %s is being restarted - mindist cannot map",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+        return ORTE_ERR_TAKE_NEXT_OPTION;
+    }
+    if (NULL != jdata->map->req_mapper &&
+        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
+        /* a mapper has been specified, and it isn't me */
+        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                            "mca:rmaps:mindist: job %s not using mindist mapper",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+        return ORTE_ERR_TAKE_NEXT_OPTION;
+    }
+    if (ORTE_MAPPING_BYDIST != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
+        /* not me */
+        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                            "mca:rmaps:mindist: job %s not using mindist mapper",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+        return ORTE_ERR_TAKE_NEXT_OPTION;
+    }
+
+    /* there are two modes for mapping by dist: span and not-span. The
+     * span mode essentially operates as if there was just a single
+     * "super-node" in the system - i.e., it balances the load across
+     * all objects of the indicated type regardless of their location.
+     * In essence, it acts as if we placed one proc on each object, cycling
+     * across all objects on all nodes, and then wrapped around to place
+     * another proc on each object, doing so until all procs were placed.
+     *
+     * In contrast, the non-span mode operates similar to byslot mapping.
+     * All slots on each node are filled, assigning each proc to an object
+     * on that node in a balanced fashion, and then the mapper moves on
+     * to the next node. Thus, procs tend to be "front loaded" onto the
+     * list of nodes, as opposed to being "load balanced" in the span mode
+     */
+
+    if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
+        /* do a bynode mapping */
+        bynode = true;
+    } else {
+        /* do a byslot mapping */
+        bynode = false;
+    }
+
+    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                        "mca:rmaps:mindist: mapping job %s",
+                        ORTE_JOBID_PRINT(jdata->jobid));
+
+    /* flag that I did the mapping (free(NULL) is a no-op) */
+    free(jdata->map->last_mapper);
+    jdata->map->last_mapper = strdup(c->mca_component_name);
+
+    /* start at the beginning... */
+    jdata->num_procs = 0;
+
+    /* cycle through the app_contexts, mapping them sequentially */
+    for(i=0; i < jdata->apps->size; i++) {
+        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
+            continue;
+        }
+
+        /* setup the nodelist here in case we jump to error */
+        OBJ_CONSTRUCT(&node_list, opal_list_t);
+
+        /* if the number of processes wasn't specified, then we know there can be only
+         * one app_context allowed in the launch, and that we are to launch it across
+         * all available slots. We'll double-check the single app_context rule first
+         */
+        if (0 == app->num_procs && 1 < jdata->num_apps) {
+            /* the topic lives in this component's installed help file */
+            orte_show_help("help-orte-rmaps-md.txt", "multi-apps-and-zero-np",
+                           true, jdata->num_apps, NULL);
+            rc = ORTE_ERR_SILENT;
+            goto error;
+        }
+
+        /* for each app_context, we have to get the list of nodes that it can
+         * use since that can now be modified with a hostfile and/or -host
+         * option
+         */
+        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
+                                                                  jdata->map->mapping, initial_map, false))) {
+            ORTE_ERROR_LOG(rc);
+            goto error;
+        }
+        num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
+        /* flag that all subsequent requests should not reset the node->mapped flag */
+        initial_map = false;
+
+        /* if a bookmark exists from some prior mapping, set us to start there */
+        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);
+
+        if (0 == app->num_procs) {
+            /* set the num_procs to equal the number of slots on these mapped nodes */
+            app->num_procs = num_slots;
+        }
+
+        nprocs_mapped = 0;
+        if (!num_nodes) {
+            rc = ORTE_ERR_SILENT;
+            goto error;
+        }
+        if (bynode) {
+            /* calculate num_procs_to_assign for bynode case: the first
+             * "nextra" nodes take one proc more than the average
+             */
+            navg = app->num_procs / num_nodes;
+            nextra = app->num_procs - navg * num_nodes;
+            num_procs_to_assign = navg;
+            if (nextra > 0) {
+                num_procs_to_assign++;
+            }
+        }
+
+        /* iterate through the list of nodes */
+        for (item = opal_list_get_first(&node_list);
+                item != opal_list_get_end(&node_list);
+                item = opal_list_get_next(item)) {
+            node = (orte_node_t*)item;
+
+            if (NULL == node->topology) {
+                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
+                        true, node->name);
+                rc = ORTE_ERR_SILENT;
+                goto error;
+            }
+            /* get the root object as we are not assigning
+             * locale except at the node level
+             */
+            obj = hwloc_get_root_obj(node->topology);
+            if (NULL == obj) {
+                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
+                        true, node->name);
+                rc = ORTE_ERR_SILENT;
+                goto error;
+            }
+
+            /* add the node to the map, if needed */
+            if (!node->mapped) {
+                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
+                    ORTE_ERROR_LOG(rc);
+                    goto error;
+                }
+                node->mapped = true;
+                OBJ_RETAIN(node);  /* maintain accounting on object */
+                jdata->map->num_nodes++;
+            }
+
+            /* get the number of available pus */
+            if (opal_hwloc_use_hwthreads_as_cpus) {
+                total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_PU, 0, OPAL_HWLOC_AVAILABLE);
+            } else {
+                total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_CORE, 0, OPAL_HWLOC_AVAILABLE);
+            }
+            if (bynode) {
+                if (total_npus < num_procs_to_assign * orte_rmaps_base.cpus_per_rank) {
+                    /* check if oversubscribing is allowed */
+                    if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
+                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
+                                true, app->num_procs, app->app);
+                        rc = ORTE_ERR_SILENT;
+                        goto error;
+                    } else {
+                        node->oversubscribed = true;
+                    }
+                }
+            }
+            OBJ_CONSTRUCT(&numa_list, opal_list_t);
+            numa_list_active = true;
+            opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
+            if (opal_list_get_size(&numa_list) > 0) {
+                j = 0;
+                required = 0;
+                OPAL_LIST_FOREACH(numa, &numa_list, orte_rmaps_numa_node_t) {
+                    /* get the hwloc object for this numa */
+                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
+                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                        /* leave via the cleanup path so both lists are released */
+                        rc = ORTE_ERR_NOT_FOUND;
+                        goto error;
+                    }
+                    npus = opal_hwloc_base_get_npus(node->topology, obj);
+                    if (bynode) {
+                        required = ((num_procs_to_assign-j) > npus/orte_rmaps_base.cpus_per_rank) ? (npus/orte_rmaps_base.cpus_per_rank) : (num_procs_to_assign-j);
+                    } else {
+                        required = npus/orte_rmaps_base.cpus_per_rank;
+                    }
+                    for (k = 0; (k < required) && (nprocs_mapped < app->num_procs); k++) {
+                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
+                            rc = ORTE_ERR_OUT_OF_RESOURCE;
+                            goto error;
+                        }
+                        nprocs_mapped++;
+                        j++;
+                        proc->locale = obj;
+                    }
+                    if ((nprocs_mapped == (int)app->num_procs) || (bynode && ((int)num_procs_to_assign == j))) {
+                        break;
+                    }
+                }
+                opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
+                        "mca:rmaps:mindist: assigned %d procs to node %s",
+                        j, node->name);
+            } else {
+                /* don't have info about pci locality */
+                orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:no-pci-locality-info",
+                        true, node->name);
+                rc = ORTE_ERR_SILENT;
+                goto error;
+            }
+            mindist_release_list(&numa_list);
+            numa_list_active = false;
+            if (bynode) {
+                nextra--;
+                if (nextra == 0) {
+                    /* the remaining nodes only take the average share */
+                    num_procs_to_assign--;
+                }
+            }
+        }
+
+        /* If we get to the end of all the nodes and still have procs remaining, then
+         * we check the oversubscribed flag - if oversubscription is allowed, then
+         * begin assigning procs round-robin *bynode* until all procs have been assigned.
+         * This ensures that the overload is evenly distributed across all nodes.
+         */
+
+        extra_procs = app->num_procs - nprocs_mapped;
+        if (extra_procs > 0) {
+            /* check if oversubscribing is allowed */
+            if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
+                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
+                        true, app->num_procs, app->app);
+                rc = ORTE_ERR_SILENT;
+                goto error;
+            }
+            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
+                    "mca:rmaps:mindist job %s is oversubscribed - performing second pass",
+                    ORTE_JOBID_PRINT(jdata->jobid));
+            num_procs_to_assign = extra_procs/num_nodes;
+            nextra = extra_procs % num_nodes;
+            if (nextra > 0) {
+                num_procs_to_assign++;
+            }
+            for (item = opal_list_get_first(&node_list);
+                    item != opal_list_get_end(&node_list);
+                    item = opal_list_get_next(item)) {
+                node = (orte_node_t*)item;
+
+                if (nprocs_mapped == app->num_procs) {
+                    break;
+                }
+                node->oversubscribed = true;
+                opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
+                        "mca:rmaps:mindist: second pass assigning %d extra procs to node %s",
+                        (int)num_procs_to_assign, node->name);
+                OBJ_CONSTRUCT(&numa_list, opal_list_t);
+                numa_list_active = true;
+                opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
+                if (opal_list_get_size(&numa_list) > 0) {
+                    numa_item = opal_list_get_first(&numa_list);
+                    k = 0;
+                    obj = hwloc_get_obj_by_type(node->topology, HWLOC_OBJ_NODE,((orte_rmaps_numa_node_t*)numa_item)->index);
+                    if (NULL == obj) {
+                        /* no such object in the topology - don't hand NULL on */
+                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                        rc = ORTE_ERR_NOT_FOUND;
+                        goto error;
+                    }
+                    npus = opal_hwloc_base_get_npus(node->topology, obj);
+                    for (j = 0; j < (int)num_procs_to_assign && nprocs_mapped < (int)app->num_procs; j++) {
+                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
+                            rc = ORTE_ERR_OUT_OF_RESOURCE;
+                            goto error;
+                        }
+                        nprocs_mapped++;
+                        k++;
+                        proc->locale = obj;
+                        /* move on to the next numa once this one holds its
+                         * share; ">=" rather than "> share-1" so a share of
+                         * zero cannot wrap around in unsigned arithmetic
+                         */
+                        if (k >= npus/orte_rmaps_base.cpus_per_rank) {
+                            numa_item = opal_list_get_next(numa_item);
+                            if (numa_item == opal_list_get_end(&numa_list)) {
+                                numa_item = opal_list_get_first(&numa_list);
+                            }
+                            obj = hwloc_get_obj_by_type(node->topology, HWLOC_OBJ_NODE,((orte_rmaps_numa_node_t*)numa_item)->index);
+                            if (NULL == obj) {
+                                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                                rc = ORTE_ERR_NOT_FOUND;
+                                goto error;
+                            }
+                            npus = opal_hwloc_base_get_npus(node->topology, obj);
+                            k = 0;
+                        }
+                    }
+                }
+                mindist_release_list(&numa_list);
+                numa_list_active = false;
+                nextra--;
+                if (nextra == 0) {
+                    num_procs_to_assign--;
+                }
+            }
+        }
+
+        /* compute vpids and add proc objects to the job - do this after
+         * each app_context so that the ranks within each context are
+         * contiguous
+         */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
+            ORTE_ERROR_LOG(rc);
+            /* release the node list instead of returning directly */
+            goto error;
+        }
+
+        /* track the total number of processes we mapped - must update
+         * this value AFTER we compute vpids so that computation
+         * is done correctly
+         */
+        jdata->num_procs += app->num_procs;
+
+        /* cleanup the node list - it can differ from one app_context
+         * to another, so we have to get it every time
+         */
+        mindist_release_list(&node_list);
+    }
+
+    return ORTE_SUCCESS;
+
+error:
+    /* the numa list only exists between its construct and release calls */
+    if (numa_list_active) {
+        mindist_release_list(&numa_list);
+    }
+    mindist_release_list(&node_list);
+
+    return rc;
+}

File orte/mca/rmaps/rmaps_types.h

 #define ORTE_MAPPING_BYL1CACHE         8
 #define ORTE_MAPPING_BYCORE            9
 #define ORTE_MAPPING_BYHWTHREAD        10
+#define ORTE_MAPPING_BYDIST            11
 /* convenience - declare anything <= 15 to be round-robin*/
 #define ORTE_MAPPING_RR                0x000f
 /* sequential policy */