Commits

Jeff Squyres  committed 8b3fcdd

Up to SVN r28451

  • Participants
  • Parent commits 369a242

Comments (0)

Files changed (65)

 ./opal/mca/event/libevent2021/libevent/compile
 ./opal/mca/event/libevent2021/libevent/include/Makefile.in
 ./opal/mca/event/libevent2021/libevent/include/event2/event-config.h
+./opal/mca/event/libevent2021/libevent/m4/ltsugar.m4
+./opal/mca/event/libevent2021/libevent/m4/libtool.m4
+./opal/mca/event/libevent2021/libevent/m4/ltversion.m4
+./opal/mca/event/libevent2021/libevent/m4/lt~obsolete.m4
+./opal/mca/event/libevent2021/libevent/m4/ltoptions.m4
+./opal/mca/event/external/Makefile.in
 ./opal/mca/base/Makefile.in
 ./opal/mca/base/*.obj
 ./opal/mca/base/mca_base_parse_paramfile_lex.c

File config/orte_configure_options.m4

 if test "$enable_orte_progress_threads" = "yes"; then
     AC_MSG_RESULT([yes])
     orte_enable_progress_threads=1
-    # require libevent thread support
-    if test "$enable_event_thread_support" != "yes" ; then
-        AC_MSG_WARN([ORTE progress threads require libevent thread])
-        AC_MSG_WARN([be enabled. Please configure with])
-        AC_MSG_WARN([--enable-event-thread-support])
-        AC_MSG_ERROR([Cannot continue])
-    fi
 else
     AC_MSG_RESULT([no])
     orte_enable_progress_threads=0

File contrib/platform/greenplum/bend/linux

-enable_event_thread_support=yes
 enable_opal_multi_threads=yes
 enable_orte_progress_threads=yes
 enable_ft_thread=no

File contrib/platform/greenplum/bend/linux-optimized

-enable_event_thread_support=yes
 enable_opal_multi_threads=yes
 enable_orte_progress_threads=yes
 enable_ft_thread=no

File contrib/platform/greenplum/bend/mac

-enable_event_thread_support=yes
 enable_opal_multi_threads=yes
 enable_orte_progress_threads=yes
 enable_mem_debug=yes

File contrib/platform/greenplum/bend/mac-optimized

-enable_event_thread_support=yes
 enable_opal_multi_threads=yes
 enable_orte_progress_threads=yes
 enable_mem_debug=no

File contrib/platform/iu/odin/debug

 enable_mem_debug=yes
 with_blcr=no
-with_verbs=no
+with_verbs=yes
 enable_debug_symbols=yes
 enable_binaries=yes
 with_devel_headers=yes

File contrib/platform/iu/odin/debug.conf

 #
 
 # Basic behavior to smooth startup
-mca_component_show_load_errors = 1
-mpi_param_check = 0 
+mca_base_component_show_load_errors = 1
 orte_abort_timeout = 10 
 hwloc_base_mem_bind_failure_action = silent
 
 #oob_tcp_connect_timeout=600 
 
 ## Define the MPI interconnects
-btl = sm,tcp,self 
+btl = sm,openib,self 
+btl_openib_cpc_include = udcm
 
 ## Setup shared memory
 btl_sm_free_list_max = 768

File ompi/attribute/attribute.c

  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2013 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2006-2012 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2012      Los Alamos National Security, LLC.  All rights
  *                         reserved. 
  * $COPYRIGHT$
  *
  * Example A: INTEGER ret
  *            CALL MPI_ATTR_GET(..., ret, ierr)
- *            --> ret will equal &foo, possibly truncaed
+ *            --> ret will equal &foo, possibly truncated
  * Example B: INTEGER ret
  *            CALL MPI_ATTR_GET(..., ret, ierr)
- *            --> ret will equal &bar, possibly truncaed
+ *            --> ret will equal &bar, possibly truncated
  *
  * 3. Fortran MPI-2 reads the attribute value.  The C pointer is cast
  * to a fortran INTEGER(KIND=MPI_ADDRESS_KIND) (i.e., a (MPI_Aint)).
 
 #include "ompi_config.h"
 
-#include "ompi/attribute/attribute.h"
 #include "opal/class/opal_bitmap.h"
 #include "opal/threads/mutex.h"
+#include "opal/sys/atomic.h"
 
+#include "ompi/attribute/attribute.h"
 #include "ompi/constants.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"  /* ompi_communicator_t generated in [COPY|DELETE]_ATTR_CALLBACKS */
    Ick.
  */
 
-#define DELETE_ATTR_CALLBACKS(type, attribute, keyval_obj, object) \
+#define DELETE_ATTR_CALLBACKS(type, attribute, keyval_obj, object, err)     \
     if (0 != (keyval_obj->attr_flag & OMPI_KEYVAL_F77)) { \
         MPI_Fint f_key = OMPI_INT_2_FINT(key); \
         MPI_Fint f_err; \
                 (&(((ompi_##type##_t *)object)->attr_##type##_f), \
                  &f_key, &attr_val, &keyval_obj->extra_state.f_integer, &f_err); \
             if (MPI_SUCCESS != OMPI_FINT_2_INT(f_err)) { \
-		OPAL_THREAD_UNLOCK(&attr_hash_lock);	 \
-                return OMPI_FINT_2_INT(f_err); \
+                err = OMPI_FINT_2_INT(f_err);           \
             } \
         } \
         /* MPI-2 Fortran-style */ \
                 (&(((ompi_##type##_t *)object)->attr_##type##_f), \
                  &f_key, (int*)&attr_val, &keyval_obj->extra_state.f_address, &f_err); \
             if (MPI_SUCCESS != OMPI_FINT_2_INT(f_err)) { \
-		OPAL_THREAD_UNLOCK(&attr_hash_lock);	 \
-                return OMPI_FINT_2_INT(f_err); \
+                err = OMPI_FINT_2_INT(f_err); \
             } \
         } \
     } \
     /* C style */ \
     else { \
         void *attr_val = translate_to_c(attribute); \
-        if ((err = (*((keyval_obj->delete_attr_fn).attr_##type##_delete_fn)) \
-                            ((ompi_##type##_t *)object, \
-                            key, attr_val, \
-                            keyval_obj->extra_state.c_ptr)) != MPI_SUCCESS) {\
-	    OPAL_THREAD_UNLOCK(&attr_hash_lock);			\
-            return err;\
-        } \
+        err = (*((keyval_obj->delete_attr_fn).attr_##type##_delete_fn)) \
+            ((ompi_##type##_t *)object,                                 \
+             key, attr_val,                                             \
+             keyval_obj->extra_state.c_ptr);                            \
     }
 
 /* See the big, long comment above from DELETE_ATTR_CALLBACKS -- most of
    that text applies here, too. */
 
-#define COPY_ATTR_CALLBACKS(type, old_object, keyval_obj, in_attr, new_object, out_attr) \
+#define COPY_ATTR_CALLBACKS(type, old_object, keyval_obj, in_attr, new_object, out_attr, err) \
     if (0 != (keyval_obj->attr_flag & OMPI_KEYVAL_F77)) { \
         MPI_Fint f_key = OMPI_INT_2_FINT(key); \
         MPI_Fint f_err; \
                  &f_key, &keyval_obj->extra_state.f_integer, \
                  &in, &out, &f_flag, &f_err); \
             if (MPI_SUCCESS != OMPI_FINT_2_INT(f_err)) { \
-                OPAL_THREAD_UNLOCK(&attr_hash_lock); \
-                return OMPI_FINT_2_INT(f_err); \
-            } \
-            out_attr->av_value = (void*) 0; \
-            *out_attr->av_integer_pointer = out; \
-            flag = OMPI_LOGICAL_2_INT(f_flag); \
+                err = OMPI_FINT_2_INT(f_err);           \
+            } else {                                    \
+                out_attr->av_value = (void*) 0;         \
+                *out_attr->av_integer_pointer = out;    \
+                flag = OMPI_LOGICAL_2_INT(f_flag);      \
+            }                                           \
         } \
         /* MPI-2 Fortran-style */ \
         else { \
                  &f_key, &keyval_obj->extra_state.f_address, &in, &out, \
                  &f_flag, &f_err); \
             if (MPI_SUCCESS != OMPI_FINT_2_INT(f_err)) { \
-                OPAL_THREAD_UNLOCK(&attr_hash_lock); \
-                return OMPI_FINT_2_INT(f_err); \
-            } \
-            out_attr->av_value = (void *) out; \
-            flag = OMPI_LOGICAL_2_INT(f_flag); \
+                err = OMPI_FINT_2_INT(f_err);           \
+            } else {                                    \
+                out_attr->av_value = (void *) out;      \
+                flag = OMPI_LOGICAL_2_INT(f_flag);      \
+            }                                           \
         } \
     } \
     /* C style */ \
         in = translate_to_c(in_attr); \
         if ((err = (*((keyval_obj->copy_attr_fn).attr_##type##_copy_fn)) \
               ((ompi_##type##_t *)old_object, key, keyval_obj->extra_state.c_ptr, \
-               in, &out, &flag, (ompi_##type##_t *)(new_object))) != MPI_SUCCESS) { \
-            OPAL_THREAD_UNLOCK(&attr_hash_lock); \
-            return err; \
-        } \
-        out_attr->av_value = out; \
+               in, &out, &flag, (ompi_##type##_t *)(new_object))) == MPI_SUCCESS) { \
+            out_attr->av_value = out;                                   \
+        }                                                               \
     }
 
 
  */
 typedef struct attribute_value_t {
     opal_object_t super;
+    int av_key;
     void *av_value;
     MPI_Aint *av_address_kind_pointer;
     MPI_Fint *av_integer_pointer;
     int av_set_from;
+    int av_sequence;
 } attribute_value_t;
 
 
 static void *translate_to_c(attribute_value_t *val);
 static MPI_Fint translate_to_fortran_mpi1(attribute_value_t *val);
 static MPI_Aint translate_to_fortran_mpi2(attribute_value_t *val);
+static int compare_attr_sequence(const void *attr1, const void *attr2);
 
 
 /*
 
 static opal_hash_table_t *keyval_hash;
 static opal_bitmap_t *key_bitmap;
+static int attr_sequence;
 static unsigned int int_pos = 12345;
 
 /*
- * Have one lock protect all access to any attribute hashes on MPI
- * objects and one to protect access to any other attribute stuff
- * (keyval hash, key bitmap,).  We could eventually go for finer
- * grained locking, but this will do for now.  Originally had only
- * one lock, but this can lead to a deadlock if the delete callback
- * for an attribute attempts to free a keyval
- *
- * Notes:
- * 1. If you need both locks simultaneously, ALWAYS take
- *   the keyval_hash_lock lock first.
- * 2. You *must* always drop both locks before calling any user defined
- *   callbacks (ie copy/delete callbacks)
- *
+ * We used to have multiple locks for semi-fine-grained locking.  But
+ * the code got complex, and we had to spend time looking for subtle
+ * bugs.  Craziness -- MPI attributes are *not* high performance, so
+ * just use a One Big Lock approach: there is *no* concurrent access.
+ * If you have the lock, you can do whatever you want and no data will
+ * change/disappear from underneath you.
  */
-static opal_mutex_t keyval_hash_lock;
-static opal_mutex_t attr_hash_lock;
+static opal_mutex_t attribute_lock;
 
 
 /*
  */
 static void attribute_value_construct(attribute_value_t *item)
 {
+    item->av_key = MPI_KEYVAL_INVALID;
     item->av_address_kind_pointer = (MPI_Aint*) &item->av_value;
     item->av_integer_pointer = &(((MPI_Fint*) &item->av_value)[int_pos]);
     item->av_set_from = 0;
+    item->av_sequence = -1;
 }
 
 
 static void 
 ompi_attribute_keyval_destruct(ompi_attribute_keyval_t *keyval) 
 {
-    /* THIS FUNCTION ASSUMES THAT THE CALLER ALREADY HAS OBTAINED THE
-       keyval_hash_lock MUTEX!  Remove the keyval entry from the hash and free
-       the key. */
-
     if (-1 != keyval->key) {
         /* If the bindings_extra_state pointer is not NULL, free it */
         if (NULL != keyval->bindings_extra_state) {
 
 /* 
  * This will initialize the main list to store key- attribute
- * items. This will be called one time, mostly during MPI_INIT()
+ * items. This will be called one time, during MPI_INIT().
  */
-
 int ompi_attr_init(void)
 {
     int ret;
         }
     }
 
-    OBJ_CONSTRUCT(&keyval_hash_lock, opal_mutex_t);
-    OBJ_CONSTRUCT(&attr_hash_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&attribute_lock, opal_mutex_t);
 
     if (OMPI_SUCCESS != (ret = opal_hash_table_init(keyval_hash,
                                                     ATTR_TABLE_SIZE))) {
 
 
 /* 
- * This will destroy the list, mostly during MPI_Finalize()
+ * Cleanup everything during MPI_Finalize().
  */
-
 int ompi_attr_finalize(void)
 {
-    int ret;
-
-    ret = ompi_attr_free_predefined();
+    ompi_attr_free_predefined();
+    OBJ_DESTRUCT(&attribute_lock);
     OBJ_RELEASE(keyval_hash);
     OBJ_RELEASE(key_bitmap);
 
-    return ret;
+    return OMPI_SUCCESS;
 }
 
+/*****************************************************************************/
 
 static int ompi_attr_create_keyval_impl(ompi_attribute_type_t type,
                             ompi_attribute_fn_ptr_union_t copy_attr_fn,
                             ompi_attribute_fn_ptr_union_t delete_attr_fn,
-                                        int *key,
-                                        ompi_attribute_fortran_ptr_t *extra_state,
-                                        int flags,
+                            int *key,
+                            ompi_attribute_fortran_ptr_t *extra_state,
+                            int flags,
                             void *bindings_extra_state)
 {
     ompi_attribute_keyval_t *keyval;
     int ret;
 
     /* Allocate space for the list item */
-
     keyval = OBJ_NEW(ompi_attribute_keyval_t);
     if (NULL == keyval) {
         return MPI_ERR_SYSRESOURCE;
     /* Fill in the list item (must be done before we set the keyval
        on the keyval_hash in case some other thread immediately reads
        it from the keyval_hash) */
-  
     keyval->copy_attr_fn = copy_attr_fn;
     keyval->delete_attr_fn = delete_attr_fn;
     keyval->extra_state = *extra_state;
     keyval->bindings_extra_state = bindings_extra_state;
 
     /* Create a new unique key and fill the hash */
-  
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
+    OPAL_THREAD_LOCK(&attribute_lock);
     ret = CREATE_KEY(key);
     if (OMPI_SUCCESS == ret) {
         keyval->key = *key;
         ret = opal_hash_table_set_value_uint32(keyval_hash, *key, keyval);
     }
+
     if (OMPI_SUCCESS != ret) {
         OBJ_RELEASE(keyval);
     } else {
-	ret = MPI_SUCCESS;
+        ret = MPI_SUCCESS;
     }
 
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
-    return MPI_SUCCESS;
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+    return ret;
 }
 
 int ompi_attr_create_keyval(ompi_attribute_type_t type,
                                         bindings_extra_state);
 }
 
+/*****************************************************************************/
+
 int ompi_attr_free_keyval(ompi_attribute_type_t type, int *key, 
                           bool predefined)
 {
     ompi_attribute_keyval_t *keyval;
 
     /* Find the key-value pair */
-
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
+    OPAL_THREAD_LOCK(&attribute_lock);
     ret = opal_hash_table_get_value_uint32(keyval_hash, *key, 
                                            (void **) &keyval);
-  
     if ((OMPI_SUCCESS != ret) || (NULL == keyval) || 
         (keyval->attr_type != type) ||
         ((!predefined) && (keyval->attr_flag & OMPI_KEYVAL_PREDEFINED))) {
-        OPAL_THREAD_UNLOCK(&keyval_hash_lock);
+        OPAL_THREAD_UNLOCK(&attribute_lock);
         return OMPI_ERR_BAD_PARAM;
     }
 
     /* MPI says to set the returned value to MPI_KEYVAL_INVALID */
-
     *key = MPI_KEYVAL_INVALID;
 
     /* This will delete the key only when no attributes are associated
        with it, else it will just decrement the reference count, so that when
        the last attribute is deleted, this object gets deleted too */
+    OBJ_RELEASE(keyval);
 
-    OBJ_RELEASE(keyval);
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
 
     return MPI_SUCCESS;
 }
 
-
-int ompi_attr_delete(ompi_attribute_type_t type, void *object, 
-                     opal_hash_table_t *attr_hash, int key,
-                     bool predefined)
-{
-    ompi_attribute_keyval_t *keyval;
-    int ret = OMPI_SUCCESS, err;
-    attribute_value_t *attr;
-
-    /* Check if the key is valid in the master keyval hash */
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
-    ret = opal_hash_table_get_value_uint32(keyval_hash, key, 
-                                           (void **) &keyval);
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
-
-    if ((OMPI_SUCCESS != ret) || (NULL == keyval) ||
-        (keyval->attr_type!= type) ||
-        ((!predefined) && (keyval->attr_flag & OMPI_KEYVAL_PREDEFINED))) {
-        ret = OMPI_ERR_BAD_PARAM;
-	return ret;
-    }
-
-    OPAL_THREAD_LOCK(&attr_hash_lock);
-    /* Ensure that we don't have an empty attr_hash */
-    if (NULL == attr_hash) {
-        ret = OMPI_ERR_BAD_PARAM;
-	OPAL_THREAD_UNLOCK(&attr_hash_lock);
-	return ret;
-    }
-
-    /* Check if the key is valid for the communicator/window/dtype. If
-       yes, then delete the attribute and key entry from the object's
-       hash */
-
-    /* Note that this function can be invoked by
-       ompi_attr_delete_all() to set attributes on the new object (in
-       addition to the top-level MPI_* functions that set attributes). */
-
-    ret = opal_hash_table_get_value_uint32(attr_hash, key, (void**) &attr);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
-
-    if (OMPI_SUCCESS == ret) {
-        switch (type) {
-        case COMM_ATTR:
-            DELETE_ATTR_CALLBACKS(communicator, attr, keyval, object);
-            break;
-                
-        case WIN_ATTR:
-            DELETE_ATTR_CALLBACKS(win, attr, keyval, object);
-            break;
-                
-        case TYPE_ATTR:
-            DELETE_ATTR_CALLBACKS(datatype, attr, keyval, object);
-            break;
-                
-        default:
-            ret = MPI_ERR_INTERN;
-            goto exit;
-        }
-        OBJ_RELEASE(attr);
-    
-	OPAL_THREAD_LOCK(&attr_hash_lock);
-        ret = opal_hash_table_remove_value_uint32(attr_hash, key);
-	OPAL_THREAD_UNLOCK(&attr_hash_lock);
-
-        if (OMPI_SUCCESS != ret) {
-            goto exit;
-        }
-    }
-
-
- exit:
-    /* Decrement the ref count for the keyval.  If ref count goes to
-       0, destroy the keyval (the destructor deletes the key
-       implicitly for this object).  The ref count will only go to 0
-       here if MPI_*_FREE_KEYVAL was previously invoked and we just
-       freed the last attribute that was using the keyval. */
-
-    if (OMPI_SUCCESS == ret) {
-        OPAL_THREAD_LOCK(&keyval_hash_lock);
-        OBJ_RELEASE(keyval);
-        OPAL_THREAD_UNLOCK(&keyval_hash_lock);
-    }
-
-    return ret;
-}
-
+/*****************************************************************************/
 
 /*
  * Front-end function called by the C MPI API functions to set an
                     opal_hash_table_t **attr_hash,
                     int key, void *attribute, bool predefined)
 {
+    int ret;
     attribute_value_t *new_attr = OBJ_NEW(attribute_value_t);
     if (NULL == new_attr) {
         return MPI_ERR_SYSRESOURCE;
     }
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     new_attr->av_value = attribute;
     new_attr->av_set_from = OMPI_ATTRIBUTE_C;
-    return set_value(type, object, attr_hash, key, new_attr,
-                     predefined);
+    ret = set_value(type, object, attr_hash, key, new_attr, predefined);
+    if (OMPI_SUCCESS != ret) {
+        OBJ_RELEASE(new_attr);
+    }
+
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+
+    return ret;
 }
 
 
                                int key, MPI_Fint attribute, 
                                bool predefined)
 {
+    int ret;
     attribute_value_t *new_attr = OBJ_NEW(attribute_value_t);
     if (NULL == new_attr) {
         return MPI_ERR_SYSRESOURCE;
     }
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     new_attr->av_value = (void *) 0;
     *new_attr->av_integer_pointer = attribute;
     new_attr->av_set_from = OMPI_ATTRIBUTE_FORTRAN_MPI1;
-    return set_value(type, object, attr_hash, key, new_attr,
-                     predefined);
+    ret = set_value(type, object, attr_hash, key, new_attr, predefined);
+    if (OMPI_SUCCESS != ret) {
+        OBJ_RELEASE(new_attr);
+    }
+
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+
+    return ret;
 }
 
 
                                int key, MPI_Aint attribute, 
                                bool predefined)
 {
+    int ret;
     attribute_value_t *new_attr = OBJ_NEW(attribute_value_t);
     if (NULL == new_attr) {
         return MPI_ERR_SYSRESOURCE;
     }
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     new_attr->av_value = (void *) attribute;
     new_attr->av_set_from = OMPI_ATTRIBUTE_FORTRAN_MPI2;
-    return set_value(type, object, attr_hash, key, new_attr,
-                     predefined);
+    ret = set_value(type, object, attr_hash, key, new_attr, predefined);
+    if (OMPI_SUCCESS != ret) {
+        OBJ_RELEASE(new_attr);
+    }
+
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+
+    return ret;
 }
 
+/*****************************************************************************/
 
 /*
  * Front-end function called by the C MPI API functions to get
     attribute_value_t *val = NULL;
     int ret;
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     ret = get_value(attr_hash, key, &val, flag);
     if (MPI_SUCCESS == ret && 1 == *flag) {
         *attribute = translate_to_c(val);
     }
 
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
     return ret;
 }
 
     attribute_value_t *val = NULL;
     int ret;
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     ret = get_value(attr_hash, key, &val, flag);
     if (MPI_SUCCESS == ret && 1 == *flag) {
         *attribute = translate_to_fortran_mpi1(val);
     }
 
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
     return ret;
 }
 
     attribute_value_t *val = NULL;
     int ret;
 
+    OPAL_THREAD_LOCK(&attribute_lock);
+
     ret = get_value(attr_hash, key, &val, flag);
     if (MPI_SUCCESS == ret && 1 == *flag) {
         *attribute = translate_to_fortran_mpi2(val);
     }
 
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
     return ret;
 }
 
+/*****************************************************************************/
 
 /*
- * Copy all the attributes from one MPI object to another
+ * Copy all the attributes from one MPI object to another.  Called
+ * when MPI objects are copied (e.g., back-end actions to
+ * MPI_COMM_DUP).
  */
 int ompi_attr_copy_all(ompi_attribute_type_t type, void *old_object, 
                        void *new_object, opal_hash_table_t *oldattr_hash,
     ompi_attribute_keyval_t *hash_value;
 
     /* If there's nothing to do, just return */
-
     if (NULL == oldattr_hash) {
         return MPI_SUCCESS;
     }
 
-    /* Lock this whole sequence of events -- don't let any other
-       thread modify the structure of the attrbitue hash or bitmap
-       while we're traversing it */
+    OPAL_THREAD_LOCK(&attribute_lock);
 
-    OPAL_THREAD_LOCK(&attr_hash_lock);
     /* Get the first attribute in the object's hash */
     ret = opal_hash_table_get_first_key_uint32(oldattr_hash, &key, 
                                                (void **) &old_attr,
                                                &node);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
 
     /* While we still have some attribute in the object's key hash */
     while (OMPI_SUCCESS == ret) {
 
         /* Get the keyval in the main keyval hash - so that we know
            what the copy_attr_fn is */
-
-	OPAL_THREAD_LOCK(&keyval_hash_lock);
         err = opal_hash_table_get_value_uint32(keyval_hash, key, 
                                                (void **) &hash_value);
-	OPAL_THREAD_UNLOCK(&keyval_hash_lock);
+        if (OMPI_SUCCESS != err) {
+            /* This should not happen! */
+            ret = MPI_ERR_INTERN;
+            goto out;
+        }
 
+        err = 0;
         new_attr = OBJ_NEW(attribute_value_t);
         switch (type) {
-        case UNUSED_ATTR:  /* keep the compiler happy */
-            assert(0);
-            break;
         case COMM_ATTR:
             /* Now call the copy_attr_fn */
             COPY_ATTR_CALLBACKS(communicator, old_object, hash_value, 
-                                old_attr, new_object, new_attr);
+                                old_attr, new_object, new_attr, err);
             break;
             
         case TYPE_ATTR:
             /* Now call the copy_attr_fn */
             COPY_ATTR_CALLBACKS(datatype, old_object, hash_value, 
-                                old_attr, new_object, new_attr);
+                                old_attr, new_object, new_attr, err);
             break;
 
         case WIN_ATTR:
             /* Now call the copy_attr_fn */
             COPY_ATTR_CALLBACKS(win, old_object, hash_value, 
-                                old_attr, new_object, new_attr);
+                                old_attr, new_object, new_attr, err);
             break;
+
+        default:
+            /* This should not happen */
+            assert(0);
+            break;
+        }
+        /* Did the callback return non-MPI_SUCCESS? */
+        if (0 != err) {
+            goto out;
         }
 
         /* Hang this off the object's hash */
             } else {
                 new_attr->av_set_from = OMPI_ATTRIBUTE_C;
             }
-            set_value(type, new_object, &newattr_hash, key, 
-                      new_attr, true);
-
+            ret = set_value(type, new_object, &newattr_hash, key, 
+                            new_attr, true);
+            if (MPI_SUCCESS != ret) {
+                goto out;
+            }
         } else {
             OBJ_RELEASE(new_attr);
         }
 
-	OPAL_THREAD_LOCK(&attr_hash_lock);
         ret = opal_hash_table_get_next_key_uint32(oldattr_hash, &key, 
                                                   (void **) &old_attr, 
                                                   in_node, &node);
-	OPAL_THREAD_UNLOCK(&attr_hash_lock);
+    }
+    ret = MPI_SUCCESS;
+
+ out:
+    /* All done */
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+    return ret;
+}
+
+/*****************************************************************************/
+
+/*
+ * Back-end function to delete a single attribute.
+ *
+ * Assumes that you DO already have the attribute_lock.
+ */
+static int ompi_attr_delete_impl(ompi_attribute_type_t type, void *object, 
+                                 opal_hash_table_t *attr_hash, int key,
+                                 bool predefined)
+{
+    ompi_attribute_keyval_t *keyval;
+    int ret = OMPI_SUCCESS;
+    attribute_value_t *attr;
+
+    /* Check if the key is valid in the master keyval hash */
+    ret = opal_hash_table_get_value_uint32(keyval_hash, key, 
+                                           (void **) &keyval);
+
+    if ((OMPI_SUCCESS != ret) || (NULL == keyval) ||
+        (keyval->attr_type!= type) ||
+        ((!predefined) && (keyval->attr_flag & OMPI_KEYVAL_PREDEFINED))) {
+        ret = OMPI_ERR_BAD_PARAM;
+        goto exit;
     }
 
-    /* All done */
+    /* Ensure that we don't have an empty attr_hash */
+    if (NULL == attr_hash) {
+        ret = OMPI_ERR_BAD_PARAM;
+        goto exit;
+    }
 
-    return MPI_SUCCESS;
+    /* Check if the key is valid for the communicator/window/dtype. If
+       yes, then delete the attribute and key entry from the object's
+       hash */
+    ret = opal_hash_table_get_value_uint32(attr_hash, key, (void**) &attr);
+    if (OMPI_SUCCESS == ret) {
+        switch (type) {
+        case COMM_ATTR:
+            DELETE_ATTR_CALLBACKS(communicator, attr, keyval, object, ret);
+            break;
+
+        case WIN_ATTR:
+            DELETE_ATTR_CALLBACKS(win, attr, keyval, object, ret);
+            break;
+
+        case TYPE_ATTR:
+            DELETE_ATTR_CALLBACKS(datatype, attr, keyval, object, ret);
+            break;
+
+        default:
+            /* This should not happen */
+            assert(0);
+            break;
+        }
+        if (MPI_SUCCESS != ret) {
+            goto exit;
+        }
+
+        /* Ignore the return value at this point; it can't help any
+           more */
+        (void) opal_hash_table_remove_value_uint32(attr_hash, key);
+        OBJ_RELEASE(attr);
+    }
+
+ exit:
+    /* Decrement the ref count for the keyval.  If ref count goes to
+       0, destroy the keyval (the destructor deletes the key
+       implicitly for this object).  The ref count will only go to 0
+       here if MPI_*_FREE_KEYVAL was previously invoked and we just
+       freed the last attribute that was using the keyval. */
+    if (OMPI_SUCCESS == ret) {
+        OBJ_RELEASE(keyval);
+    }
+
+    return ret;
 }
 
+/*
+ * Front-end function to delete a single attribute.
+ */
+int ompi_attr_delete(ompi_attribute_type_t type, void *object, 
+                     opal_hash_table_t *attr_hash, int key,
+                     bool predefined)
+{
+    int ret;
+
+    OPAL_THREAD_LOCK(&attribute_lock);
+    ret = ompi_attr_delete_impl(type, object, attr_hash, key, predefined);
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+    return ret;
+}
 
 /*
- * Delete all the attributes on an MPI object
+ * Front-end function to delete all the attributes on an MPI object
  */
 int ompi_attr_delete_all(ompi_attribute_type_t type, void *object, 
                          opal_hash_table_t *attr_hash)
 {
-    int key_ret, del_ret;
-    uint32_t key, oldkey;
-    void *node, *in_node, *old_attr;
+    int ret, i, num_attrs;
+    uint32_t key;
+    void *node, *in_node, *attr;
+    attribute_value_t **attrs;
 
     /* Ensure that the table is not empty */
 
     if (NULL == attr_hash) {
         return MPI_SUCCESS;
     }
-        
-    /* Lock this whole sequence of events -- don't let any other
-       thread modify the structure of the attribute hash or bitmap
-       while we're traversing it */
 
-    OPAL_THREAD_LOCK(&attr_hash_lock);
-    /* Get the first key in local object's hash  */
-    key_ret = opal_hash_table_get_first_key_uint32(attr_hash,
-                                               &key, &old_attr,
-                                               &node);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
+    OPAL_THREAD_LOCK(&attribute_lock);
 
-    del_ret = OMPI_SUCCESS;
-    while (OMPI_SUCCESS == key_ret && OMPI_SUCCESS == del_ret) {
+    /* Make an array that contains all attributes in local object's hash */
+    num_attrs = opal_hash_table_get_size(attr_hash);
+    if (0 == num_attrs) {
+        OPAL_THREAD_UNLOCK(&attribute_lock);
+        return MPI_SUCCESS;
+    }
 
-        /* Save this node info for deletion, before we move onto the
-           next node */
+    attrs = malloc(sizeof(attribute_value_t *) * num_attrs);
+    if (NULL == attrs) {
+        OPAL_THREAD_UNLOCK(&attribute_lock);
+        return MPI_ERR_SYSRESOURCE;
+    }
 
+    ret = opal_hash_table_get_first_key_uint32(attr_hash, &key, &attr, &node);
+    for (i = 0; OMPI_SUCCESS == ret; i++) {
+        attrs[i] = attr;
         in_node = node;
-        oldkey = key;
-        
-        /* Move to the next node */
+        ret = opal_hash_table_get_next_key_uint32(attr_hash, &key, &attr,
+                                                  in_node, &node);
+    }
 
-	OPAL_THREAD_LOCK(&attr_hash_lock);
-        key_ret = opal_hash_table_get_next_key_uint32(attr_hash,
-                                                      &key, &old_attr, 
-                                                      in_node, &node);
-	OPAL_THREAD_UNLOCK(&attr_hash_lock);
+    /* Sort attributes in the order that they were set */
+    qsort(attrs, num_attrs, sizeof(attribute_value_t *), compare_attr_sequence);
 
-        /* Now delete this attribute */
-
-        del_ret = ompi_attr_delete(type, object, attr_hash, oldkey, true);
+    /* Delete attributes in the reverse order that they were set.
+       Actually this ordering is required only for MPI_COMM_SELF, as
+       specified in MPI-2.2: 8.7.1 Allowing User Functions at Process
+       Termination, but we do it for everything -- what the heck.
+       :-) */
+    for (i = num_attrs - 1; i >= 0; i--) {
+        ret = ompi_attr_delete_impl(type, object, attr_hash, 
+                                    attrs[i]->av_key, true);
+        if (OMPI_SUCCESS != ret) {
+            break;
+        }
     }
 
     /* All done */
 
-    return del_ret;
+    free(attrs);
+    opal_atomic_wmb();
+    OPAL_THREAD_UNLOCK(&attribute_lock);
+    return ret;
 }
 
 /*************************************************************************/
 
 /*
- * Back-end function to set an attribute on an MPI object
+ * Back-end function to set an attribute on an MPI object.  Assumes
+ * that you already hold the attribute_lock.
  */
 static int set_value(ompi_attribute_type_t type, void *object, 
                      opal_hash_table_t **attr_hash, int key, 
                      bool predefined)
 {
     ompi_attribute_keyval_t *keyval;
-    int ret, err;
+    int ret;
     attribute_value_t *old_attr;
     bool had_old = false;
 
     /* Note that this function can be invoked by ompi_attr_copy_all()
        to set attributes on the new object (in addition to the
        top-level MPI_* functions that set attributes). */
-
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
     ret = opal_hash_table_get_value_uint32(keyval_hash, key, 
                                            (void **) &keyval);
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
 
     /* If key not found */
-
     if ((OMPI_SUCCESS != ret ) || (NULL == keyval) || 
         (keyval->attr_type != type) ||
         ((!predefined) && (keyval->attr_flag & OMPI_KEYVAL_PREDEFINED))) {
     }
 
     /* Do we need to make a new attr_hash? */
-    OPAL_THREAD_LOCK(&attr_hash_lock);
     if (NULL == *attr_hash) {
         ompi_attr_hash_init(attr_hash);
     }
 
     /* Now see if an attribute is already present in the object's hash
        on the old keyval. If so, delete the old attribute value. */
-
     ret = opal_hash_table_get_value_uint32(*attr_hash, key, (void**) &old_attr);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
-
     if (OMPI_SUCCESS == ret)  {
         switch (type) {
         case COMM_ATTR:
-            DELETE_ATTR_CALLBACKS(communicator, old_attr, keyval, object);
+            DELETE_ATTR_CALLBACKS(communicator, old_attr, keyval, object, ret);
             break;
 
         case WIN_ATTR:
-            DELETE_ATTR_CALLBACKS(win, old_attr, keyval, object);
+            DELETE_ATTR_CALLBACKS(win, old_attr, keyval, object, ret);
             break;
 
         case TYPE_ATTR:
-            DELETE_ATTR_CALLBACKS(datatype, old_attr, keyval, object);
+            DELETE_ATTR_CALLBACKS(datatype, old_attr, keyval, object, ret);
             break;
 
         default:
-            return MPI_ERR_INTERN;
+            /* This should not happen */
+            assert(0);
+            break;
+        }
+        if (MPI_SUCCESS != ret) {
+            return ret;
         }
         had_old = true;
-        OBJ_RELEASE(old_attr);
     }
 
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
     ret = opal_hash_table_get_value_uint32(keyval_hash, key,
                                            (void **) &keyval);
     if ((OMPI_SUCCESS != ret ) || (NULL == keyval)) {
-	/* Keyval has disappeared underneath us. Someone must have
-	   called ompi_attr_free_keyval since we last looked it up
-	   in the hash. We'll behave as if we never found it in the
-	   first place */
-	OPAL_THREAD_UNLOCK(&keyval_hash_lock);
-	return OMPI_ERR_BAD_PARAM;
+        /* Keyval has disappeared underneath us -- this shouldn't
+           happen! */
+        assert(0);
+        return OMPI_ERR_BAD_PARAM;
     }
 
-    OPAL_THREAD_LOCK(&attr_hash_lock);
+    new_attr->av_key = key;
+    new_attr->av_sequence = attr_sequence++;
+
     ret = opal_hash_table_set_value_uint32(*attr_hash, key, new_attr);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
 
     /* Increase the reference count of the object, only if there was no
        old attribute/no old entry in the object's key hash */
-
     if (OMPI_SUCCESS == ret && !had_old) {
         OBJ_RETAIN(keyval);
     }
 
-    if (OMPI_SUCCESS != ret) {
-        return ret;
-    }
-
-    return MPI_SUCCESS;
+    return ret;
 }
 
+/*************************************************************************/
 
 /*
  * Back-end function to get an attribute from the hash map and return
  * it to the caller.  Translation services are not provided -- they're
  * in small, standalone functions that are called from several
  * different places.
+ *
+ * Assumes that you do NOT already have the attribute lock.
  */
 static int get_value(opal_hash_table_t *attr_hash, int key, 
                      attribute_value_t **attribute, int *flag)
        hash).  If the keyval exists but no attribute is associated
        with the key, then the call is valid and returns FALSE in the
        flag argument */
-
     *flag = 0;
-    OPAL_THREAD_LOCK(&keyval_hash_lock);
     ret = opal_hash_table_get_value_uint32(keyval_hash, key, 
                                            (void**) &keyval);
-    OPAL_THREAD_UNLOCK(&keyval_hash_lock);
-
     if (OMPI_ERR_NOT_FOUND == ret) {
         return MPI_KEYVAL_INVALID;
     }
 
     /* If we have a null attr_hash table, that means that nothing has
        been cached on this object yet.  So just return *flag = 0. */
-
-    OPAL_THREAD_LOCK(&attr_hash_lock);
     if (NULL == attr_hash) {
-	OPAL_THREAD_UNLOCK(&attr_hash_lock);
         return OMPI_SUCCESS;
     }
 
     ret = opal_hash_table_get_value_uint32(attr_hash, key, &attr);
-    OPAL_THREAD_UNLOCK(&attr_hash_lock);
     if (OMPI_SUCCESS == ret) {
         *attribute = (attribute_value_t*)attr;
         *flag = 1;
     }
+
     return OMPI_SUCCESS;
 }
 
+/*************************************************************************/
 
 /*
  * Take an attribute and translate it according to the cases listed in
         return 0;
     }
 }
+
+/*
+ * Comparator for qsort() to sort attributes in the order that they were set.
+ */
+static int compare_attr_sequence(const void *attr1, const void *attr2)
+{
+    return (*(attribute_value_t **)attr1)->av_sequence -
+           (*(attribute_value_t **)attr2)->av_sequence;
+}

File ompi/attribute/attribute.h

 static inline
 int ompi_attr_hash_init(opal_hash_table_t **hash)
 {
-   *hash = OBJ_NEW(opal_hash_table_t);
-    if (NULL == hash) {
+    *hash = OBJ_NEW(opal_hash_table_t);
+    if (NULL == *hash) {
         fprintf(stderr, "Error while creating the local attribute list\n");
         return MPI_ERR_SYSRESOURCE;
     }
  * Get an attribute on the comm/win/datatype in a form valid for
  * Fortran MPI-2.
  *
- * @param attrhash       The attribute hash table hanging on the object(IN)
+ * @param attr_hash      The attribute hash table hanging on the object(IN)
  * @param key            Key val for the attribute (IN)
  * @param attribute      The actual attribute pointer (OUT)
  * @param flag           Flag whether an attribute is associated 
 
 
 /** 
- * This to be used from functions like MPI_*_DUP inorder to copy all
+ * This to be used from functions like MPI_*_DUP in order to copy all
  * the attributes from the old Comm/Win/Dtype object to a new
  * object. 
  * @param type         Type of attribute (COMM/WIN/DTYPE) (IN)
  * @param old_object   The old COMM/WIN/DTYPE object (IN)
  * @param new_object   The new COMM/WIN/DTYPE object (IN)
- * @param attr_hash    The attribute hash table hanging on old object(IN)
+ * @param oldattr_hash The attribute hash table hanging on old object(IN)
  * @param newattr_hash The attribute hash table hanging on new object(IN)
  * @return OMPI error code
  *
  */
 
 int ompi_attr_copy_all(ompi_attribute_type_t type, void *old_object, 
-                      void *new_object, opal_hash_table_t *oldattr_hash,
-                      opal_hash_table_t *newkeyhash);
+                       void *new_object, opal_hash_table_t *oldattr_hash,
+                       opal_hash_table_t *newattr_hash);
 
 
 /** 

File ompi/class/ompi_rb_tree.c

 
 #include "ompi/class/ompi_rb_tree.h"
 
-/* declare the instance of the classes  */
-OBJ_CLASS_INSTANCE(ompi_rb_tree_node_t, ompi_free_list_item_t, NULL, NULL);
-OBJ_CLASS_INSTANCE(ompi_rb_tree_t, opal_object_t, ompi_rb_tree_construct,
-                   ompi_rb_tree_destruct);
-
 /* Private functions */
 static void btree_insert(ompi_rb_tree_t *tree, ompi_rb_tree_node_t * node);
 static void btree_delete_fixup(ompi_rb_tree_t *tree, ompi_rb_tree_node_t * x);
                               ompi_rb_tree_node_t * node);
 
 
-
-/* constructor */
-void ompi_rb_tree_construct(opal_object_t * object)
+/**
+ * the constructor function. creates the free list to get the nodes from
+ *
+ * @param object the tree that is to be used
+ *
+ * @retval NONE
+ */
+static void ompi_rb_tree_construct(opal_object_t * object)
 {
     ompi_rb_tree_t * tree = (ompi_rb_tree_t *) object;
     tree->root_ptr = NULL;
             0, -1 , 128, NULL);
 }
 
-/* the destructor function */
-void ompi_rb_tree_destruct(opal_object_t * object)
+/**
+ * the destructor function. Frees the tree and destroys the free list.
+ *
+ * @param object the tree object
+ */
+static void ompi_rb_tree_destruct(opal_object_t * object)
 {
     if(NULL != ((ompi_rb_tree_t *)object)->root_ptr) {
         ompi_rb_tree_destroy((ompi_rb_tree_t *) object);
     return;
 }
 
+/* declare the instance of the classes  */
+OBJ_CLASS_INSTANCE(ompi_rb_tree_node_t, ompi_free_list_item_t, NULL, NULL);
+OBJ_CLASS_INSTANCE(ompi_rb_tree_t, opal_object_t, ompi_rb_tree_construct,
+                   ompi_rb_tree_destruct);
+
 /* Create the tree */
 int ompi_rb_tree_init(ompi_rb_tree_t * tree,
                       ompi_rb_tree_comp_fn_t comp)

File ompi/class/ompi_rb_tree.h

  */
 
 /**
-  * red and black enum
-  */
+ * red and black enum
+ */
 typedef enum {RED, BLACK} ompi_rb_tree_nodecolor_t;
 
 /**
-  * node data structure
-  */
+ * node data structure
+ */
 struct ompi_rb_tree_node_t
 {
-    ompi_free_list_item_t super;             /**< the parent class */
+    ompi_free_list_item_t super;        /**< the parent class */
     ompi_rb_tree_nodecolor_t color;     /**< the node color */
     struct ompi_rb_tree_node_t * parent;/**< the parent node, can be NULL */
     struct ompi_rb_tree_node_t * left;  /**< the left child - can be nill */
 typedef struct ompi_rb_tree_node_t ompi_rb_tree_node_t;
 
 /**
-  * the compare function typedef. This function is used to compare 2 nodes.
-  */
+ * the compare function typedef. This function is used to compare 2 nodes.
+ */
 typedef int (*ompi_rb_tree_comp_fn_t)(void *key1, void *key2);
 
 /**
-  * the data structure that holds all the needed information about the tree.
-  */
+ * the data structure that holds all the needed information about the tree.
+ */
 struct ompi_rb_tree_t {
     opal_object_t parent;           /**< the parent class */
     /* this root pointer doesn't actually point to the root of the tree.
      * rather, it points to a sentinel node whose left branch is the real
      * root of the tree. This is done to eliminate special cases */
     ompi_rb_tree_node_t * root_ptr;/**< a pointer to the root of the tree */
-    ompi_rb_tree_node_t * nill;     /**< the nill sentinal node */
+    ompi_rb_tree_node_t * nill;     /**< the nill sentinel node */
     ompi_rb_tree_comp_fn_t comp;    /**< the compare function */
     ompi_free_list_t free_list;   /**< the free list to get the memory from */
     size_t tree_size;                  /**< the size of the tree */
 
 /* Function pointers for map traversal function */
 /**
-  * this function is used for the ompi_rb_tree_traverse function.
-  * it is passed a pointer to the value for each node and, if it returns
-  * a one, the action function is called on that node. Otherwise, the node is ignored.
-  */
+ * this function is used for the ompi_rb_tree_traverse function.
+ * it is passed a pointer to the value for each node and, if it returns
+ * a one, the action function is called on that node. Otherwise, the node is ignored.
+ */
 typedef int (*ompi_rb_tree_condition_fn_t)(void *);
 /**
-  * this function is uused for the user to perform any action on the passed
-  * values. The first argument is the key and the second is the value.
-  * note that this function SHOULD NOT modify the keys, as that would
-  * mess up the tree.
-  */
+ * this function is used for the user to perform any action on the passed
+ * values. The first argument is the key and the second is the value.
+ * note that this function SHOULD NOT modify the keys, as that would
+ * mess up the tree.
+ */
 typedef void (*ompi_rb_tree_action_fn_t)(void *, void *);
 
 /*
  */
 
 /**
-  * the construct function. creates the free list to get the nodes from
-  *
-  * @param object the tree that is to be used
-  *
-  * @retval NONE
-  */
-OMPI_DECLSPEC void ompi_rb_tree_construct(opal_object_t * object);
+ * the function creates a new tree
+ *
+ * @param tree a pointer to an allocated area of memory for the main
+ *  tree data structure.
+ * @param comp a pointer to the function to use for comparing 2 nodes
+ *
+ * @retval OMPI_SUCCESS if it is successful
+ * @retval OMPI_ERR_TEMP_OUT_OF_RESOURCE if unsuccessful
+ */
+OMPI_DECLSPEC int ompi_rb_tree_init(ompi_rb_tree_t * tree, ompi_rb_tree_comp_fn_t comp);
+
 
 /**
-  * the destruct function. tries to free the tree and destroys the free list
-  *
-  * @param object the tree object
-  */
-OMPI_DECLSPEC void ompi_rb_tree_destruct(opal_object_t * object);
-
-/**
-  * the function creates a new tree
-  *
-  * @param tree a pointer to an allocated area of memory for the main
-  *  tree data structure.
-  * @param comp a pointer to the function to use for comaparing 2 nodes
-  *
-  * @retval OMPI_SUCCESS if it is successful
-  * @retval OMPI_ERR_TEMP_OUT_OF_RESOURCE if unsuccessful
-  */
-OMPI_DECLSPEC int ompi_rb_tree_init(ompi_rb_tree_t * tree, ompi_rb_tree_comp_fn_t comp);
-
-
-/**
-  * inserts a node into the tree
-  *
-  * @param tree a pointer to the tree data structure
-  * @param key the key for the node
-  * @param value the value for the node
-  *
-  * @retval OMPI_SUCCESS
-  * @retval OMPI_ERR_TEMP_OUT_OF_RESOURCE if unsuccessful
-  */
+ * inserts a node into the tree
+ *
+ * @param tree a pointer to the tree data structure
+ * @param key the key for the node
+ * @param value the value for the node
+ *
+ * @retval OMPI_SUCCESS
+ * @retval OMPI_ERR_TEMP_OUT_OF_RESOURCE if unsuccessful
+ */
 OMPI_DECLSPEC int ompi_rb_tree_insert(ompi_rb_tree_t *tree, void * key, void * value);
 
 /**
-  * finds a value in the tree based on the passed key using passed
-  * compare function
-  *
-  * @param tree a pointer to the tree data structure
-  * @param key a pointer to the key
-  * @param compare function
-  *
-  * @retval pointer to the value if found
-  * @retval NULL if not found
-  */
+ * finds a value in the tree based on the passed key using passed
+ * compare function
+ *
+ * @param tree a pointer to the tree data structure
+ * @param key a pointer to the key
+ * @param compfn the compare function to use
+ *
+ * @retval pointer to the value if found
+ * @retval NULL if not found
+ */
 OMPI_DECLSPEC void * ompi_rb_tree_find_with(ompi_rb_tree_t *tree, void *key, ompi_rb_tree_comp_fn_t compfn);
 
 /**
-  * finds a value in the tree based on the passed key
-  *
-  * @param tree a pointer to the tree data structure
-  * @param key a pointer to the key
-  *
-  * @retval pointer to the value if found
-  * @retval NULL if not found
-  */
+ * finds a value in the tree based on the passed key
+ *
+ * @param tree a pointer to the tree data structure
+ * @param key a pointer to the key
+ *
+ * @retval pointer to the value if found
+ * @retval NULL if not found
+ */
 static inline void * ompi_rb_tree_find(ompi_rb_tree_t *tree, void *key)
 {
-        return ompi_rb_tree_find_with(tree, key, tree->comp);
+    return ompi_rb_tree_find_with(tree, key, tree->comp);
 }
 
 /**
-  * deletes a node based on its key
-  *
-  * @param tree a pointer to the tree data structure
-  * @param key a pointer to the key
-  *
-  * @retval OMPI_SUCCESS if the node is found and deleted
-  * @retval OMPI_ERR_NOT_FOUND if the node is not found
-  */
+ * deletes a node based on its key
+ *
+ * @param tree a pointer to the tree data structure
+ * @param key a pointer to the key
+ *
+ * @retval OMPI_SUCCESS if the node is found and deleted
+ * @retval OMPI_ERR_NOT_FOUND if the node is not found
+ */
 OMPI_DECLSPEC int ompi_rb_tree_delete(ompi_rb_tree_t *tree, void *key);
 
 /**
-  * frees all the nodes on the tree
-  *
-  * @param tree a pointer to the tree data structure
-  *
-  * @retval OMPI_SUCCESS
-  */
+ * frees all the nodes on the tree
+ *
+ * @param tree a pointer to the tree data structure
+ *
+ * @retval OMPI_SUCCESS
+ */
 OMPI_DECLSPEC int ompi_rb_tree_destroy(ompi_rb_tree_t *tree);
 
 /**
-  * traverses the entire tree, performing the cond function on each of the
-  * values and if it returns one it performs the action function on the values
-  *
-  * @param tree a pointer to the tree
-  * @param cond a pointer to the condition function
-  * @param action a pointer to the action function
-  *
-  * @retval OMPI_SUCCESS
-  * @retval OMPI_ERROR if there is an error
-  */
+ * traverses the entire tree, performing the cond function on each of the
+ * values and if it returns one it performs the action function on the values
+ *
+ * @param tree a pointer to the tree
+ * @param cond a pointer to the condition function
+ * @param action a pointer to the action function
+ *
+ * @retval OMPI_SUCCESS
+ * @retval OMPI_ERROR if there is an error
+ */
 OMPI_DECLSPEC int ompi_rb_tree_traverse(ompi_rb_tree_t *tree,
                           ompi_rb_tree_condition_fn_t cond,
                           ompi_rb_tree_action_fn_t action);
 
 /**
-  * returns the size of the tree
-  *
-  * @param tree a pointer to the tree data structure
-  *
-  * @retval int the nuber of items on the tree
-  */
+ * returns the size of the tree
+ *
+ * @param tree a pointer to the tree data structure
+ *
+ * @retval int the number of items on the tree
+ */
 OMPI_DECLSPEC int ompi_rb_tree_size(ompi_rb_tree_t *tree);
 
 END_C_DECLS

File ompi/mca/btl/base/btl_base_frame.c

  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006-2007 Sun Microsystems, Inc.  All rights reserved.
- * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2008-2013 Cisco Systems, Inc.  All rights reserved.
  * $COPYRIGHT$
  * 
  * Additional copyrights may follow
 
   OBJ_CONSTRUCT(&mca_btl_base_modules_initialized, opal_list_t);
 
+  /* get the verbosity so that BTL_VERBOSE will work */
+  mca_btl_base_verbose = opal_output_get_verbosity(ompi_btl_base_framework.framework_output);
+
   /* All done */
   return OMPI_SUCCESS;
 }
 
 static int mca_btl_base_close(void)
 {
-    mca_btl_base_selected_module_t *sm;
+    mca_btl_base_selected_module_t *sm, *next;
 
 #if 0
     /* disable event processing while cleaning up btls */
 #endif
     /* Finalize all the btl components and free their list items */
 
-    OPAL_LIST_FOREACH(sm, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
+    OPAL_LIST_FOREACH_SAFE(sm, next, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
         /* Blatantly ignore the return code (what would we do to recover,
            anyway?  This component is going away, so errors don't matter
            anymore) */
 
         sm->btl_module->btl_finalize(sm->btl_module);
+        opal_list_remove_item(&mca_btl_base_modules_initialized, &sm->super);
         free(sm);
     }
 
 
     (void) mca_base_framework_components_close(&ompi_btl_base_framework, NULL);
 
+    OBJ_DESTRUCT(&mca_btl_base_modules_initialized);
+
 #if 0
     /* restore event processing */
     opal_event_enable();

File ompi/mca/btl/openib/btl_openib.c

 
     mem_total = calculate_total_mem ();
 
-    if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
+    if (0 == stat("/sys/module/mlx5_core", &statinfo)) {
+        /* mlx5 means that we have ofed 2.0 and it can always register 2xmem_total for any mlx hca */
+        max_reg = 2 * mem_total;
+    }
+    else if (0 == stat("/sys/module/mlx4_core/parameters", &statinfo)) {
         mtts_per_seg = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1);
         num_mtt = 1 << read_module_param("/sys/module/mlx4_core/parameters/log_num_mtt", 20);
         if (1 == num_mtt) {

File ompi/mca/btl/openib/btl_openib_mca.c

     }
 
     snprintf(default_qps, 100,
-            "P,128,256,192,128:S,%u,256,128,32:S,%u,256,128,32:S,%u,256,128,32",
+            "P,128,256,192,128:S,%u,1024,1008,64:S,%u,1024,1008,64:S,%u,1024,1008,64",
             mid_qp_size,
             (uint32_t)mca_btl_openib_module.super.btl_eager_limit,
             (uint32_t)mca_btl_openib_module.super.btl_max_send_size);

File ompi/mca/btl/openib/connect/btl_openib_connect_base.c

              "Method used to exclude OpenFabrics connections (valid values: %s)",
              all_cpc_names);
 
-    btl_openib_cpc_include = NULL;
+    btl_openib_cpc_exclude = NULL;
     (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                            "cpc_exclude", string, MCA_BASE_VAR_TYPE_STRING,
                                            NULL, 0, 0, OPAL_INFO_LVL_9,

File ompi/mca/btl/openib/connect/btl_openib_connect_udcm.c

  * The UD connection module creates and listens on an unconnected
  * datagram (UD) queue pair (QP) for connections requests.
  *
- * This connection method uses a two-step process:
- * Step 1 (CONNECT):
- *   A connect request is sent/received using an unconnected
- *   datagram queue pair.
- * Step 2 (SYNC):
- *   The connection is then synced by sending a 0-byte request
- *   on a per-peer rc queue pair.
- *    * This step is required to avoid a race condition between
- *      the last UD message and the first RC BTL message.
- *
  * There are two ways a connection can be established by UD:
  *  1. One side starts a connection and the request is received before
  *    the receiving side starts a connection. (One sided)
 
 #include <pthread.h>
 
+#include "opal/util/show_help.h"
 #include "opal/util/output.h"
 #include "opal/util/error.h"
 #include "opal_stdint.h"
     opal_list_t  cm_recv_msg_queue;
     bool         cm_message_event_active;
 
-    /* ID of next outgoing message */
-    uint32_t     next_message_id;
-
     /* The associated BTL */
     struct mca_btl_openib_module_t *btl;
 
 
 typedef struct udcm_msg_hdr {
     udcm_message_type_t type;
-    uint32_t id;
 
     /* endpoint local to the sender */
     mca_btl_base_endpoint_t *rem_ep;
 /*--------------------------------------------------------------------*/
 
 #define UDCM_MIN_RECV_COUNT 512
-#define UDCM_MIN_TIMEOUT    1000000
+#define UDCM_MIN_TIMEOUT    500000
 
 #define UDCM_SEND_CQ_SIZE   512
 
 #define UDCM_WR_RECV_ID  0x20000000ll
 #define UDCM_WR_SEND_ID  0x10000000ll
+#define UDCM_WR_ACK_ID  0x10000000ll
 #define UDCM_WR_DIR_MASK 0x30000000ll
 
 /* Useless 40 bytes of data that precedes received scatter gather data.
 
     m->cm_exiting = true;
 
+    /* stop monitoring the channel's fd before destroying the listen qp */
+    ompi_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, (void *)&barrier);
+
+    while (0 == barrier) {
+	sched_yield();
+    }
+
     opal_mutex_lock (&m->cm_lock);
 
     opal_mutex_lock (&m->cm_recv_msg_queue_lock);
 
     BTL_VERBOSE(("destroying listing thread"));
 
-    /* stop monitoring the channel's fd before destroying the listen qp */
-    ompi_btl_openib_fd_unmonitor(m->cm_channel->fd, udcm_unmonitor, (void *)&barrier);
-
-    while (0 == barrier) {
-	sched_yield();
-    }
-
     /* destroy the listen queue pair. this will cause ibv_get_cq_event to
        return. */
     udcm_module_destroy_listen_qp (m);
     message->endpoint = lcl_ep;
 
     opal_atomic_wmb ();
-    opal_mutex_lock(&m->cm_lock);
-    message->data->id = m->next_message_id++;
-    opal_mutex_unlock(&m->cm_lock);
 
     *msgp = message;
 
-    BTL_VERBOSE(("created message %d", message->data->id));
+    BTL_VERBOSE(("created message with type %d", type));
 
     return OMPI_SUCCESS;
 }
     return 0;
 }
 
-static int udcm_send_ack (mca_btl_base_endpoint_t *lcl_ep, uint32_t id)
+static int udcm_send_ack (mca_btl_base_endpoint_t *lcl_ep)
 {
     udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
     udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep);
     /* NTH: need to lock here or we run into problems */
     opal_mutex_lock(&m->cm_send_lock);
 
-    wr.wr_id      = UDCM_WR_SEND_ID;
+    BTL_VERBOSE(("sending ack for message on ep %p", (void *) lcl_ep));
+
+    wr.wr_id      = UDCM_WR_ACK_ID;
     wr.next       = NULL;
     wr.num_sge    = 0;
+    /* use imm flag to signal the other side that this is an ack */
     wr.opcode     = IBV_WR_SEND_WITH_IMM;
     wr.send_flags = IBV_SEND_SOLICITED | IBV_SEND_SIGNALED;
     wr.wr.ud.ah   = udep->ah;
 
-    wr.imm_data   = id;
+    wr.imm_data   = 0; /* dom't care */
 
     wr.wr.ud.remote_qpn  = UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num;
     wr.wr.ud.remote_qkey = 0;
     return rc;
 }
 
-static int udcm_handle_ack (udcm_module_t *m, const uint32_t id)
+static int udcm_handle_ack (udcm_module_t *m, const uint32_t id, const uint16_t slid,
+			    const uint32_t rem_qp)
 {
-    opal_list_item_t *item;
-    udcm_message_sent_t *msg = NULL;
+    udcm_message_sent_t *msg, *next;
 
     pthread_mutex_lock (&m->cm_timeout_lock);
 
-    BTL_VERBOSE(("got ack for message %d", id));
+    BTL_VERBOSE(("got ack for message 0x%08x from slid 0x%04x qp 0x%08x", id, slid,
+		 rem_qp));
 
-    for (item = opal_list_get_first (&m->flying_messages) ;
-	 item != opal_list_get_end (&m->flying_messages) ;
-	 item = opal_list_get_next (item)) {
-	msg = (udcm_message_sent_t *) item;
+    OPAL_LIST_FOREACH_SAFE(msg, next, &m->flying_messages, udcm_message_sent_t) {
+	const struct mca_btl_base_endpoint_t *lcl_ep = msg->endpoint;
 
-	if (msg->data && id == msg->data->id) {
-	    opal_list_remove_item (&m->flying_messages, item);
+	if (NULL == msg->data || NULL == msg->endpoint) {
+	    /* shouldn't happen */
+	    opal_list_remove_item(&m->flying_messages, &msg->super);
+	    continue;
+	}
+
+	if (slid == UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_lid &&
+	    rem_qp == UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num) {
+	    BTL_VERBOSE(("found matching message"));
+	    opal_list_remove_item (&m->flying_messages, &msg->super);
 	    
 	    break;
 	}
-	msg = NULL;
     }
 
     pthread_mutex_unlock (&m->cm_timeout_lock);
 
     mca_btl_openib_endpoint_cpc_complete(lcl_ep);
 
+    lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED;
+
     return OMPI_SUCCESS;
 }
 
     udcm_endpoint_t *udep;
     uint64_t dir;
 
+    memset(wc, 0, sizeof(wc));
+
     count = ibv_poll_cq (event_cq, 20, wc);
     if (count < 0)
 	return count;
 
-    for (i = 0 ; i < count && !m->cm_exiting ; i++) {
+    for (i = 0 ; i < count ; i++) {
 	dir = wc[i].wr_id & UDCM_WR_DIR_MASK;
 
+	BTL_VERBOSE(("WC: wr_id: 0x%016" PRIu64 ", status: %d, opcode: 0x%x, byte_len: %x, imm_data: 0x%08x, "
+		     "qp_num: 0x%08x, src_qp: 0x%08x, wc_flags: 0x%x, slid: 0x%04x\n",
+		     wc[i].wr_id, wc[i].status, wc[i].opcode, wc[i].byte_len,
+		     wc[i].imm_data, wc[i].qp_num, wc[i].src_qp, wc[i].wc_flags, wc[i].slid));
+
 	if (UDCM_WR_RECV_ID != dir) {
 	    opal_output (0, "unknown packet");
 	    continue;
 
 	if (wc[i].wc_flags & IBV_WC_WITH_IMM) {
 	    /* ack! */
-	    udcm_handle_ack (m, wc[i].imm_data);
+	    udcm_handle_ack (m, wc[i].imm_data, wc[i].slid, wc[i].src_qp);
 	    udcm_module_post_one_recv (m, msg_num);
 
 	    continue;
 	opal_list_append (&m->cm_recv_msg_queue, &item->super);
 	opal_mutex_unlock(&m->cm_recv_msg_queue_lock);
 	
-	udcm_send_ack (lcl_ep, msg_hdr->id);
+	udcm_send_ack (lcl_ep);
 
 	/* Repost the receive */
 	udcm_module_post_one_recv (m, msg_num);
     }
 
-    if (!m->cm_exiting) {
-	opal_mutex_lock (&m->cm_recv_msg_queue_lock);
-	if (opal_list_get_size (&m->cm_recv_msg_queue) &&
-	    !m->cm_message_event_active) {
-	    m->cm_message_event_active = true;
- 	    ompi_btl_openib_fd_run_in_main (udcm_message_callback, (void *) m);
-	}
-	opal_mutex_unlock (&m->cm_recv_msg_queue_lock);
+    opal_mutex_lock (&m->cm_recv_msg_queue_lock);
+    if (opal_list_get_size (&m->cm_recv_msg_queue) &&
+	!m->cm_message_event_active) {
+	m->cm_message_event_active = true;
+	ompi_btl_openib_fd_run_in_main (udcm_message_callback, (void *) m);
     }
+    opal_mutex_unlock (&m->cm_recv_msg_queue_lock);
 
     return count;
 }
     void *event_context;
     int rc;
 
-    if (OPAL_UNLIKELY(NULL == m || NULL == m->cm_channel)) {
-	return NULL;
-    }
+    opal_mutex_lock (&m->cm_lock);
 
-    rc = ibv_get_cq_event (m->cm_channel, &event_cq, &event_context);
+    do {
+	if (OPAL_UNLIKELY(NULL == m || NULL == m->cm_channel)) {
+	    break;
+	}
 
-    if (0 != rc || NULL == event_cq) {
-	return NULL;
-    }
+	rc = ibv_get_cq_event (m->cm_channel, &event_cq, &event_context);
 
-    /* acknowlege the event */
-    ibv_ack_cq_events (event_cq, 1);
+	if (0 != rc || NULL == event_cq) {
+	    break;
+	}
 
-    rc = udcm_process_messages (event_cq, m);
-    if (rc < 0) {
-	BTL_VERBOSE(("error processing incomming messages"));
-	return NULL;
-    }
+	/* acknowledge the event */
+	ibv_ack_cq_events (event_cq, 1);
 
-    if (false == m->cm_exiting) {
+	if (m->cm_exiting) {
+	    break;
+	}
+
+	rc = udcm_process_messages (event_cq, m);
+	if (rc < 0) {
+	    BTL_VERBOSE(("error processing incomming messages"));
+	    break;
+	}
+
 	if (ibv_req_notify_cq(event_cq, 0)) {
 	    BTL_VERBOSE(("error asking for cq notifications"));
 	}
-    }
+    } while (0);
+
+    opal_mutex_unlock (&m->cm_lock);
 
     return NULL;
 }
     mca_btl_base_endpoint_t *lcl_ep = msg->endpoint;
 
     do {
-	BTL_VERBOSE(("send for message %d timed out (msg = %p)", msg->data->id,
-		     (void *) msg));
+	BTL_VERBOSE(("send for message to 0x%04x:0x%08x timed out",
+		     UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_lid,
+		     UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num));
 
-	/* This happens from time to time at the end of a run (probably die to a</