Commits

Jeff Squyres committed c419909 Draft

Fixes #3503: Move TCP BTL handshake fix to v1.6

---svn-pre-commit-ignore-below---

Custom patch on the ticket based on:

r28023 [[BR]]
Fixes #3494: If we get 0 bytes back for the ACK, it doesn't
necessarily mean an error -- it could (and usually does) mean that the
peer realized that we both initiated a connect at the same time, and
therefore it decided to hang up.

I also added a friendly show_help error message for other cases where
recv_blocking() fails (i.e., "Something went wrong. Kaboom! Your job
will abort...").

Comments (0)

Files changed (2)

ompi/mca/btl/tcp/btl_tcp_endpoint.c

  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2007-2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Cisco Systems, Inc.  All rights reserved.
  * $COPYRIGHT$
  * 
  * Additional copyrights may follow
 #include <time.h>
 #endif  /* HAVE_TIME_H */
 
+#include "opal/util/net.h"
+#include "opal/util/show_help.h"
+
 #include "ompi/types.h"
 #include "ompi/mca/btl/base/btl_base_error.h"
-#include "opal/util/net.h"
 
 #include "btl_tcp.h"
 #include "btl_tcp_endpoint.h" 
         /* remote closed connection */
         if(retval == 0) {
             mca_btl_tcp_endpoint_close(btl_endpoint);
-            return -1;
+            return cnt;
         }
 
         /* socket is non-blocking so handle errors */
 static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
 {
     orte_process_name_t guid;
+    size_t s;
     mca_btl_tcp_proc_t* btl_proc = btl_endpoint->endpoint_proc;
 
-    if((mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &guid, sizeof(orte_process_name_t))) != sizeof(orte_process_name_t)) {
+    s = mca_btl_tcp_endpoint_recv_blocking(btl_endpoint,
+                                           &guid, sizeof(orte_process_name_t));
+    if (s != sizeof(orte_process_name_t)) {
+        if (0 == s) {
+            /* If we get zero bytes, the peer closed the socket. This
+               can happen when the two peers started the connection
+               protocol simultaneously. Just report the problem
+               upstream. */
+            return OMPI_ERROR;
+        }
+        opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
+                       true, orte_process_info.nodename,
+                       orte_process_info.pid, 
+                       "did not receive entire connect ACK from peer");
         return OMPI_ERR_UNREACH;
     }
     ORTE_PROCESS_NAME_NTOH(guid);

ompi/mca/btl/tcp/help-mpi-btl-tcp.txt

 # -*- text -*-
 #
-# Copyright (c) 2009-2011 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2009-2013 Cisco Systems, Inc.  All rights reserved.
 # $COPYRIGHT$
 # 
 # Additional copyrights may follow
 
   Local host: %s
   Value:      %d
+[client connect fail]
+WARNING: Open MPI failed to TCP connect to a peer MPI process.  This
+should not happen.
+
+Your Open MPI job may now fail.
+
+  Local host: %s
+  PID:        %d
+  Message:    %s
+  Error:      %s (%d)
+#
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate, and press Return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.