Source

mpi3-fortran / ompi / mca / btl / openib / btl_openib_failover.c

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
/*
 * Copyright (c) 2010-2011 Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC.  All rights
 *                         reserved. 
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 * Functions specific to implementing failover support.
 *
 * This file is conditionally compiled into the BTL when one configures
 * it in with --enable-openib-failover.  When this file is compiled
 * in, the multi-BTL configurations can handle errors.  The
 * requirement is that there needs to be more than one openib BTL in
 * use so that all the traffic can move to the other BTL.  This does
 * not support failing over to a different BTL like TCP.
 */

#include "ompi_config.h"
#include "opal_stdint.h"

#include "btl_openib.h"
#include "btl_openib_endpoint.h"
#include "btl_openib_proc.h"
#include "btl_openib_failover.h"

/* Forward declarations of file-local helpers that are defined further
 * down in this file but used by the error handlers above their
 * definitions. */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout);
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
                                           uint8_t type, int index);

/* debug functions that are normally not needed */
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
void mca_btl_openib_dump_all_internal_queues(bool errout);
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);

/**
 * This function is called when we get an error on the completion
 * event of a fragment.  We check to see what type of fragment it is
 * and act accordingly.  In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL.  The first control message will tell the
 * remote side to also map out this connection.  The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state.  See that function for more details.
 *
 * Everything that is needed from the descriptor after its completion
 * callback has run (fragment type, eager RDMA marker) is captured
 * beforehand, because the callback and mca_btl_openib_free() may
 * release the descriptor.
 *
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          ompi_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;
    /* Snapshot of the fragment type; des must not be touched again once
     * its completion callback has been invoked. */
    const int frag_type = openib_frag_type(des);
    /* Non-zero when the failed fragment was an eager RDMA send. */
    long eager_rdma_ftr = 0;

    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag.  If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here.  If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error.  Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done.  The important information needs to be read
     * from the fragment. */

    /* Cannot issue callback to SRQ errors because the shared receive
     * queue is shared and is not specific to a connection.  There is no
     * way to figure out what type of message created the error because
     * we need the information in the wc->imm_data field which does not
     * exist when we have an error.  So, nothing to do here but return. */
    if ((MCA_BTL_OPENIB_FRAG_RECV == frag_type) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", frag_type);
        /* Need to think about returning any shared resources of the
         * SRQ.  For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug.  On failure asprintf()
     * leaves the target pointer in an unspecified state, so force it
     * back to NULL before it is handed to the PML and to free().
     * NOTE(review): openib_btl is dereferenced unconditionally further
     * down, so this NULL check does not make a NULL BTL safe. */
    if (NULL != openib_btl) {
        if (asprintf(&btlname, "lid=%d:name=%s",
                     openib_btl->lid, openib_btl->device->ib_dev->name) < 0) {
            btlname = NULL;
        }
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor.  They are not associated with a PML
     * descriptor because:
     *    A. It was a receive
     *    B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((MCA_BTL_OPENIB_FRAG_RECV == frag_type) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
         (MCA_BTL_OPENIB_FRAG_CONTROL == frag_type) ||
         (MCA_BTL_OPENIB_FRAG_EAGER_RDMA == frag_type)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                              remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer.  The change in the PML
         * layer should prevent that we ever try to send on this BTL
         * again.  If we do, then this is an error case.  */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIG_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            frag_type, (long unsigned int) des);
        free(btlname);
        return;
    }

    /* These are RDMA read type fragments.  Just continue with processing */
    if (MCA_BTL_OPENIB_FRAG_RECV_USER == frag_type) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write.  Call the PML callback function to map out this
     * btl for further sending.  We just call this every time we get an
     * error even though it is not necessary.  Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    free(btlname);

    /* Since we believe we have done a send, read or write, then the
     * des_src fields should have valid data. */
    assert(des->des_src != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_FAILED state, then
     * change the status.  Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it.  In
     * addition, send a message to other end of the connection letting
     * it know that this side is now broken.  This is needed in the case
     * of a spurious error which may not cause the remote side to detect
     * the error.  */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    if (MCA_BTL_OPENIB_FRAG_SEND == frag_type) {
        opal_list_item_t *i;

        /* Remember the eager RDMA marker now: it lives inside the
         * descriptor, which may be gone after the callback below.
         * NOTE(review): ftr is only read for SEND fragments because the
         * other types reaching this point (SEND_USER, RECV_USER) are
         * presumably not send-fragment structures -- confirm. */
        eager_rdma_ftr = (long) to_send_frag(des)->ftr;

        /* Now, call the callback function associated with the fragment.
         * In case the fragments were coalesced we need to pull them apart
         * and call the callback function for each one. */
        while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            /* Read the ownership flag first; the callback may invalidate
             * the fragment. */
            btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OMPI_ERROR);
            if( btl_ownership ) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }

    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER.  After this point des must be
     * treated as released. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OMPI_ERROR);
    if( btl_ownership ) {
        mca_btl_openib_free(&openib_btl->super, des);
    }

    /* Here we send another control message to notify the remote side
     * we had an error on a eager fragment.  A non-zero value for the
     * ftr variable indicates that this was an eager RDMA fragment.
     * We need to do this in case the eager RDMA fragment after this
     * one actually made it successfully. */
    if (0 != eager_rdma_ftr) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       (int) (eager_rdma_ftr - 1));
    }

    /* We know we have completed a send so return some resources even
     * though connection is broken.  With SRQ, the resources are shared
     * so if we do not return the credits we may not be allowed to send
     * anymore. */
    qp_put_wqe(endpoint, qp);
    if((MCA_BTL_OPENIB_FRAG_SEND == frag_type) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them.  Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path.  This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments so
     * subsequent calls are no-ops.  The endpoint has already been
     * dereferenced above, so no NULL check is needed here. */
    error_out_all_pending_frags(endpoint, &openib_btl->super, true);
}

/**
 * This function allows an error to map out the entire BTL.  First a
 * call is made up to the PML to map out all connections from this BTL.
 * Then a message is sent to all the endpoints connected to this BTL.
 * This function is enabled by the btl_openib_port_error_failover
 * MCA parameter.  If that parameter is not set, then this function
 * does not do anything.
 * @param openib_btl Pointer to BTL that had the error
 */
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
    mca_btl_base_endpoint_t* endpoint;
    char *btlname = NULL;
    int i;

    /* Check to see that the flag is set for the entire map out. */
    if (!mca_btl_openib_component.port_error_failover) {
        return;
    }

    /* Since we are not specifying a specific connection to bring down
     * (the process argument is NULL), the PML layer will map out the
     * entire BTL for future communication.  On failure asprintf()
     * leaves the target pointer in an unspecified state, so reset it
     * to NULL before it is passed on and freed. */
    if (asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name) < 0) {
        btlname = NULL;
    }
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         NULL, btlname);
    free(btlname);

    /* Now send out messages to all endpoints that we are disconnecting.
     * Only do this to endpoints that are connected.  Otherwise, the
     * remote side does not yet have the information on this endpoint.  */
    for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
        endpoint = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(openib_btl->device->endpoints, i);
        if (NULL == endpoint) {
            /* Unused slot in the endpoint array. */
            continue;
        }
        if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
            mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
    }
}

/**
 * This function gets called when a control message is received that
 * is one of the following types:
 *   MCA_BTL_OPENIB_CONTROL_EP_BROKEN
 *   MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message
 * Note that we are using the working connection to send information
 * about the broken connection.  That is why we have to look at the
 * various information in the control message to figure out which
 * endpoint is broken.  It is (obviously) not the one the message was
 * received on, because we would not have received the message in that
 * case.  In the case of the BROKEN message, that means the remote
 * side is notifying us that it has brought down its half of the
 * connection.  Therefore, we need to bring our half down.  This is
 * done because it has been observed that there are cases where only
 * one side of the connection actually sees the error.  This means we
 * can be left in a state where one side believes it has two BTLs, but
 * the other side believes it only has one.  This can cause problems.
 * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
 * we are doing.
 * @param ctl_hdr Pointer control header that was received; it is
 *                converted in place when ep->nbo is set
 * @param ep Endpoint the control message arrived on; only its nbo
 *           flag is consulted here
 */
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr,
                                                 mca_btl_openib_endpoint_t* ep)
{
    mca_btl_openib_broken_connection_header_t *bc_hdr =
        (mca_btl_openib_broken_connection_header_t*)ctl_hdr;
    int i;
    bool found = false;

    if(ep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH((*bc_hdr));
    }

    opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                        "IB: Control message received from %d: lid=%d,subnet=0x%" PRIx64 "",
                        bc_hdr->vpid, bc_hdr->lid, bc_hdr->subnet_id);

    /* Now we walk through all the endpoints on all the BTLs to
     * find out which one to map out.  */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        mca_btl_openib_module_t* newbtl;
        int j;

        newbtl = mca_btl_openib_component.openib_btls[i];
        /* Now, find the endpoint associated with it */
        for (j = 0; j < opal_pointer_array_get_size(newbtl->device->endpoints); j++) {
            mca_btl_base_endpoint_t* newep;
            newep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(newbtl->device->endpoints, j);
            if (NULL == newep) {
                continue;
            }
            /* Now compare the LID, subnet ID, and the vpid we received
             * from the remote side and try to match it to an endpoint. */
            if ((bc_hdr->lid == newep->rem_info.rem_lid) &&
                (bc_hdr->subnet_id == newep->rem_info.rem_subnet_id) &&
                (bc_hdr->vpid == newep->endpoint_proc->proc_ompi->proc_name.vpid)) {
                opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                                    "IB: Control message received from %d: "
                                    "found match: lid=%d,"
                                    "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                    newep->endpoint_proc->proc_ompi->proc_name.vpid,
                                    newep->rem_info.rem_lid,
                                    newep->rem_info.rem_subnet_id,
                                    newep->endpoint_state);
                found = true;
                /* At this point, we have found the endpoint.  Now decode the
                 * message type and do the appropriate action. */
                if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
                    /* Now that we found a match, check the state of the
                     * endpoint to see it is already in a failed state.
                     * If not, then notify the upper layer and error out
                     * any pending fragments. */
                    if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
                        return;
                    } else {
                        char *btlname = NULL;
                        ompi_proc_t* remote_proc = NULL;

                        /* On failure asprintf() leaves the target pointer
                         * in an unspecified state; reset it so neither
                         * the PML nor free() sees a garbage pointer. */
                        if (asprintf(&btlname, "lid=%d:name=%s",
                                     newbtl->lid, newbtl->device->ib_dev->name) < 0) {
                            btlname = NULL;
                        }

                        remote_proc = newep->endpoint_proc->proc_ompi;

                        opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
                                            "IB: Control message received from %d: "
                                            "bringing down connection,lid=%d,"
                                            "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                            newep->endpoint_proc->proc_ompi->proc_name.vpid,
                                            newep->rem_info.rem_lid,
                                            newep->rem_info.rem_subnet_id,
                                            newep->endpoint_state);
                        newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                                         remote_proc, btlname);
                        free(btlname);

                        /* NOTE(review): the other error paths in this file
                         * mark the endpoint FAILED before erroring out the
                         * pending fragments; here the order is reversed.
                         * Kept as is -- confirm whether it matters. */
                        error_out_all_pending_frags(newep, &newbtl->super, true);
                        newep->endpoint_state = MCA_BTL_IB_FAILED;
                        return;
                    }
                } else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
                    /* If we are still pointing at the location where
                     * we detected an error on the remote side, then
                     * bump the index by one. */
                    if (newep->eager_rdma_local.head == (uint16_t)bc_hdr->index) {
                        /* Adjust the local head by one just in case */
                        MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "moved local head by one (new=%d)",
                                            OMPI_PROC_MY_NAME->vpid,
                                            newep->endpoint_proc->proc_ompi->proc_name.vpid,
                                            newep->eager_rdma_local.head);
                    } else {
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "did not move local head by one (still=%d)",
                                            OMPI_PROC_MY_NAME->vpid,
                                            newep->endpoint_proc->proc_ompi->proc_name.vpid,
                                            newep->eager_rdma_local.head);
                    }
                }
                /* Since we found the endpoint.  NOTE(review): this only
                 * leaves the inner loop; the remaining BTLs are still
                 * scanned and may match again -- confirm this is intended. */
                break;
            }
        }
    }
    if (!found) {
        opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                            "IB: Control message: no match found");
    }
}

/**
 * This function will find all the pending fragments on an endpoint
 * and call the callback function with OMPI_ERROR.  It walks through
 * each qp with each priority and looks for both no_credits_pending_frags
 * and no_wqe_pending_frags.  It then looks for any pending_lazy_frags,
 * pending_put_frags, and pending_get_frags.  This function is only
 * called when running with failover support enabled.  Note that
 * the errout parameter allows the function to also be used as a
 * debugging tool to see if there are any fragments on any of the
 * queues.
 * @param ep Pointer to endpoint that had error
 * @param module Pointer to module that had error
 * @param errout Boolean which says whether to error them out or not
 */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout)
{
    int qp, pri, len, total, btl_ownership;

    opal_list_item_t *item;
    mca_btl_openib_com_frag_t* frag;
    mca_btl_base_descriptor_t *des;
    int verbose = 10;  /* Verbosity level unless debugging */

    /* If debugging, drop verbosity level so we can see the output
     * regardless of the level the program was run with. */
    if (false == errout) {
	verbose = 0;
    }

    total = 0;
    /* Traverse all QPs and all priorities and move to other endpoint */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* All types of qp's have a no_wqe_pending_frags list */
            len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
            if (len > 0) {
                total += len;
                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                    "IB: Checking for no_wqe_pending_frags qp=%d, "
                                    "pri=%d, list size=%d",
                                    qp, pri, len);
                if (true == errout) {
                    while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                  no_wqe_pending_frags[pri]))) {
                        frag = (mca_btl_openib_com_frag_t *) item;
                        des = (mca_btl_base_descriptor_t *)frag;

                        /* Error out any coalesced frags if they exist */
                        if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                            opal_list_item_t *i;
                            while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                    "IB: Found coalesced frag in no_wqe_pending_frags");
                                btl_ownership = (to_base_frag(i)->base.des_flags &
                                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                 &to_base_frag(i)->base, OMPI_ERROR);
                                if( btl_ownership ) {
                                    mca_btl_openib_free(module, &to_base_frag(i)->base);
                                }
                            }
                        }
                        btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                        des->des_cbfunc(module, ep, des, OMPI_ERROR);
                        if( btl_ownership ) {
                            mca_btl_openib_free(module, des);
                        }
                    }
                }
            }
            if (BTL_OPENIB_QP_TYPE_PP(qp)) {
                len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for no_credits_pending_frags qp=%d, "
                                        "pri=%d, list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                      no_credits_pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in "
                                                        "no_credits_pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OMPI_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OMPI_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }

            } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
                len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for srq pending_frags qp=%d, pri=%d, "
                                        "list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
                                                                      u.srq_qp.pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in SRQ pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OMPI_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OMPI_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            }
        }
    }

    /* Check for any frags from a connection that was never made.  Not sure if this
     * can actually happen. */
    len = opal_list_get_size(&ep->pending_lazy_frags);

    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_lazy_frags, list size=%d", len);
        if (true == errout) {
            while  (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OMPI_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_put_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_put_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OMPI_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_get_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_get_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OMPI_ERROR);
            }
        }
    }

    opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
                        "IB: Finished checking for pending_frags, total moved=%d",
                        total);
}

/* Completion callback used for failover control messages.  The BTL,
 * endpoint and status arguments are ignored; the descriptor is simply
 * handed to MCA_BTL_IB_FRAG_RETURN. */
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
                                              struct mca_btl_base_endpoint_t* endpoint,
                                              struct mca_btl_base_descriptor_t* descriptor,
                                              int status)
{
    /* Unused callback arguments. */
    (void) btl;
    (void) endpoint;
    (void) status;

    MCA_BTL_IB_FRAG_RETURN(descriptor);
}

/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * bring its endpoint down as well.  This is needed because there are
 * cases where only one side of the connection determines that there
 * was a problem.
 *
 * The message is sent through a BTL other than the one that saw the
 * error, using an endpoint to the same remote process taken from that
 * BTL's device.  If no other BTL, no usable endpoint, or no control
 * fragment is available, a verbose message is logged and nothing is
 * sent.
 *
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep = NULL;
    int i, rc, saved_errno;
    ompi_proc_t* remote_proc = endpoint->endpoint_proc->proc_ompi;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the
     * endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        /* The list is kept per device rather than per BTL, so it may
         * contain the broken endpoint itself.  Never select that one:
         * the notification has to travel over a different path. */
        if (newep == endpoint) {
            continue;
        }
        if (newep->endpoint_proc->proc_ompi == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    /* Set up the descriptor: completion callback, priority flags, the
     * QP ordering and the payload length. */
    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    /* Fill in the broken connection header.  The lid and subnet id are
     * those of the BTL that owns the broken endpoint. */
    frag->hdr->tag = MCA_BTL_TAG_BTL;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = OMPI_PROC_MY_NAME->vpid;
    bc_hdr->index = index;

    if(newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    /* Capture errno before releasing the fragment so that the release
     * cannot change the value that gets reported below. */
    saved_errno = errno;
    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(saved_errno)));
}

/*
 * Debugging helper for eager RDMA problems.
 *
 * Prints the local eager RDMA head index of the endpoint, then walks
 * all eager_rdma_num local fragments and prints one line per fragment.
 * Note that this is not a read-only dump: for every fragment the header
 * pointer is recomputed from the footer and the size stored in it, and
 * the base segment address is set to the byte right after the header.
 */
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
    mca_btl_openib_recv_frag_t *frag_array = endpoint->eager_rdma_local.frags;
    int slot;

    opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);

    for (slot = 0; slot < mca_btl_openib_component.eager_rdma_num; ++slot) {
        mca_btl_openib_recv_frag_t *cur = frag_array + slot;
        mca_btl_openib_control_header_t *ctl_hdr;
        bool is_credit_msg;
        int frag_size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(cur->ftr);

        /* Step back from the footer by the fragment size (which is
         * offset by the footer size) to locate the header. */
        cur->hdr = (mca_btl_openib_header_t*)(((char*)cur->ftr) -
                   frag_size + sizeof(mca_btl_openib_footer_t));

        /* The segment data begins immediately after the header. */
        to_base_frag(cur)->segment.base.seg_addr.pval =
            ((unsigned char* )cur->hdr) + sizeof(mca_btl_openib_header_t);
        ctl_hdr = to_base_frag(cur)->segment.base.seg_addr.pval;

        /* The control type is only examined when the tag says this is
         * a BTL-internal message. */
        is_credit_msg = (MCA_BTL_TAG_BTL == cur->hdr->tag) &&
                        (MCA_BTL_OPENIB_CONTROL_CREDITS == ctl_hdr->type);

        if (!is_credit_msg) {
            opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", slot, frag_size,
                        cur->hdr->tag, cur->ftr->u.buf[3]);
            continue;
        }
        opal_output(0, "tag[%d] is credit message", slot);
    }
}

/*
 * Debugging helper for eager RDMA problems.
 *
 * Prints the local vpid and the device name, then dumps the local eager
 * RDMA fragments of every non-NULL entry in the device's
 * eager_rdma_buffers array.  The entry count is read once up front.
 */
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
    mca_btl_openib_endpoint_t* ep;
    int slot;
    int num_buffers = device->eager_rdma_buffers_count;

    opal_output(0, "rank=%d, device=%s", OMPI_PROC_MY_NAME->vpid, device->ib_dev->name);

    for (slot = 0; slot < num_buffers; ++slot) {
        ep = device->eager_rdma_buffers[slot];

        /* Unused slots hold NULL and are simply skipped. */
        if (NULL != ep) {
            dump_local_rdma_frags(ep);
        }
    }
}

/**
 * This function is a debugging tool.  If you notice a hang, you can
 * call this function from a debugger and see if there are any
 * messages stuck in any of the queues.  If you call it with
 * errout=true, then it will error them out.  Otherwise, it will
 * just print out the size of the queues with data in them.
 *
 * Every non-NULL endpoint in the device endpoint list of every openib
 * BTL is handed to error_out_all_pending_frags().
 *
 * @param errout Passed through unchanged to error_out_all_pending_frags()
 */
void mca_btl_openib_dump_all_internal_queues(bool errout) {
    int i, j, num_eps;
    mca_btl_openib_module_t* btl;
    mca_btl_base_endpoint_t* ep;
    struct mca_btl_base_module_t* module;

    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        btl = mca_btl_openib_component.openib_btls[i];
        module = &btl->super;
        num_eps = opal_pointer_array_get_size(btl->device->endpoints);

        /* Walk every endpoint slot of the device behind this BTL;
         * empty slots are skipped. */
        for (j = 0; j < num_eps; j++) {
            ep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(btl->device->endpoints, j);
            if (NULL == ep) {
                continue;
            }

            error_out_all_pending_frags(ep, module, errout);
        }
    }
}