Checkpointing errors in parallel

Issue #86 new
James R. Maddison created an issue

Tested with FEniCS 2017.1.0 and dolfin-adjoint 2017.1.0. Enabling checkpointing leads to errors when running in parallel. For example, editing tests_dolfin/checkpoint_burgers/checkpoint_burgers.py via

diff --git a/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py b/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
index 2432715..b0d38dd 100644
--- a/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
+++ b/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
@@ -8,7 +8,7 @@ from dolfin import *
 from dolfin_adjoint import *
 from math import ceil

-n = 100
+n = 10
 mesh = UnitIntervalMesh(n)
 V = FunctionSpace(mesh, "CG", 2)

@@ -22,11 +22,13 @@ def Dt(u, u_, timestep):
 def main(ic, annotate=False):
     timestep = Constant(1.0/n)
     t = 0.0
-    end = 0.5
+    steps = 20
+    end = steps * float(timestep)
     if annotate:
-        adj_checkpointing('multistage', int(ceil(end/float(timestep))), 5, 10, verbose=True)
+        adj_checkpointing('multistage', steps, 3, 3, verbose=True)

     u_ = ic.copy(deepcopy=True, annotate=annotate)
+    u_.rename("u_", u_.label())
     u = TrialFunction(V)
     v = TestFunction(V)
     nu = Constant(0.0001)
@@ -38,14 +40,16 @@ def main(ic, annotate=False):

     bc = DirichletBC(V, 0.0, "on_boundary")

-    u = Function(V)
+    u = Function(V, name = "u")
+    w = Function(V, name = "w")
     j = 0
     j += 0.5*float(timestep)*assemble(u_*u_*dx)
     if annotate:
         adjointer.time.start(t)

-    while (t <= end):
+    for i in range(steps):
         solve(a == L, u, bc, annotate=annotate)
+        solve(inner(v, TrialFunction(V)) * dx == inner(v, u + w) * dx, w)

         u_.assign(u, annotate=annotate)

and running in parallel

mpirun -np 4 python3 checkpoint_burgers.py

may lead to an error of the type

*** -------------------------------------------------------------------------
*** DOLFIN encountered an error. If you are not able to resolve this issue
*** using the information listed below, you can ask for help at
***
***     fenics-support@googlegroups.com
***
*** Remember to include the error message listed below and, if possible,
*** include a *minimal* running example to reproduce the error.
***
*** -------------------------------------------------------------------------
*** Error:   Unable to read function from file.
*** Reason:  Group with name "u_-5-0-Forward" does not exist.
*** Where:   This error was encountered inside HDF5File.cpp.
*** Process: 1
*** 
*** DOLFIN version: 2017.1.0
*** Git changeset:  
*** -------------------------------------------------------------------------

This appears to be an issue with the order in which coefficient data is encountered on different processes, and seems to be addressed by the following change to dolfin_adjoint/coeffstore.py:

index 8d3b7d7..4dfc78f 100644
--- a/dolfin_adjoint/coeffstore.py
+++ b/dolfin_adjoint/coeffstore.py
@@ -1,11 +1,13 @@
 import libadjoint

+from collections import OrderedDict
+
 class CoeffStore(object):
     '''This object manages the mapping from Dolfin coefficients to libadjoint Variables.
     In the process, it also manages the incrementing of the timestep associated with each
     variable, so that the user does not have to manually manage the time information.'''
     def __init__(self):
-        self.coeffs = {}
+        self.coeffs = OrderedDict()
         self.libadjoint_timestep = 0
         self.str_to_coeff = {}

Comments (0)

  1. Log in to comment