Checkpointing errors in parallel
Issue #86
new
Tested with FEniCS 2017.1.0 and dolfin-adjoint 2017.1.0. Enabling checkpointing leads to errors when running in parallel. For example, applying the following patch to tests_dolfin/checkpoint_burgers/checkpoint_burgers.py:
diff --git a/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py b/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
index 2432715..b0d38dd 100644
--- a/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
+++ b/tests_dolfin/checkpoint_burgers/checkpoint_burgers.py
@@ -8,7 +8,7 @@ from dolfin import *
from dolfin_adjoint import *
from math import ceil
-n = 100
+n = 10
mesh = UnitIntervalMesh(n)
V = FunctionSpace(mesh, "CG", 2)
@@ -22,11 +22,13 @@ def Dt(u, u_, timestep):
def main(ic, annotate=False):
timestep = Constant(1.0/n)
t = 0.0
- end = 0.5
+ steps = 20
+ end = steps * float(timestep)
if annotate:
- adj_checkpointing('multistage', int(ceil(end/float(timestep))), 5, 10, verbose=True)
+ adj_checkpointing('multistage', steps, 3, 3, verbose=True)
u_ = ic.copy(deepcopy=True, annotate=annotate)
+ u_.rename("u_", u_.label())
u = TrialFunction(V)
v = TestFunction(V)
nu = Constant(0.0001)
@@ -38,14 +40,16 @@ def main(ic, annotate=False):
bc = DirichletBC(V, 0.0, "on_boundary")
- u = Function(V)
+ u = Function(V, name = "u")
+ w = Function(V, name = "w")
j = 0
j += 0.5*float(timestep)*assemble(u_*u_*dx)
if annotate:
adjointer.time.start(t)
- while (t <= end):
+ for i in range(steps):
solve(a == L, u, bc, annotate=annotate)
+ solve(inner(v, TrialFunction(V)) * dx == inner(v, u + w) * dx, w)
u_.assign(u, annotate=annotate)
and then running the modified test in parallel with
mpirun -np 4 python3 checkpoint_burgers.py
may lead to an error of the following type:
*** -------------------------------------------------------------------------
*** DOLFIN encountered an error. If you are not able to resolve this issue
*** using the information listed below, you can ask for help at
***
*** fenics-support@googlegroups.com
***
*** Remember to include the error message listed below and, if possible,
*** include a *minimal* running example to reproduce the error.
***
*** -------------------------------------------------------------------------
*** Error: Unable to read function from file.
*** Reason: Group with name "u_-5-0-Forward" does not exist.
*** Where: This error was encountered inside HDF5File.cpp.
*** Process: 1
***
*** DOLFIN version: 2017.1.0
*** Git changeset:
*** -------------------------------------------------------------------------
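This appears to be an issue with the order in which coefficient data is encountered, which can differ between processes. It seems to be addressed by the following patch, which replaces the plain dict used for CoeffStore.coeffs with an OrderedDict: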
index 8d3b7d7..4dfc78f 100644
--- a/dolfin_adjoint/coeffstore.py
+++ b/dolfin_adjoint/coeffstore.py
@@ -1,11 +1,13 @@
import libadjoint
+from collections import OrderedDict
+
class CoeffStore(object):
'''This object manages the mapping from Dolfin coefficients to libadjoint Variables.
In the process, it also manages the incrementing of the timestep associated with each
variable, so that the user does not have to manually manage the time information.'''
def __init__(self):
- self.coeffs = {}
+ self.coeffs = OrderedDict()
self.libadjoint_timestep = 0
self.str_to_coeff = {}