% RISC122: PhD thesis (2004) on fault tolerance mechanisms for Distributed Maple.
% The non-standard fields (translation, length) are kept as-is; standard styles
% silently ignore fields they do not know.
@phdthesis{RISC122,
  author      = {Bósa, Károly},
  title       = {Fault Tolerance for {Distributed Maple}},
  language    = {english},
  abstract    = {Distributed Maple is a Java-based system for implementing in distributed environments
parallel programs in the computer algebra system Maple. It has evolved from
Dr. Wolfgang Schreiner's experience in the development of parallel computer algebra
environments and from learning from the work of other researchers. As the problems to
which the system was applied became more and more complex, the meantime
between session failures became a limiting factor of the applicability of
the system. However, the fact that the parallel programming model of the
system is basically functional gave the chance to develop new
fault tolerance mechanisms for Distributed Maple which are more effective than
existing solutions targeted to general parallel applications (like checkpointing).
In this thesis, we present and describe how we have extended Distributed Maple
with fault tolerance such that the time spent in a long running
computation is not any more wasted by the eventual occurrence of a
failure. First we introduced a mechanism for the logging
of task return values and of shared object values such that after a failure
a newly started session can (transparently to the application program)
reuse already computed results. Then we concentrate on node failures and
permanent connection failures. We implemented some new mechanisms by which a
session is able to tolerate connection and node failures (even if the root node fails)
without overall failure and continue normal operation. Furthermore, the system periodically
attempts to restart the failed nodes and to reestablish the broken connections. Together
these fault tolerance mechanisms allow to run computations that take much longer than
the meantime between session failures.
With these developments, Distributed Maple is by far the most advanced system
for computer algebra concerning reliability in distributed environments.},
  year        = {2004},
  month       = sep,
  translation = {0},
  school      = {RISC-Linz, Johannes Kepler University, Linz, Austria},
  keywords    = {distributed systems, fault tolerance, computer algebra},
  length      = {116},
}