Coverage for pySDC / projects / Resilience / fault_stats.py: 56%

451 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-27 07:06 +0000

1import numpy as np 

2import matplotlib.pyplot as plt 

3from mpi4py import MPI 

4import pickle 

5 

6import pySDC.helpers.plot_helper as plot_helper 

7from pySDC.helpers.stats_helper import get_sorted 

8 

9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData 

10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook 

11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity 

13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep 

14from pySDC.implementations.hooks.log_work import LogWork 

15 

16# these problems are available for testing 

17from pySDC.projects.Resilience.advection import run_advection 

18from pySDC.projects.Resilience.vdp import run_vdp 

19from pySDC.projects.Resilience.piline import run_piline 

20from pySDC.projects.Resilience.Lorenz import run_Lorenz 

21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger 

22from pySDC.projects.Resilience.quench import run_quench 

23from pySDC.projects.Resilience.AC import run_AC 

24from pySDC.projects.Resilience.RBC import run_RBC 

25from pySDC.projects.Resilience.GS import run_GS 

26 

27from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy 

28import logging 

29 

# reset matplotlib to the project-wide defaults before any plotting happens
plot_helper.setup_mpl(reset=True)

# logger level used for FaultStats and the controllers (40 == logging.ERROR)
LOGGER_LEVEL = 40

# absolute recovery thresholds per problem; keys are the run functions imported above.
# Currently empty -- the commented entries document previously used values.
RECOVERY_THRESH_ABS = {
    # run_quench: 5e-3,
    # run_Schroedinger: 1.7e-6,
}

38 

39 

40class FaultStats: 

41 ''' 

42 Class to generate and analyse fault statistics 

43 ''' 

44 

45 def __init__( 

46 self, 

47 prob=None, 

48 strategies=None, 

49 faults=None, 

50 reload=True, 

51 recovery_thresh=1 + 1e-3, 

52 recovery_thresh_abs=0.0, 

53 num_procs=1, 

54 mode='default', 

55 stats_path='data/stats', 

56 use_MPI=False, 

57 **kwargs, 

58 ): 

59 ''' 

60 Initialization routine 

61 

62 Args: 

63 prob: A function that runs a pySDC problem, see imports for available problems 

64 strategies (list): List of resilience strategies 

65 faults (list): List of booleans that describe whether to use faults or not 

66 reload (bool): Load previously computed statistics and continue from there or start from scratch 

67 recovery_thresh (float): Relative threshold for recovery 

68 num_procs (int): Number of processes 

69 mode (str): Mode for fault generation: Either 'random' or 'combination' 

70 ''' 

71 self.prob = prob 

72 self.strategies = [None] if strategies is None else strategies 

73 self.faults = [False, True] if faults is None else faults 

74 self.reload = reload 

75 self.recovery_thresh = recovery_thresh 

76 self.recovery_thresh_abs = recovery_thresh_abs 

77 self.num_procs = num_procs 

78 self.use_MPI = use_MPI 

79 self.stats_path = stats_path 

80 self.kwargs = { 

81 'fault_frequency_iter': 500, 

82 **kwargs, 

83 } 

84 

85 # decide mode 

86 if mode == 'default': 

87 if prob.__name__ in ['run_vdp', 'run_Lorenz']: 

88 self.mode = 'combination' 

89 else: 

90 self.mode = 'random' 

91 else: 

92 self.mode = mode 

93 

94 self.logger = logging.getLogger('FaultStats') 

95 self.logger.level = LOGGER_LEVEL 

96 

97 msg = 'Starting FaultStats with attributes' 

98 for key, val in self.__dict__.items(): 

99 if key in ['logger']: 

100 continue 

101 item = [str(me) for me in val] if type(val) == list else str(val) 

102 msg += '\n\t' f'{key}: {item}' 

103 self.logger.log(25, msg) 

104 

105 def get_Tend(self): 

106 ''' 

107 Get the final time of runs for fault stats based on the problem 

108 

109 Returns: 

110 float: Tend to put into the run 

111 ''' 

112 return self.strategies[0].get_Tend(self.prob, self.num_procs, resilience_experiment=True) 

113 

    def run_stats_generation(
        self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
    ):
        '''
        Run the generation of stats for all strategies in the `self.strategies` variable

        Args:
            runs (int): Number of runs you want to do
            step (int): Number of runs you want to do between saving
            comm (MPI.Communicator): Communicator for distributing runs
            kwargs_range (dict): Range for the parameters
            faults (bool): Whether to do stats with faults or without
            _reload, _runs_partial: Variables only used for recursion. Do not change!

        Returns:
            None
        '''
        # no faults flag given: recurse once per entry of self.faults (typically [False, True])
        if faults is None:
            for f in self.faults:
                self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
            return None

        # sweep over parameter ranges: fix one value per key and recurse for the rest
        # (iterating `{}` in the else case simply yields nothing)
        for key, val in kwargs_range.items() if kwargs_range is not None else {}:
            if type(val) == int:
                # a single int is a fixed value, not a range
                self.kwargs[key] = val
            else:
                for me in val:
                    kwargs_range_me = {**kwargs_range, key: me}
                    self.run_stats_generation(
                        runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
                    )
                return None

        comm = MPI.COMM_WORLD if comm is None else comm
        # by default, save after every rank has done one run each
        step = (runs if comm.size == 1 else comm.size) if step is None else step
        _runs_partial = step if _runs_partial == 0 else _runs_partial
        reload = self.reload or _reload

        # see if we limit the number of runs we want to do
        # fault-free runs are deterministic, so at most 5 of them are needed
        max_runs = (
            (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
        )

        if reload:
            # sort the strategies to do some load balancing
            sorting_index = None
            if comm.rank == 0:
                already_completed = np.array(
                    [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
                )
                # least-completed strategies first; drop strategies that are already finished
                sorting_index_ = np.argsort(already_completed)
                sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
                _runs_partial = max(min(already_completed), _runs_partial)

            # tell all ranks what strategies to use
            sorting_index = comm.bcast(sorting_index, root=0)
            _runs_partial = comm.bcast(_runs_partial, root=0)
            strategies = [self.strategies[i] for i in sorting_index]
            if len(strategies) == 0:  # check if we are already done
                return None
        else:
            strategies = self.strategies

        # one sub-communicator per strategy so strategies can be processed in parallel
        strategy_comm = comm.Split(comm.rank % len(strategies))

        for j in range(0, len(strategies), comm.size):
            self.generate_stats(
                # round-robin assignment of strategies to ranks, clamped to the strategies that remain
                strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
                runs=min(_runs_partial, max_runs),
                faults=faults,
                reload=reload,
                comm=strategy_comm,
            )
        strategy_comm.Free()
        # recurse to do the next chunk of `step` runs until max_runs is reached
        self.run_stats_generation(
            runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
        )

        return None

193 

    def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
        '''
        Generate statistics for recovery from bit flips
        -----------------------------------------------

        Every run is given a different random seed such that we have different faults and the results are then stored

        Args:
            strategy (Strategy): Resilience strategy
            runs (int): Number of runs you want to do
            reload (bool): Load previously computed statistics and continue from there or start from scratch
            faults (bool): Whether to do stats with faults or without
            comm (MPI.Communicator): Communicator for distributing runs

        Returns:
            None
        '''
        comm = MPI.COMM_WORLD if comm is None else comm

        # initialize dictionary to store the stats in; one slot per run for each recorded quantity
        dat = {
            'level': np.zeros(runs),
            'iteration': np.zeros(runs),
            'node': np.zeros(runs),
            'problem_pos': [],
            'bit': np.zeros(runs),
            'error': np.zeros(runs),
            'total_iteration': np.zeros(runs),
            'total_newton_iteration': np.zeros(runs),
            'restarts': np.zeros(runs),
            'target': np.zeros(runs),
            'rank': np.zeros(runs),
        }

        # store arguments for storing and loading
        identifier_args = {
            'faults': faults,
            'strategy': strategy,
        }

        # reload previously recorded stats and write them to dat
        if reload:
            already_completed_ = None
            if comm.rank == 0:
                already_completed_ = self.load(**identifier_args)
            already_completed = comm.bcast(already_completed_, root=0)
            if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
                avail_len = min([already_completed['runs'], runs])
                for k in dat.keys():
                    dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
        else:
            already_completed = {'runs': 0}

        # prepare a message
        involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
        msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
        if comm.rank == 0 and already_completed['runs'] < runs:
            print(msg, flush=True)

        # single-rank communicator for the space dimension of each run
        space_comm = MPI.COMM_SELF.Split(True)

        # perform the remaining experiments
        for i in range(already_completed['runs'], runs):
            # round-robin distribution of the runs over the ranks
            if i % comm.size != comm.rank:
                continue

            # perform a single experiment with the correct random seed
            stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)

            # get the data from the stats
            faults_run = get_sorted(stats, type='bitflip')

            if faults:
                assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
                # unpack the recorded fault metadata: (level, iteration, node, position, bit, target, rank)
                dat['level'][i] = faults_run[0][1][0]
                dat['iteration'][i] = faults_run[0][1][1]
                dat['node'][i] = faults_run[0][1][2]
                dat['problem_pos'] += [faults_run[0][1][3]]
                dat['bit'][i] = faults_run[0][1][4]
                dat['target'][i] = faults_run[0][1][5]
                dat['rank'][i] = faults_run[0][1][6]

            if crash:
                print('Code crashed!')
                # infinite error marks a crashed run
                dat['error'][i] = np.inf
                continue

            # record the rest of the data
            t, u = get_sorted(stats, type='u', recomputed=False)[-1]
            error = self.get_error(u, t, controller, strategy, self.get_Tend())
            total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
            total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])

            dat['error'][i] = error
            dat['total_iteration'][i] = total_iteration
            dat['total_newton_iteration'][i] = total_newton_iteration
            dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])

        # free the space communicator
        space_comm.Free()

        # combine the results of all ranks; each rank only filled its own slots, the others are zero
        dat_full = {}
        for k in dat.keys():
            dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)

        # store the completed stats
        dat_full['runs'] = runs

        if already_completed['runs'] < runs:
            if comm.rank == 0:
                self.store(dat_full, **identifier_args)
                # NOTE(review): `self.faults` is a list (e.g. [False, True]) and is truthy whenever
                # non-empty; possibly the local boolean `faults` was intended here -- confirm
                if self.faults:
                    self.get_recovered(strategy=strategy)

        return None

309 

310 def get_error(self, u, t, controller, strategy, Tend): 

311 """ 

312 Compute the error. 

313 

314 Args: 

315 u (dtype_u): The solution at the end of the run 

316 t (float): Time at which `u` was recorded 

317 controller (pySDC.controller.controller): The controller 

318 strategy (Strategy): The resilience strategy 

319 Tend (float): Time you want to run to 

320 

321 Returns: 

322 float: Error 

323 """ 

324 lvl = controller.MS[0].levels[0] 

325 

326 # decide if we reached Tend 

327 Tend_reached = t + lvl.params.dt_initial >= Tend 

328 

329 if Tend_reached: 

330 u_star = lvl.prob.u_exact(t=t) 

331 return abs(u - u_star) / abs(u_star) 

332 else: 

333 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}') 

334 return np.inf 

335 

336 def single_run( 

337 self, 

338 strategy, 

339 run=0, 

340 faults=False, 

341 force_params=None, 

342 hook_class=None, 

343 space_comm=None, 

344 Tend=None, 

345 time_comm=None, 

346 ): 

347 ''' 

348 Run the problem once with the specified parameters 

349 

350 Args: 

351 strategy (Strategy): The resilience strategy you plan on using 

352 run (int): Index for fault generation 

353 faults (bool): Whether or not to put faults in 

354 force_params (dict): Change parameters in the description of the problem 

355 space_comm (MPI.Communicator): A communicator for space parallelisation 

356 Tend (float): Time you want to run to 

357 time_comm (MPI.Intracomm.Communicator): Communicator in time 

358 

359 Returns: 

360 dict: Stats object containing statistics for each step, each level and each iteration 

361 pySDC.Controller: The controller of the run 

362 float: The time the problem should have run to 

363 ''' 

364 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class) 

365 force_params = {} if force_params is None else force_params 

366 

367 # build the custom description 

368 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

369 for k in force_params.keys(): 

370 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]} 

371 

372 custom_controller_params = { 

373 'logger_level': LOGGER_LEVEL, 

374 **force_params.get('controller_params', {}), 

375 } 

376 

377 if faults: 

378 fault_stuff = { 

379 'rng': None, 

380 'args': strategy.get_fault_args(self.prob, self.num_procs), 

381 'rnd_params': strategy.get_random_params(self.prob, self.num_procs), 

382 } 

383 

384 # make parameters for faults: 

385 if self.mode == 'random': 

386 fault_stuff['rng'] = np.random.RandomState(run) 

387 elif self.mode == 'combination': 

388 fault_stuff['rng'] = run 

389 elif self.mode == 'regular': 

390 fault_stuff['rng'] = np.random.RandomState(run) 

391 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter'] 

392 fault_stuff['rnd_params'] = { 

393 'bit': 12, 

394 'min_node': 1, 

395 } 

396 else: 

397 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}') 

398 

399 else: 

400 fault_stuff = None 

401 

402 return self.prob( 

403 custom_description=custom_description, 

404 num_procs=self.num_procs, 

405 hook_class=hook_class, 

406 fault_stuff=fault_stuff, 

407 Tend=self.get_Tend() if Tend is None else Tend, 

408 custom_controller_params=custom_controller_params, 

409 space_comm=space_comm, 

410 comm=time_comm, 

411 use_MPI=time_comm is not None, 

412 ) 

413 

414 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover 

415 ''' 

416 Take a closer look at how the strategies compare for a specific run 

417 

418 Args: 

419 run (int): The number of the run to get the appropriate random generator 

420 faults (bool): Whether or not to include faults 

421 ax (Matplotlib.axes): Somewhere to plot 

422 

423 Returns: 

424 None 

425 ''' 

426 if ax is None: 

427 fig, ax = plt.subplots(1, 1) 

428 store = True 

429 else: 

430 store = False 

431 

432 k_ax = ax.twinx() 

433 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies] 

434 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))] 

435 

436 # make a legend 

437 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies] 

438 k_ax.legend(frameon=True) 

439 

440 if store: 

441 fig.tight_layout() 

442 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True) 

443 

    def scrutinize_visual(
        self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
    ):  # pragma: no cover
        '''
        Take a closer look at a specific run with a plot

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            run (int): The number of the run to get the appropriate random generator
            faults (bool): Whether or not to include faults
            ax (Matplotlib.axes): Somewhere to plot the error
            k_ax (Matplotlib.axes): Somewhere to plot the iterations
            ls (str): Line style for the error plot
            plot_restarts (bool): Make vertical lines wherever restarts happened

        Returns:
            None
        '''
        # only store the figure to disk if we created it here
        if ax is None:
            fig, ax = plt.subplots(1, 1)
            store = True
        else:
            store = False

        force_params = {}

        stats, controller, Tend = self.single_run(
            strategy=strategy,
            run=run,
            faults=faults,
            force_params=force_params,
            hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
        )

        # plot the local error
        e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
        ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)

        # plot the iterations (cumulative, on a secondary y-axis)
        k_ax = ax.twinx() if k_ax is None else k_ax
        k = get_sorted(stats, type='k')
        k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')

        # plot the faults
        # NOTE(review): this rebinds the boolean `faults` parameter to the list of recorded faults
        faults = get_sorted(stats, type='bitflip')
        for fault_time in [me[0] for me in faults]:
            ax.axvline(fault_time, color='grey', ls=':')

        # plot restarts
        if plot_restarts:
            restarts = get_sorted(stats, type='restart')
            [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]

        # decorate
        ax.set_yscale('log')
        ax.set_ylabel(r'$\epsilon$')
        k_ax.set_ylabel('cumulative iterations (dashed)')
        ax.set_xlabel(r'$t$')

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)

505 

    def scrutinize(self, strategy, logger_level=15, **kwargs):
        '''
        Take a closer look at a specific run: print faults, errors, iteration counts, step sizes and restarts.

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            logger_level (int): Logger level to force on the controller

        Returns:
            dict: The stats of the run
        '''
        from pySDC.projects.Resilience.fault_injection import FaultInjector

        force_params = {}
        force_params['controller_params'] = {'logger_level': logger_level}
        # always compute the residual so the printed diagnostics are meaningful
        force_params['sweeper_params'] = {'skip_residual_computation': ()}

        stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)

        # pick the solution at the latest recorded time
        u_all = get_sorted(stats, type='u', recomputed=False)
        t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
        k = [me[1] for me in get_sorted(stats, type='k')]

        fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]

        # faults that were scheduled but never actually inserted (e.g. because the run ended early)
        unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []

        print(f'\nOverview for {strategy.name} strategy')
        # print faults
        faults = get_sorted(stats, type='bitflip')
        print('\nfaults:')
        print('    t   | level | iter | node | bit | trgt | rank | pos')
        print('--------+-------+------+------+-----+------+---------------------')
        for f in faults:
            print(
                f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
                f[1][3],
            )
        print('--------+-------+------+------+-----+------+---------------------')
        # un-inserted faults are marked with a leading '!'
        for f in unhappened_faults:
            print(
                f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
                f.problem_pos,
            )

        # see if we can determine if the faults where recovered
        no_faults = self.load(strategy=strategy, faults=False)
        e_star = np.mean(no_faults.get('error', [0]))
        error = self.get_error(u, t, controller, strategy, Tend)
        recovery_thresh = self.get_thresh(strategy)

        print(
            f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
        )
        print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')

        _newton_iter = get_sorted(stats, type='work_newton')
        if len(_newton_iter) > 0:
            newton_iter = [me[1] for me in _newton_iter]
            print(
                f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
            )

        # checkout the step size
        dt = [me[1] for me in get_sorted(stats, type='dt')]
        print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')

        # restarts
        restarts = [me[1] for me in get_sorted(stats, type='restart')]
        print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')

        return stats

577 

578 def convert_faults(self, faults): 

579 ''' 

580 Make arrays of useable data from an entry in the stats object returned by pySDC 

581 

582 Args: 

583 faults (list): The entry for faults returned by the pySDC run 

584 

585 Returns: 

586 list: The times when the faults happened 

587 list: The levels in which the faults happened 

588 list: The iterations in which the faults happened 

589 list: The nodes in which the faults happened 

590 list: The problem positions in which the faults happened 

591 list: The bits in which the faults happened 

592 ''' 

593 time = [faults[i][0] for i in range(len(faults))] 

594 level = [faults[i][1][0] for i in range(len(faults))] 

595 iteration = [faults[i][1][1] for i in range(len(faults))] 

596 node = [faults[i][1][2] for i in range(len(faults))] 

597 problem_pos = [faults[i][1][3] for i in range(len(faults))] 

598 bit = [faults[i][1][4] for i in range(len(faults))] 

599 return time, level, iteration, node, problem_pos, bit 

600 

601 def get_path(self, **kwargs): 

602 ''' 

603 Get the path to where the stats are stored 

604 

605 Args: 

606 strategy (Strategy): The resilience strategy 

607 faults (bool): Whether or not faults have been activated 

608 

609 Returns: 

610 str: The path to what you are looking for 

611 ''' 

612 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle' 

613 

614 def get_name(self, strategy=None, faults=True, mode=None): 

615 ''' 

616 Function to get a unique name for a kind of statistics based on the problem and strategy that was used 

617 

618 Args: 

619 strategy (Strategy): Resilience strategy 

620 faults (bool): Whether or not faults where inserted 

621 

622 Returns: 

623 str: The unique identifier 

624 ''' 

625 if self.prob == run_advection: 

626 prob_name = 'advection' 

627 elif self.prob == run_vdp: 

628 prob_name = 'vdp' 

629 elif self.prob == run_piline: 

630 prob_name = 'piline' 

631 elif self.prob == run_Lorenz: 

632 prob_name = 'Lorenz' 

633 elif self.prob == run_Schroedinger: 

634 prob_name = 'Schroedinger' 

635 elif self.prob == run_quench: 

636 prob_name = 'Quench' 

637 elif self.prob.__name__ == 'run_AC': 

638 prob_name = 'Allen-Cahn' 

639 elif self.prob.__name__ == 'run_RBC': 

640 prob_name = 'Rayleigh-Benard' 

641 elif self.prob.__name__ == 'run_GS': 

642 prob_name = 'Gray-Scott' 

643 else: 

644 raise NotImplementedError(f'Name not implemented for problem {self.prob}') 

645 

646 if faults: 

647 fault_name = '-faults' 

648 else: 

649 fault_name = '' 

650 

651 if strategy is not None: 

652 strategy_name = f'-{strategy.name}' 

653 else: 

654 strategy_name = '' 

655 

656 mode = self.mode if mode is None else mode 

657 if mode == 'regular': 

658 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}' 

659 else: 

660 mode_thing = '' 

661 

662 mpi_thing = '-MPI' if self.use_MPI else '' 

663 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}' 

664 

665 def store(self, dat, **kwargs): 

666 ''' 

667 Stores the data for a run at a predefined path 

668 

669 Args: 

670 dat (dict): The data of the recorded statistics 

671 

672 Returns: 

673 None 

674 ''' 

675 with open(self.get_path(**kwargs), 'wb') as f: 

676 pickle.dump(dat, f) 

677 return None 

678 

679 def load(self, **kwargs): 

680 ''' 

681 Loads the stats belonging to a specific strategy and whether or not faults where inserted. 

682 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs, 

683 which is 0 of course. 

684 

685 Returns: 

686 dict: Data from previous run or if it is not available a placeholder dictionary 

687 ''' 

688 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)]) 

689 

690 try: 

691 with open(self.get_path(**kwargs), 'rb') as f: 

692 dat = pickle.load(f) 

693 except FileNotFoundError: 

694 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!') 

695 return {'runs': 0} 

696 return dat 

697 

698 def get_thresh(self, strategy=None): 

699 """ 

700 Get recovery threshold based on relative and absolute tolerances 

701 

702 Args: 

703 strategy (Strategy): The resilience strategy 

704 """ 

705 fault_free = self.load(strategy=strategy, faults=False) 

706 assert ( 

707 fault_free['error'].std() / fault_free['error'].mean() < 1e-5 

708 ), f'Got too large variation of errors in {strategy} strategy!' 

709 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()]) 

710 

711 def get_recovered(self, **kwargs): 

712 ''' 

713 Determine the recovery rate for a specific strategy and store it to disk. 

714 

715 Returns: 

716 None 

717 ''' 

718 if 'strategy' not in kwargs.keys(): 

719 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies] 

720 else: 

721 try: 

722 with_faults = self.load(faults=True, **kwargs) 

723 with_faults['recovered'] = with_faults['error'] <= self.get_thresh(kwargs['strategy']) 

724 self.store(faults=True, dat=with_faults, **kwargs) 

725 except KeyError as error: 

726 print( 

727 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}' 

728 ) 

729 

730 return None 

731 

732 def crash_rate(self, dat, no_faults, thingA, mask): 

733 ''' 

734 Determine the rate of runs that crashed 

735 

736 Args: 

737 dat (dict): The data of the recorded statistics with faults 

738 no_faults (dict): The data of the corresponding fault-free stats 

739 thingA (str): Some key stored in the stats that will go on the y-axis 

740 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate 

741 

742 Returns: 

743 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask 

744 ''' 

745 if len(dat[thingA][mask]) > 0: 

746 crash = dat['error'] == np.inf 

747 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask]) 

748 else: 

749 return None 

750 

751 def rec_rate(self, dat, no_faults, thingA, mask): 

752 ''' 

753 Operation for plotting which returns the recovery rate for a given mask. 

754 Which thingA you apply this to actually does not matter here since we compute a rate. 

755 

756 Args: 

757 dat (dict): The recorded statistics 

758 no_faults (dict): The corresponding fault-free stats 

759 thingA (str): Some key stored in the stats 

760 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

761 

762 Returns: 

763 float: Recovery rate 

764 ''' 

765 if len(dat[thingA][mask]) > 0: 

766 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask]) 

767 else: 

768 return None 

769 

770 def mean(self, dat, no_faults, thingA, mask): 

771 ''' 

772 Operation for plotting which returns the mean of thingA after applying the mask 

773 

774 Args: 

775 dat (dict): The recorded statistics 

776 no_faults (dict): The corresponding fault-free stats 

777 thingA (str): Some key stored in the stats 

778 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

779 

780 Returns: 

781 float: Mean of thingA after applying mask 

782 ''' 

783 return np.mean(dat[thingA][mask]) 

784 

785 def extra_mean(self, dat, no_faults, thingA, mask): 

786 ''' 

787 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after 

788 applying the mask 

789 

790 Args: 

791 dat (dict): The recorded statistics 

792 no_faults (dict): The corresponding fault-free stats 

793 thingA (str): Some key stored in the stats 

794 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

795 

796 Returns: 

797 float: Difference in mean of thingA between runs with and without faults after applying mask 

798 ''' 

799 if True in mask or int in [type(me) for me in mask]: 

800 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA]) 

801 else: 

802 return None 

803 

    def plot_thingA_per_thingB(
        self,
        strategy,
        thingA,
        thingB,
        ax=None,
        mask=None,
        recovered=False,
        op=None,
        **kwargs,
    ):  # pragma: no cover
        '''
        Plot thingA vs. thingB for a single strategy

        Args:
            strategy (Strategy): The resilience strategy you want to plot
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            ax (Matplotlib.axes): Somewhere to plot
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            recovered (bool): Show the plot for both all runs and only the recovered ones
            op (function): Operation that is applied to thingA before plotting default is recovery rate

        Returns:
            None
        '''
        op = self.rec_rate if op is None else op
        dat = self.load(strategy=strategy, faults=True)
        no_faults = self.load(strategy=strategy, faults=False)

        if mask is None:
            mask = np.ones_like(dat[thingB], dtype=bool)

        # one data point per distinct (masked) value of thingB
        admissable_thingB = np.unique(dat[thingB][mask])
        me = np.zeros(len(admissable_thingB))
        me_recovered = np.zeros_like(me)

        for i in range(len(me)):
            # restrict the mask to runs with this particular value of thingB
            _mask = np.logical_and((dat[thingB] == admissable_thingB[i]), mask)
            if _mask.any():
                me[i] = op(dat, no_faults, thingA, _mask)
                me_recovered[i] = op(dat, no_faults, thingA, np.logical_and(_mask, dat['recovered']))

        if recovered:
            # dashed line for the subset of recovered runs
            ax.plot(
                admissable_thingB,
                me_recovered,
                label=f'{strategy.label} (only recovered)',
                color=strategy.color,
                marker=strategy.marker,
                ls='--',
                linewidth=3,
                **kwargs,
            )

        ax.plot(
            admissable_thingB,
            me,
            label=f'{strategy.label}',
            color=strategy.color,
            marker=strategy.marker,
            linewidth=2,
            **kwargs,
        )

        ax.legend(frameon=False)
        ax.set_xlabel(thingB)
        ax.set_ylabel(thingA)
        return None

873 

    def plot_things_per_things(
        self,
        thingA='bit',
        thingB='bit',
        recovered=False,
        mask=None,
        op=None,
        args=None,
        strategies=None,
        name=None,
        store=False,
        ax=None,
        fig=None,
        plotting_args=None,
    ):  # pragma: no cover
        '''
        Plot thingA vs thingB for multiple strategies

        Args:
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            recovered (bool): Show the plot for both all runs and only the recovered ones
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            op (function): Operation that is applied to thingA before plotting default is recovery rate
            args (dict): Parameters for how the plot should look
            strategies (list): List of the strategies you want to plot, if None, all will be plotted
            name (str): Optional name for the plot
            store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
            ax (Matplotlib.axes): Somewhere to plot
            fig (Matplotlib.figure): Figure of the ax
            plotting_args (dict): Keyword arguments forwarded to the plotting calls

        Returns
            None
        '''
        strategies = self.strategies if strategies is None else strategies
        args = {} if args is None else args

        # make sure we have something to plot in
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        elif fig is None:
            # without a figure handle the plot cannot be stored
            store = False

        # execute the plots for all strategies
        for s in strategies:
            self.plot_thingA_per_thingB(
                s,
                thingA=thingA,
                thingB=thingB,
                recovered=recovered,
                ax=ax,
                mask=mask,
                op=op,
                **(plotting_args if plotting_args else {}),
            )

        # set the parameters
        [plt.setp(ax, k, v) for k, v in args.items()]

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
            plt.close(fig)

        return None

939 

940 def plot_recovery_thresholds( 

941 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs 

942 ): # pragma: no cover 

943 ''' 

944 Plot the recovery rate for a range of thresholds 

945 

946 Args: 

947 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

948 thresh_range (list): thresholds for deciding whether to accept as recovered 

949 ax (Matplotlib.axes): Somewhere to plot 

950 

951 Returns: 

952 None 

953 ''' 

954 # fill default values if nothing is specified 

955 strategies = self.strategies if strategies is None else strategies 

956 

957 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range 

958 if ax is None: 

959 fig, ax = plt.subplots(1, 1) 

960 

961 rec_rates = [[None] * len(thresh_range)] * len(strategies) 

962 for strategy_idx in range(len(strategies)): 

963 strategy = strategies[strategy_idx] 

964 # load the stats 

965 fault_free = self.load(strategy=strategy, faults=False) 

966 with_faults = self.load(strategy=strategy, faults=True) 

967 

968 if recoverable_only: 

969 recoverable_mask = self.get_fixable_faults_only(strategy) 

970 else: 

971 recoverable_mask = self.get_mask(strategy=strategy) 

972 

973 for thresh_idx in range(len(thresh_range)): 

974 rec_mask = self.get_mask( 

975 strategy=strategy, 

976 key='error', 

977 val=(thresh_range[thresh_idx] * fault_free['error'].mean()), 

978 op='gt', 

979 old_mask=recoverable_mask, 

980 ) 

981 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len( 

982 with_faults['error'][recoverable_mask] 

983 ) 

984 

985 ax.plot( 

986 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs} 

987 ) 

988 ax.legend(frameon=False) 

989 ax.set_ylabel('recovery rate') 

990 ax.set_xlabel('threshold as ratio to fault-free error') 

991 

992 return None 

993 

994 def analyse_adaptivity(self, mask): # pragma: no cover 

995 ''' 

996 Analyse a set of runs with adaptivity 

997 

998 Args: 

999 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1000 

1001 Returns: 

1002 None 

1003 ''' 

1004 index = self.get_index(mask) 

1005 dat = self.load() 

1006 

1007 # make a header 

1008 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ') 

1009 print('-------+-----+------+------+----------+----------+----------+----------') 

1010 for i in index: 

1011 e_em, e_glob = self.analyse_adaptivity_single(int(i)) 

1012 print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\ 

1013 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}') 

1014 

1015 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][ 

1016 Adaptivity 

1017 ]['e_tol'] 

1018 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!') 

1019 return None 

1020 

1021 def analyse_adaptivity_single(self, run): # pragma: no cover 

1022 ''' 

1023 Compute what the difference in embedded and global error are for a specific run with adaptivity 

1024 

1025 Args: 

1026 run (int): The run you want to know about 

1027 

1028 Returns: 

1029 list: Embedded error with fault and without for the last iteration in the step with a fault 

1030 list: Global error with and without fault at the end of the run 

1031 ''' 

1032 # perform one run with and one without faults 

1033 stats = [] 

1034 controllers = [] 

1035 for faults in [True, False]: 

1036 s, c, _ = self.single_run( 

1037 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1038 ) 

1039 stats += [s] 

1040 controllers += [c] 

1041 

1042 # figure out when the fault happened 

1043 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1044 

1045 # get embedded error 

1046 e_em = [ 

1047 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1048 for stat in stats 

1049 ] 

1050 

1051 # compute the global error 

1052 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1053 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1054 

1055 return [e_em[i][-1] for i in [0, 1]], e_glob 

1056 

1057 def analyse_HotRod(self, mask): # pragma: no cover 

1058 ''' 

1059 Analyse a set of runs with Hot Rod 

1060 

1061 Args: 

1062 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1063 

1064 Returns: 

1065 None 

1066 ''' 

1067 index = self.get_index(mask) 

1068 dat = self.load() 

1069 

1070 # make a header 

1071 print(' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \ 

1072| e_glob ') 

1073 print('-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\ 

1074+----------') 

1075 for i in index: 

1076 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i)) 

1077 print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\ 

1078 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \ 

1079{e_glob[1]:.2e} | {e_glob[0]:.2e}') 

1080 

1081 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][ 

1082 'HotRod_tol' 

1083 ] 

1084 print(f'We only restart when diff > tol = {tol:.2e}!') 

1085 return None 

1086 

1087 def analyse_HotRod_single(self, run): # pragma: no cover 

1088 ''' 

1089 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod 

1090 

1091 Args: 

1092 run (int): The run you want to know about 

1093 

1094 Returns: 

1095 list: Embedded error with fault and without for the last iteration in the step with a fault 

1096 list: Extrapolation error with fault and without for the last iteration in the step with a fault 

1097 list: Global error with and without fault at the end of the run 

1098 ''' 

1099 # perform one run with and one without faults 

1100 stats = [] 

1101 controllers = [] 

1102 for faults in [True, False]: 

1103 s, c, _ = self.single_run( 

1104 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1105 ) 

1106 stats += [s] 

1107 controllers += [c] 

1108 

1109 # figure out when the fault happened 

1110 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1111 

1112 # get embedded error 

1113 e_em = [ 

1114 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1115 for stat in stats 

1116 ] 

1117 # get extrapolated error 

1118 e_ex = [ 

1119 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')] 

1120 for stat in stats 

1121 ] 

1122 

1123 # compute the global error 

1124 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1125 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1126 

1127 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob 

1128 

1129 def print_faults(self, mask=None, strategy=None): # pragma: no cover 

1130 ''' 

1131 Print all faults that happened within a certain mask 

1132 

1133 Args: 

1134 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1135 strategy (Strategy): The resilience strategy you want to see the error for 

1136 

1137 Returns: 

1138 None 

1139 ''' 

1140 strategy = BaseStrategy() if strategy is None else strategy 

1141 

1142 index = self.get_index(mask) 

1143 dat = self.load(strategy=strategy) 

1144 

1145 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0])) 

1146 

1147 # make a header 

1148 print(' run | bit | node | iter | rank | rel err | space pos') 

1149 print('-------+-----+------+------+------+---------+-----------') 

1150 for i in index: 

1151 print( 

1152 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}' 

1153 ) 

1154 

1155 return None 

1156 

1157 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False): 

1158 ''' 

1159 Make a mask to apply to stored data to filter out certain things 

1160 

1161 Args: 

1162 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all 

1163 strategies so None is fine 

1164 key (str): The key in the stored statistics that you want to filter for some value 

1165 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key 

1166 op (str): Operation that is applied for filtering 

1167 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter 

1168 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs 

1169 

1170 Returns: 

1171 Numpy.ndarray with boolean entries that can be used as a mask 

1172 ''' 

1173 strategy = self.strategies[0] if strategy is None else strategy 

1174 dat = self.load(strategy=strategy, faults=True) 

1175 

1176 if compare_faults: 

1177 if val is not None: 

1178 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!') 

1179 else: 

1180 vals = self.load(strategy=strategy, faults=False)[key] 

1181 val = sum(vals) / len(vals) 

1182 

1183 if None in [key, val] and op not in ['isfinite']: 

1184 mask = dat['bit'] == dat['bit'] 

1185 else: 

1186 if op == 'uneq': 

1187 mask = dat[key] != val 

1188 elif op == 'eq': 

1189 mask = dat[key] == val 

1190 elif op == 'leq': 

1191 mask = dat[key] <= val 

1192 elif op == 'geq': 

1193 mask = dat[key] >= val 

1194 elif op == 'lt': 

1195 mask = dat[key] < val 

1196 elif op == 'gt': 

1197 mask = dat[key] > val 

1198 elif op == 'isfinite': 

1199 mask = np.isfinite(dat[key]) 

1200 else: 

1201 raise NotImplementedError(f'Please implement op={op}!') 

1202 

1203 if old_mask is not None: 

1204 return mask & old_mask 

1205 else: 

1206 return mask 

1207 

1208 def get_fixable_faults_only(self, strategy): 

1209 """ 

1210 Return a mask of only faults that can be fixed with a given strategy. 

1211 

1212 Args: 

1213 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1214 

1215 Returns: 

1216 Numpy.ndarray with boolean entries that can be used as a mask 

1217 """ 

1218 fixable = strategy.get_fixable_params( 

1219 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter'] 

1220 ) 

1221 mask = self.get_mask(strategy=strategy) 

1222 

1223 for kwargs in fixable: 

1224 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask) 

1225 

1226 return mask 

1227 

1228 def get_index(self, mask=None): 

1229 ''' 

1230 Get the indeces of all runs in mask 

1231 

1232 Args: 

1233 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1234 

1235 Returns: 

1236 Numpy.ndarray: Array of indeces 

1237 ''' 

1238 if mask is None: 

1239 dat = self.load() 

1240 return np.arange(len(dat['iteration'])) 

1241 else: 

1242 return np.arange(len(mask))[mask] 

1243 

1244 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover 

1245 ''' 

1246 Get information about how many data points for faults we have given a particular mask 

1247 

1248 Args: 

1249 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1250 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1251 strategies, so you don't need to supply this argument 

1252 print_all (bool): Whether to add information that is normally not useful to the table 

1253 ax (Matplotlib.axes): Somewhere to plot the combinations histogram 

1254 

1255 Returns: 

1256 None 

1257 ''' 

1258 

1259 # load some data from which to infer the number occurrences of some event 

1260 strategy = self.strategies[0] if strategy is None else strategy 

1261 dat = self.load(stratagy=strategy, faults=True) 

1262 

1263 # make a dummy mask in case none is supplied 

1264 if mask is None: 

1265 mask = np.ones_like(dat['error'], dtype=bool) 

1266 

1267 # print a header 

1268 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries') 

1269 print('-------------------------------------------------------------') 

1270 

1271 # make a list of all keys that you want to look at 

1272 keys = ['iteration', 'bit', 'node'] 

1273 if print_all: 

1274 keys += ['problem_pos', 'level', 'target'] 

1275 

1276 # print the table 

1277 for key in keys: 

1278 counts, dev, unique = self.count_occurrences(dat[key][mask]) 

1279 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}') 

1280 

1281 return None 

1282 

1283 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover 

1284 ''' 

1285 Make a histogram ouf of the occurrences of combinations 

1286 

1287 Args: 

1288 dat (dict): The data of the recorded statistics 

1289 keys (list): The keys in dat that you want to know the combinations of 

1290 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1291 

1292 Returns: 

1293 Matplotlib.axes: The plot 

1294 ''' 

1295 if ax is None: 

1296 fig, ax = plt.subplots(1, 1) 

1297 

1298 occurrences, bins = self.get_combination_histogram(dat, keys, mask) 

1299 

1300 ax.bar(bins[:-1], occurrences) 

1301 

1302 ax.set_xlabel('Occurrence of combinations') 

1303 

1304 return ax 

1305 

1306 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover 

1307 ''' 

1308 Check how many combinations of values we expect and how many we find to see if we need to do more experiments. 

1309 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied 

1310 

1311 Args: 

1312 dat (dict): The data of the recorded statistics 

1313 keys (list): The keys in dat that you want to know the combinations of 

1314 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1315 

1316 Returns: 

1317 Numpy.ndarray: Number of occurrences of combinations 

1318 Numpy.ndarray: Bins 

1319 ''' 

1320 

1321 # load default values 

1322 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat 

1323 keys = ['iteration', 'bit', 'node'] if keys is None else keys 

1324 if mask is None: 

1325 mask = np.ones_like(dat['error'], dtype=bool) 

1326 

1327 # get unique values and compute how many combinations you expect 

1328 num_unique = [len(np.unique(dat[key][mask])) for key in keys] 

1329 expected_number_of_combinations = np.prod(num_unique) 

1330 

1331 # test what you actually get 

1332 combination_counts = self.get_combination_counts(dat, keys, mask) 

1333 

1334 # make a histogram with the result 

1335 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1)) 

1336 occurrences[0] = expected_number_of_combinations - len(combination_counts) 

1337 

1338 return occurrences, bins 

1339 

1340 def get_max_combinations(self, dat=None, strategy=None): 

1341 ''' 

1342 Count how many combinations of parameters for faults are possible 

1343 

1344 Args: 

1345 dat (dict): The recorded statistics 

1346 keys (list): The keys in dat that you want to know the combinations of 

1347 strategy (Strategy): The resilience strategy 

1348 

1349 Returns: 

1350 int: Number of possible combinations 

1351 ''' 

1352 strategy = self.strategies[0] if strategy is None else strategy 

1353 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True) 

1354 faultHook = get_fault_injector_hook(controller) 

1355 ranges = [ 

1356 (0, faultHook.rnd_params['level_number']), 

1357 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1), 

1358 (1, faultHook.rnd_params['iteration'] + 1), 

1359 (0, faultHook.rnd_params['bit']), 

1360 (0, faultHook.rnd_params['rank']), 

1361 ] 

1362 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']] 

1363 return np.prod([me[1] - me[0] for me in ranges], dtype=int) 

1364 

1365 def get_combination_counts(self, dat, keys, mask): 

1366 ''' 

1367 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary 

1368 numbers of keys 

1369 

1370 Args: 

1371 dat (dict): The data of the recorded statistics 

1372 keys (list): The keys in dat that you want to know the combinations of 

1373 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1374 

1375 Returns: 

1376 list: Occurrences of all combinations 

1377 ''' 

1378 key = keys[0] 

1379 unique_vals = np.unique(dat[key][mask]) 

1380 res = [] 

1381 

1382 for i in range(len(unique_vals)): 

1383 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask) 

1384 if len(keys) > 1: 

1385 res += self.get_combination_counts(dat, keys[1:], inner_mask) 

1386 else: 

1387 res += [self.count_occurrences(dat[key][inner_mask])[0]] 

1388 return res 

1389 

1390 def count_occurrences(self, vals): 

1391 ''' 

1392 Count the occurrences of unique values in vals and compute average deviation from mean 

1393 

1394 Args: 

1395 vals (list): Values you want to check 

1396 

1397 Returns: 

1398 float: Mean of number of occurrences of unique values in vals 

1399 float: Average deviation from mean number of occurrences 

1400 int: Number of unique entries 

1401 ''' 

1402 unique_vals, counts = np.unique(vals, return_counts=True) 

1403 

1404 if len(counts) > 0: 

1405 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts) 

1406 else: 

1407 return None, None, 0 

1408 

1409 def bar_plot_thing( 

1410 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None 

1411 ): # pragma: no cover 

1412 ''' 

1413 Make a bar plot about something! 

1414 

1415 Args: 

1416 x (Numpy.ndarray of dimension 1): x values for bar plot 

1417 thing (str): Some key stored in the stats that will go on the y-axis 

1418 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting 

1419 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

1420 faults (bool): Whether to load stats with faults or without 

1421 name (str): Optional name for the plot 

1422 op (function): Operation that is applied to thing before plotting default is recovery rate 

1423 args (dict): Parameters for how the plot should look 

1424 

1425 Returns: 

1426 None 

1427 ''' 

1428 if ax is None: 

1429 fig, ax = plt.subplots(1, 1) 

1430 store = True 

1431 op = self.mean if op is None else op 

1432 

1433 # get the values for the bars 

1434 height = np.zeros(len(self.strategies)) 

1435 for strategy_idx in range(len(self.strategies)): 

1436 strategy = self.strategies[strategy_idx] 

1437 

1438 # load the values 

1439 dat = self.load(strategy=strategy, faults=faults) 

1440 no_faults = self.load(strategy=strategy, faults=False) 

1441 

1442 # check if we have a mask 

1443 if mask is None: 

1444 mask = np.ones_like(dat[thing], dtype=bool) 

1445 

1446 height[strategy_idx] = op(dat, no_faults, thing, mask) 

1447 

1448 # get some x values 

1449 x = np.arange(len(self.strategies)) if x is None else x 

1450 

1451 # prepare labels 

1452 ticks = [strategy.bar_plot_x_label for strategy in self.strategies] 

1453 

1454 ax.bar(x, height, tick_label=ticks) 

1455 

1456 # set the parameters 

1457 ax.set_ylabel(thing) 

1458 args = {} if args is None else args 

1459 [plt.setp(ax, k, v) for k, v in args.items()] 

1460 

1461 if store: 

1462 fig.tight_layout() 

1463 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True) 

1464 plt.close(fig) 

1465 

1466 return None 

1467 

1468 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover 

1469 func_args = locals() 

1470 func_args.pop('self', None) 

1471 if strategy is None: 

1472 for strat in self.strategies: 

1473 args = {**func_args, 'strategy': strat} 

1474 self.fault_frequency_plot(**args) 

1475 return None 

1476 

1477 # load data 

1478 all_data = {} 

1479 for me in kwargs_range['fault_frequency_iter']: 

1480 self.kwargs['fault_frequency_iter'] = me 

1481 self.get_recovered() 

1482 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular') 

1483 

1484 # get_recovery_rate 

1485 results = {} 

1486 results['frequencies'] = list(all_data.keys()) 

1487 results['recovery_rate'] = [ 

1488 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered']) 

1489 for key in all_data.keys() 

1490 ] 

1491 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()] 

1492 results['iterations'] = [ 

1493 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys() 

1494 ] 

1495 

1496 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style) 

1497 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'}) 

1498 

1499 ax.set_xscale('log') 

1500 ax.set_xlabel('iterations between fault') 

1501 ax.set_ylabel('recovery rate') 

1502 iter_ax.set_ylabel('average total iterations if not crashed (dashed)') 

1503 ax.legend(frameon=False) 

1504 

1505 return None 

1506 

1507 def get_HR_tol(self, verbose=False): 

1508 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

1509 

1510 HR_strategy = HotRodStrategy(useMPI=self.use_MPI) 

1511 

1512 description = HR_strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

1513 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2 

1514 

1515 stats, _, _ = self.single_run(HR_strategy, force_params=description) 

1516 

1517 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate') 

1518 diff = [] 

1519 for i in range(len(e_extrapolation)): 

1520 if e_extrapolation[i][1] is not None: 

1521 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0]) 

1522 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])] 

1523 

1524 max_diff = max(diff) 

1525 proposed_tol = 2 * max_diff 

1526 if verbose: 

1527 print( 

1528 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.' 

1529 ) 

1530 else: 

1531 print(f'{proposed_tol:.6e}') 

1532 

1533 

def check_local_error():  # pragma: no cover
    """
    Make a plot of the resolution over time for all problems
    """
    # removed a dead assignment that was immediately overwritten; extend this list
    # (e.g. [run_vdp, run_Lorenz, run_Schroedinger, run_quench]) to inspect more problems
    problems = [run_quench]
    strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]

    for problem in problems:
        stats_analyser = FaultStats(
            prob=problem,
            strategies=strategies,
            faults=[False],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problem, 0),
            num_procs=1,
            mode='random',
        )
        stats_analyser.compare_strategies()
    plt.show()

1555 

1556 

def parse_args():
    """
    Parse a few known options from the command line into keyword arguments.

    Returns:
        dict: Keyword arguments recognized on the command line
    """
    import sys

    kwargs = {}

    # mapping from command line value to problem function
    available_probs = {
        'run_Lorenz': run_Lorenz,
        'run_vdp': run_vdp,
        'run_Schroedinger': run_Schroedinger,
        'run_quench': run_quench,
        'run_AC': run_AC,
        'run_RBC': run_RBC,
        'run_GS': run_GS,
    }

    # flags are matched by substring; the value is always the following argument
    for i in range(len(sys.argv)):
        arg = sys.argv[i]
        if 'prob' in arg:
            prob_name = sys.argv[i + 1]
            if prob_name not in available_probs:
                raise NotImplementedError
            kwargs['prob'] = available_probs[prob_name]
        elif 'num_procs' in arg:
            kwargs['num_procs'] = int(sys.argv[i + 1])
        elif 'runs' in arg:
            kwargs['runs'] = int(sys.argv[i + 1])
        elif 'mode' in arg:
            kwargs['mode'] = str(sys.argv[i + 1])
        elif 'reload' in arg:
            kwargs['reload'] = sys.argv[i + 1] != 'False'

    return kwargs

1590 

1591 

def compare_adaptivity_modes():
    """
    Compare recovery rates of adaptivity with different restart modes.

    Makes a 2x2 grid of plots, one per problem, each showing the recovery rate
    per flipped bit for fixable faults: first for a single process, then for
    4 processes with two restart variants of adaptivity. Shows the figure.
    """
    from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep

    fig, axs = plt.subplots(2, 2, sharey=True)
    problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
    titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']

    for i in range(len(problems)):
        ax = axs.flatten()[i]

        ax.set_title(titles[i])

        # single-process reference run with plain adaptivity
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=1,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            # only plot faults the strategy can fix in principle
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate', 'label': 'bla'},
                name='fixable_recovery',
                ax=ax,
            )
        # restyle the line that was just added as the single-process reference
        single_proc = ax.lines[-1]
        single_proc.set_label('single process')
        single_proc.set_color('black')

        # 4-process comparison of the two restart modes (note the looser threshold)
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.5,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=4,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate'},
                name='fixable_recovery',
                ax=ax,
            )
    fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
    fig.tight_layout()
    plt.show()

1663 

1664 

def main():
    """
    Generate fault statistics for the problem selected on the command line and
    produce the standard set of recovery-rate and iteration-count plots.

    Stats generation may run on multiple MPI ranks; only rank 0 does the
    plotting. Figures are saved under data/ and finally shown.
    """
    # defaults, overridden by anything supplied on the command line
    kwargs = {
        'prob': run_vdp,
        'num_procs': 1,
        'mode': 'default',
        'runs': 4000,
        'reload': True,
        **parse_args(),
    }

    strategy_args = {
        'stop_at_nan': True,
    }

    from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy

    stats_analyser = FaultStats(
        strategies=[
            BaseStrategy(**strategy_args),
            AdaptivityStrategy(**strategy_args),
            kAdaptivityStrategy(**strategy_args),
            HotRodStrategy(**strategy_args),
            AdaptivityPolynomialError(**strategy_args),
        ],
        faults=[False, True],
        recovery_thresh=1.1,
        recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
        stats_path='data/stats-jusuf',
        **kwargs,
    )
    stats_analyser.get_recovered()

    stats_analyser.run_stats_generation(runs=kwargs['runs'], step=8)

    if MPI.COMM_WORLD.rank > 0:  # make sure only one rank accesses the data
        return None

    stats_analyser.get_recovered()
    mask = None

    # recovery rate per node, iteration and bit across all strategies
    # stats_analyser.compare_strategies()
    stats_analyser.plot_things_per_things(
        'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )
    stats_analyser.plot_things_per_things(
        'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    stats_analyser.plot_things_per_things(
        'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    # make a plot for only the faults that can be recovered
    fig, ax = plt.subplots(1, 1)
    for strategy in stats_analyser.strategies:
        fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
        stats_analyser.plot_things_per_things(
            'recovered',
            'bit',
            False,
            strategies=[strategy],
            op=stats_analyser.rec_rate,
            mask=fixable,
            args={'ylabel': 'recovery rate'},
            name='fixable_recovery',
            ax=ax,
        )
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)

    # recovery rate as a function of the acceptance threshold (log-spaced scan)
    fig, ax = plt.subplots(1, 1, figsize=(13, 4))
    stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
    # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
    ax.set_xscale('log')
    ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)

    # iteration counts and error per flipped bit
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.mean,
        mask=mask,
        args={'yscale': 'log', 'ylabel': 'total iterations'},
    )
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.extra_mean,
        mask=mask,
        args={'yscale': 'linear', 'ylabel': 'extra iterations'},
        name='extra_iter',
    )
    stats_analyser.plot_things_per_things(
        'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
    )

    # narrower, linearly spaced threshold scan in a separate figure
    stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
    plt.show()

1766 

1767 

if __name__ == "__main__":
    # entry point; swap in one of the alternatives below to run a different analysis
    main()
    # compare_adaptivity_modes()
    # check_local_error()