Coverage for pySDC / projects / Resilience / fault_stats.py: 56%

451 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-27 07:06 +0000

1import numpy as np 

2import matplotlib.pyplot as plt 

3from mpi4py import MPI 

4import pickle 

5 

6import pySDC.helpers.plot_helper as plot_helper 

7from pySDC.helpers.stats_helper import get_sorted 

8 

9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData 

10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook 

11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity 

13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep 

14from pySDC.implementations.hooks.log_work import LogWork 

15 

16# these problems are available for testing 

17from pySDC.projects.Resilience.advection import run_advection 

18from pySDC.projects.Resilience.vdp import run_vdp 

19from pySDC.projects.Resilience.piline import run_piline 

20from pySDC.projects.Resilience.Lorenz import run_Lorenz 

21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger 

22from pySDC.projects.Resilience.quench import run_quench 

23from pySDC.projects.Resilience.AC import run_AC 

24from pySDC.projects.Resilience.RBC import run_RBC 

25from pySDC.projects.Resilience.GS import run_GS 

26 

27from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy 

28import logging 

29 

# reset matplotlib to the project-wide defaults before any plotting happens
plot_helper.setup_mpl(reset=True)

# logger level used for FaultStats and the controllers (40 == logging.ERROR)
LOGGER_LEVEL = 40

# absolute recovery thresholds per problem; keys are the run functions imported above.
# Currently empty -- the commented entries document previously used values.
RECOVERY_THRESH_ABS = {
    # run_quench: 5e-3,
    # run_Schroedinger: 1.7e-6,
}

38 

39 

40class FaultStats: 

41 ''' 

42 Class to generate and analyse fault statistics 

43 ''' 

44 

45 def __init__( 

46 self, 

47 prob=None, 

48 strategies=None, 

49 faults=None, 

50 reload=True, 

51 recovery_thresh=1 + 1e-3, 

52 recovery_thresh_abs=0.0, 

53 num_procs=1, 

54 mode='default', 

55 stats_path='data/stats', 

56 use_MPI=False, 

57 **kwargs, 

58 ): 

59 ''' 

60 Initialization routine 

61 

62 Args: 

63 prob: A function that runs a pySDC problem, see imports for available problems 

64 strategies (list): List of resilience strategies 

65 faults (list): List of booleans that describe whether to use faults or not 

66 reload (bool): Load previously computed statistics and continue from there or start from scratch 

67 recovery_thresh (float): Relative threshold for recovery 

68 num_procs (int): Number of processes 

69 mode (str): Mode for fault generation: Either 'random' or 'combination' 

70 ''' 

71 self.prob = prob 

72 self.strategies = [None] if strategies is None else strategies 

73 self.faults = [False, True] if faults is None else faults 

74 self.reload = reload 

75 self.recovery_thresh = recovery_thresh 

76 self.recovery_thresh_abs = recovery_thresh_abs 

77 self.num_procs = num_procs 

78 self.use_MPI = use_MPI 

79 self.stats_path = stats_path 

80 self.kwargs = { 

81 'fault_frequency_iter': 500, 

82 **kwargs, 

83 } 

84 

85 # decide mode 

86 if mode == 'default': 

87 if prob.__name__ in ['run_vdp', 'run_Lorenz']: 

88 self.mode = 'combination' 

89 else: 

90 self.mode = 'random' 

91 else: 

92 self.mode = mode 

93 

94 self.logger = logging.getLogger('FaultStats') 

95 self.logger.level = LOGGER_LEVEL 

96 

97 msg = 'Starting FaultStats with attributes' 

98 for key, val in self.__dict__.items(): 

99 if key in ['logger']: 

100 continue 

101 item = [str(me) for me in val] if type(val) == list else str(val) 

102 msg += '\n\t' f'{key}: {item}' 

103 self.logger.log(25, msg) 

104 

105 def get_Tend(self): 

106 ''' 

107 Get the final time of runs for fault stats based on the problem 

108 

109 Returns: 

110 float: Tend to put into the run 

111 ''' 

112 return self.strategies[0].get_Tend(self.prob, self.num_procs, resilience_experiment=True) 

113 

    def run_stats_generation(
        self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
    ):
        '''
        Run the generation of stats for all strategies in the `self.strategies` variable

        Args:
            runs (int): Number of runs you want to do
            step (int): Number of runs you want to do between saving
            comm (MPI.Communicator): Communicator for distributing runs
            kwargs_range (dict): Range for the parameters
            faults (bool): Whether to do stats with faults or without
            _reload, _runs_partial: Variables only used for recursion. Do not change!

        Returns:
            None
        '''
        # no faults flag given: recurse once per entry of self.faults (typically [False, True])
        if faults is None:
            for f in self.faults:
                self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
            return None

        # sweep over parameter ranges: fix one value per key and recurse for the rest
        # (iterating `{}` in the else case simply yields nothing)
        for key, val in kwargs_range.items() if kwargs_range is not None else {}:
            if type(val) == int:
                # a single int is a fixed value, not a range
                self.kwargs[key] = val
            else:
                for me in val:
                    kwargs_range_me = {**kwargs_range, key: me}
                    self.run_stats_generation(
                        runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
                    )
                return None

        comm = MPI.COMM_WORLD if comm is None else comm
        # by default, save after every rank has done one run each
        step = (runs if comm.size == 1 else comm.size) if step is None else step
        _runs_partial = step if _runs_partial == 0 else _runs_partial
        reload = self.reload or _reload

        # see if we limit the number of runs we want to do
        # fault-free runs are deterministic, so at most 5 of them are needed
        max_runs = (
            (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
        )

        if reload:
            # sort the strategies to do some load balancing
            sorting_index = None
            if comm.rank == 0:
                already_completed = np.array(
                    [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
                )
                # least-completed strategies first; drop strategies that are already finished
                sorting_index_ = np.argsort(already_completed)
                sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
                _runs_partial = max(min(already_completed), _runs_partial)

            # tell all ranks what strategies to use
            sorting_index = comm.bcast(sorting_index, root=0)
            _runs_partial = comm.bcast(_runs_partial, root=0)
            strategies = [self.strategies[i] for i in sorting_index]
            if len(strategies) == 0:  # check if we are already done
                return None
        else:
            strategies = self.strategies

        # one sub-communicator per strategy so strategies can be processed in parallel
        strategy_comm = comm.Split(comm.rank % len(strategies))

        for j in range(0, len(strategies), comm.size):
            self.generate_stats(
                # round-robin assignment of strategies to ranks, clamped to the strategies that remain
                strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
                runs=min(_runs_partial, max_runs),
                faults=faults,
                reload=reload,
                comm=strategy_comm,
            )
        strategy_comm.Free()
        # recurse to do the next chunk of `step` runs until max_runs is reached
        self.run_stats_generation(
            runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
        )

        return None

193 

    def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
        '''
        Generate statistics for recovery from bit flips
        -----------------------------------------------

        Every run is given a different random seed such that we have different faults and the results are then stored

        Args:
            strategy (Strategy): Resilience strategy
            runs (int): Number of runs you want to do
            reload (bool): Load previously computed statistics and continue from there or start from scratch
            faults (bool): Whether to do stats with faults or without
            comm (MPI.Communicator): Communicator for distributing runs

        Returns:
            None
        '''
        comm = MPI.COMM_WORLD if comm is None else comm

        # initialize dictionary to store the stats in; one slot per run for each recorded quantity
        dat = {
            'level': np.zeros(runs),
            'iteration': np.zeros(runs),
            'node': np.zeros(runs),
            'problem_pos': [],
            'bit': np.zeros(runs),
            'error': np.zeros(runs),
            'total_iteration': np.zeros(runs),
            'total_newton_iteration': np.zeros(runs),
            'restarts': np.zeros(runs),
            'target': np.zeros(runs),
            'rank': np.zeros(runs),
        }

        # store arguments for storing and loading
        identifier_args = {
            'faults': faults,
            'strategy': strategy,
        }

        # reload previously recorded stats and write them to dat
        if reload:
            already_completed_ = None
            if comm.rank == 0:
                already_completed_ = self.load(**identifier_args)
            already_completed = comm.bcast(already_completed_, root=0)
            if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
                avail_len = min([already_completed['runs'], runs])
                for k in dat.keys():
                    dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
        else:
            already_completed = {'runs': 0}

        # prepare a message
        involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
        msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
        if comm.rank == 0 and already_completed['runs'] < runs:
            print(msg, flush=True)

        # single-rank communicator for the space dimension of each run
        space_comm = MPI.COMM_SELF.Split(True)

        # perform the remaining experiments
        for i in range(already_completed['runs'], runs):
            # round-robin distribution of the runs over the ranks
            if i % comm.size != comm.rank:
                continue

            # perform a single experiment with the correct random seed
            stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)

            # get the data from the stats
            faults_run = get_sorted(stats, type='bitflip')

            if faults:
                assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
                # unpack the recorded fault metadata: (level, iteration, node, position, bit, target, rank)
                dat['level'][i] = faults_run[0][1][0]
                dat['iteration'][i] = faults_run[0][1][1]
                dat['node'][i] = faults_run[0][1][2]
                dat['problem_pos'] += [faults_run[0][1][3]]
                dat['bit'][i] = faults_run[0][1][4]
                dat['target'][i] = faults_run[0][1][5]
                dat['rank'][i] = faults_run[0][1][6]

            if crash:
                print('Code crashed!')
                # infinite error marks a crashed run
                dat['error'][i] = np.inf
                continue

            # record the rest of the data
            t, u = get_sorted(stats, type='u', recomputed=False)[-1]
            error = self.get_error(u, t, controller, strategy, self.get_Tend())
            total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
            total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])

            dat['error'][i] = error
            dat['total_iteration'][i] = total_iteration
            dat['total_newton_iteration'][i] = total_newton_iteration
            dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])

        # free the space communicator
        space_comm.Free()

        # combine the results of all ranks; each rank only filled its own slots, the others are zero
        dat_full = {}
        for k in dat.keys():
            dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)

        # store the completed stats
        dat_full['runs'] = runs

        if already_completed['runs'] < runs:
            if comm.rank == 0:
                self.store(dat_full, **identifier_args)
                # NOTE(review): `self.faults` is a list (e.g. [False, True]) and is truthy whenever
                # non-empty; possibly the local boolean `faults` was intended here -- confirm
                if self.faults:
                    self.get_recovered(strategy=strategy)

        return None

309 

310 def get_error(self, u, t, controller, strategy, Tend): 

311 """ 

312 Compute the error. 

313 

314 Args: 

315 u (dtype_u): The solution at the end of the run 

316 t (float): Time at which `u` was recorded 

317 controller (pySDC.controller.controller): The controller 

318 strategy (Strategy): The resilience strategy 

319 Tend (float): Time you want to run to 

320 

321 Returns: 

322 float: Error 

323 """ 

324 lvl = controller.MS[0].levels[0] 

325 

326 # decide if we reached Tend 

327 Tend_reached = t + lvl.params.dt_initial >= Tend 

328 

329 if Tend_reached: 

330 u_star = lvl.prob.u_exact(t=t) 

331 return abs(u - u_star) / abs(u_star) 

332 else: 

333 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}') 

334 return np.inf 

335 

336 def single_run( 

337 self, 

338 strategy, 

339 run=0, 

340 faults=False, 

341 force_params=None, 

342 hook_class=None, 

343 space_comm=None, 

344 Tend=None, 

345 time_comm=None, 

346 ): 

347 ''' 

348 Run the problem once with the specified parameters 

349 

350 Args: 

351 strategy (Strategy): The resilience strategy you plan on using 

352 run (int): Index for fault generation 

353 faults (bool): Whether or not to put faults in 

354 force_params (dict): Change parameters in the description of the problem 

355 space_comm (MPI.Communicator): A communicator for space parallelisation 

356 Tend (float): Time you want to run to 

357 time_comm (MPI.Intracomm.Communicator): Communicator in time 

358 

359 Returns: 

360 dict: Stats object containing statistics for each step, each level and each iteration 

361 pySDC.Controller: The controller of the run 

362 float: The time the problem should have run to 

363 ''' 

364 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class) 

365 force_params = {} if force_params is None else force_params 

366 

367 # build the custom description 

368 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

369 for k in force_params.keys(): 

370 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]} 

371 

372 custom_controller_params = { 

373 'logger_level': LOGGER_LEVEL, 

374 **force_params.get('controller_params', {}), 

375 } 

376 

377 if faults: 

378 fault_stuff = { 

379 'rng': None, 

380 'args': strategy.get_fault_args(self.prob, self.num_procs), 

381 'rnd_params': strategy.get_random_params(self.prob, self.num_procs), 

382 } 

383 

384 # make parameters for faults: 

385 if self.mode == 'random': 

386 fault_stuff['rng'] = np.random.RandomState(run) 

387 elif self.mode == 'combination': 

388 fault_stuff['rng'] = run 

389 elif self.mode == 'regular': 

390 fault_stuff['rng'] = np.random.RandomState(run) 

391 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter'] 

392 fault_stuff['rnd_params'] = { 

393 'bit': 12, 

394 'min_node': 1, 

395 } 

396 else: 

397 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}') 

398 

399 else: 

400 fault_stuff = None 

401 

402 return self.prob( 

403 custom_description=custom_description, 

404 num_procs=self.num_procs, 

405 hook_class=hook_class, 

406 fault_stuff=fault_stuff, 

407 Tend=self.get_Tend() if Tend is None else Tend, 

408 custom_controller_params=custom_controller_params, 

409 space_comm=space_comm, 

410 comm=time_comm, 

411 use_MPI=time_comm is not None, 

412 ) 

413 

414 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover 

415 ''' 

416 Take a closer look at how the strategies compare for a specific run 

417 

418 Args: 

419 run (int): The number of the run to get the appropriate random generator 

420 faults (bool): Whether or not to include faults 

421 ax (Matplotlib.axes): Somewhere to plot 

422 

423 Returns: 

424 None 

425 ''' 

426 if ax is None: 

427 fig, ax = plt.subplots(1, 1) 

428 store = True 

429 else: 

430 store = False 

431 

432 k_ax = ax.twinx() 

433 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies] 

434 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))] 

435 

436 # make a legend 

437 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies] 

438 k_ax.legend(frameon=True) 

439 

440 if store: 

441 fig.tight_layout() 

442 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True) 

443 

    def scrutinize_visual(
        self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
    ):  # pragma: no cover
        '''
        Take a closer look at a specific run with a plot

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            run (int): The number of the run to get the appropriate random generator
            faults (bool): Whether or not to include faults
            ax (Matplotlib.axes): Somewhere to plot the error
            k_ax (Matplotlib.axes): Somewhere to plot the iterations
            ls (str): Line style for the error plot
            plot_restarts (bool): Make vertical lines wherever restarts happened

        Returns:
            None
        '''
        # only store the figure to disk if we created it here
        if ax is None:
            fig, ax = plt.subplots(1, 1)
            store = True
        else:
            store = False

        force_params = {}

        stats, controller, Tend = self.single_run(
            strategy=strategy,
            run=run,
            faults=faults,
            force_params=force_params,
            hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
        )

        # plot the local error
        e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
        ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)

        # plot the iterations (cumulative, on a secondary y-axis)
        k_ax = ax.twinx() if k_ax is None else k_ax
        k = get_sorted(stats, type='k')
        k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')

        # plot the faults
        # NOTE(review): this rebinds the boolean `faults` parameter to the list of recorded faults
        faults = get_sorted(stats, type='bitflip')
        for fault_time in [me[0] for me in faults]:
            ax.axvline(fault_time, color='grey', ls=':')

        # plot restarts
        if plot_restarts:
            restarts = get_sorted(stats, type='restart')
            [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]

        # decorate
        ax.set_yscale('log')
        ax.set_ylabel(r'$\epsilon$')
        k_ax.set_ylabel('cumulative iterations (dashed)')
        ax.set_xlabel(r'$t$')

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)

505 

    def scrutinize(self, strategy, logger_level=15, **kwargs):
        '''
        Take a closer look at a specific run: print faults, errors, iteration counts, step sizes and restarts.

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            logger_level (int): Logger level to force on the controller

        Returns:
            dict: The stats of the run
        '''
        from pySDC.projects.Resilience.fault_injection import FaultInjector

        force_params = {}
        force_params['controller_params'] = {'logger_level': logger_level}
        # always compute the residual so the printed diagnostics are meaningful
        force_params['sweeper_params'] = {'skip_residual_computation': ()}

        stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)

        # pick the solution at the latest recorded time
        u_all = get_sorted(stats, type='u', recomputed=False)
        t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
        k = [me[1] for me in get_sorted(stats, type='k')]

        fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]

        # faults that were scheduled but never actually inserted (e.g. because the run ended early)
        unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []

        print(f'\nOverview for {strategy.name} strategy')
        # print faults
        faults = get_sorted(stats, type='bitflip')
        print('\nfaults:')
        print('    t   | level | iter | node | bit | trgt | rank | pos')
        print('--------+-------+------+------+-----+------+---------------------')
        for f in faults:
            print(
                f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
                f[1][3],
            )
        print('--------+-------+------+------+-----+------+---------------------')
        # un-inserted faults are marked with a leading '!'
        for f in unhappened_faults:
            print(
                f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
                f.problem_pos,
            )

        # see if we can determine if the faults where recovered
        no_faults = self.load(strategy=strategy, faults=False)
        e_star = np.mean(no_faults.get('error', [0]))
        error = self.get_error(u, t, controller, strategy, Tend)
        recovery_thresh = self.get_thresh(strategy)

        print(
            f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
        )
        print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')

        _newton_iter = get_sorted(stats, type='work_newton')
        if len(_newton_iter) > 0:
            newton_iter = [me[1] for me in _newton_iter]
            print(
                f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
            )

        # checkout the step size
        dt = [me[1] for me in get_sorted(stats, type='dt')]
        print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')

        # restarts
        restarts = [me[1] for me in get_sorted(stats, type='restart')]
        print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')

        return stats

577 

578 def convert_faults(self, faults): 

579 ''' 

580 Make arrays of useable data from an entry in the stats object returned by pySDC 

581 

582 Args: 

583 faults (list): The entry for faults returned by the pySDC run 

584 

585 Returns: 

586 list: The times when the faults happened 

587 list: The levels in which the faults happened 

588 list: The iterations in which the faults happened 

589 list: The nodes in which the faults happened 

590 list: The problem positions in which the faults happened 

591 list: The bits in which the faults happened 

592 ''' 

593 time = [faults[i][0] for i in range(len(faults))] 

594 level = [faults[i][1][0] for i in range(len(faults))] 

595 iteration = [faults[i][1][1] for i in range(len(faults))] 

596 node = [faults[i][1][2] for i in range(len(faults))] 

597 problem_pos = [faults[i][1][3] for i in range(len(faults))] 

598 bit = [faults[i][1][4] for i in range(len(faults))] 

599 return time, level, iteration, node, problem_pos, bit 

600 

601 def get_path(self, **kwargs): 

602 ''' 

603 Get the path to where the stats are stored 

604 

605 Args: 

606 strategy (Strategy): The resilience strategy 

607 faults (bool): Whether or not faults have been activated 

608 

609 Returns: 

610 str: The path to what you are looking for 

611 ''' 

612 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle' 

613 

614 def get_name(self, strategy=None, faults=True, mode=None): 

615 ''' 

616 Function to get a unique name for a kind of statistics based on the problem and strategy that was used 

617 

618 Args: 

619 strategy (Strategy): Resilience strategy 

620 faults (bool): Whether or not faults where inserted 

621 

622 Returns: 

623 str: The unique identifier 

624 ''' 

625 if self.prob == run_advection: 

626 prob_name = 'advection' 

627 elif self.prob == run_vdp: 

628 prob_name = 'vdp' 

629 elif self.prob == run_piline: 

630 prob_name = 'piline' 

631 elif self.prob == run_Lorenz: 

632 prob_name = 'Lorenz' 

633 elif self.prob == run_Schroedinger: 

634 prob_name = 'Schroedinger' 

635 elif self.prob == run_quench: 

636 prob_name = 'Quench' 

637 elif self.prob.__name__ == 'run_AC': 

638 prob_name = 'Allen-Cahn' 

639 elif self.prob.__name__ == 'run_RBC': 

640 prob_name = 'Rayleigh-Benard' 

641 elif self.prob.__name__ == 'run_GS': 

642 prob_name = 'Gray-Scott' 

643 else: 

644 raise NotImplementedError(f'Name not implemented for problem {self.prob}') 

645 

646 if faults: 

647 fault_name = '-faults' 

648 else: 

649 fault_name = '' 

650 

651 if strategy is not None: 

652 strategy_name = f'-{strategy.name}' 

653 else: 

654 strategy_name = '' 

655 

656 mode = self.mode if mode is None else mode 

657 if mode == 'regular': 

658 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}' 

659 else: 

660 mode_thing = '' 

661 

662 mpi_thing = '-MPI' if self.use_MPI else '' 

663 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}' 

664 

665 def store(self, dat, **kwargs): 

666 ''' 

667 Stores the data for a run at a predefined path 

668 

669 Args: 

670 dat (dict): The data of the recorded statistics 

671 

672 Returns: 

673 None 

674 ''' 

675 with open(self.get_path(**kwargs), 'wb') as f: 

676 pickle.dump(dat, f) 

677 return None 

678 

679 def load(self, **kwargs): 

680 ''' 

681 Loads the stats belonging to a specific strategy and whether or not faults where inserted. 

682 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs, 

683 which is 0 of course. 

684 

685 Returns: 

686 dict: Data from previous run or if it is not available a placeholder dictionary 

687 ''' 

688 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)]) 

689 

690 try: 

691 with open(self.get_path(**kwargs), 'rb') as f: 

692 dat = pickle.load(f) 

693 except FileNotFoundError: 

694 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!') 

695 return {'runs': 0} 

696 return dat 

697 

698 def get_thresh(self, strategy=None): 

699 """ 

700 Get recovery threshold based on relative and absolute tolerances 

701 

702 Args: 

703 strategy (Strategy): The resilience strategy 

704 """ 

705 fault_free = self.load(strategy=strategy, faults=False) 

706 assert ( 

707 fault_free['error'].std() / fault_free['error'].mean() < 1e-5 

708 ), f'Got too large variation of errors in {strategy} strategy!' 

709 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()]) 

710 

711 def get_recovered(self, **kwargs): 

712 ''' 

713 Determine the recovery rate for a specific strategy and store it to disk. 

714 

715 Returns: 

716 None 

717 ''' 

718 if 'strategy' not in kwargs.keys(): 

719 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies] 

720 else: 

721 try: 

722 with_faults = self.load(faults=True, **kwargs) 

723 with_faults['recovered'] = with_faults['error'] <= self.get_thresh(kwargs['strategy']) 

724 self.store(faults=True, dat=with_faults, **kwargs) 

725 except KeyError as error: 

726 print( 

727 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}' 

728 ) 

729 

730 return None 

731 

732 def crash_rate(self, dat, no_faults, thingA, mask): 

733 ''' 

734 Determine the rate of runs that crashed 

735 

736 Args: 

737 dat (dict): The data of the recorded statistics with faults 

738 no_faults (dict): The data of the corresponding fault-free stats 

739 thingA (str): Some key stored in the stats that will go on the y-axis 

740 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate 

741 

742 Returns: 

743 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask 

744 ''' 

745 if len(dat[thingA][mask]) > 0: 

746 crash = dat['error'] == np.inf 

747 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask]) 

748 else: 

749 return None 

750 

751 def rec_rate(self, dat, no_faults, thingA, mask): 

752 ''' 

753 Operation for plotting which returns the recovery rate for a given mask. 

754 Which thingA you apply this to actually does not matter here since we compute a rate. 

755 

756 Args: 

757 dat (dict): The recorded statistics 

758 no_faults (dict): The corresponding fault-free stats 

759 thingA (str): Some key stored in the stats 

760 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

761 

762 Returns: 

763 float: Recovery rate 

764 ''' 

765 if len(dat[thingA][mask]) > 0: 

766 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask]) 

767 else: 

768 return None 

769 

770 def mean(self, dat, no_faults, thingA, mask): 

771 ''' 

772 Operation for plotting which returns the mean of thingA after applying the mask 

773 

774 Args: 

775 dat (dict): The recorded statistics 

776 no_faults (dict): The corresponding fault-free stats 

777 thingA (str): Some key stored in the stats 

778 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

779 

780 Returns: 

781 float: Mean of thingA after applying mask 

782 ''' 

783 return np.mean(dat[thingA][mask]) 

784 

785 def extra_mean(self, dat, no_faults, thingA, mask): 

786 ''' 

787 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after 

788 applying the mask 

789 

790 Args: 

791 dat (dict): The recorded statistics 

792 no_faults (dict): The corresponding fault-free stats 

793 thingA (str): Some key stored in the stats 

794 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

795 

796 Returns: 

797 float: Difference in mean of thingA between runs with and without faults after applying mask 

798 ''' 

799 if True in mask or int in [type(me) for me in mask]: 

800 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA]) 

801 else: 

802 return None 

803 

    def plot_thingA_per_thingB(
        self,
        strategy,
        thingA,
        thingB,
        ax=None,
        mask=None,
        recovered=False,
        op=None,
        **kwargs,
    ):  # pragma: no cover
        '''
        Plot thingA vs. thingB for a single strategy

        Args:
            strategy (Strategy): The resilience strategy you want to plot
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            ax (Matplotlib.axes): Somewhere to plot
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            recovered (bool): Show the plot for both all runs and only the recovered ones
            op (function): Operation that is applied to thingA before plotting default is recovery rate

        Returns:
            None
        '''
        op = self.rec_rate if op is None else op
        dat = self.load(strategy=strategy, faults=True)
        no_faults = self.load(strategy=strategy, faults=False)

        if mask is None:
            mask = np.ones_like(dat[thingB], dtype=bool)

        # one data point per distinct (masked) value of thingB
        admissable_thingB = np.unique(dat[thingB][mask])
        me = np.zeros(len(admissable_thingB))
        me_recovered = np.zeros_like(me)

        for i in range(len(me)):
            # restrict the mask to runs with this particular value of thingB
            _mask = np.logical_and((dat[thingB] == admissable_thingB[i]), mask)
            if _mask.any():
                me[i] = op(dat, no_faults, thingA, _mask)
                me_recovered[i] = op(dat, no_faults, thingA, np.logical_and(_mask, dat['recovered']))

        if recovered:
            # dashed line for the subset of recovered runs
            ax.plot(
                admissable_thingB,
                me_recovered,
                label=f'{strategy.label} (only recovered)',
                color=strategy.color,
                marker=strategy.marker,
                ls='--',
                linewidth=3,
                **kwargs,
            )

        ax.plot(
            admissable_thingB,
            me,
            label=f'{strategy.label}',
            color=strategy.color,
            marker=strategy.marker,
            linewidth=2,
            **kwargs,
        )

        ax.legend(frameon=False)
        ax.set_xlabel(thingB)
        ax.set_ylabel(thingA)
        return None

873 

    def plot_things_per_things(
        self,
        thingA='bit',
        thingB='bit',
        recovered=False,
        mask=None,
        op=None,
        args=None,
        strategies=None,
        name=None,
        store=False,
        ax=None,
        fig=None,
        plotting_args=None,
    ):  # pragma: no cover
        '''
        Plot thingA vs thingB for multiple strategies

        Args:
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            recovered (bool): Show the plot for both all runs and only the recovered ones
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            op (function): Operation that is applied to thingA before plotting default is recovery rate
            args (dict): Parameters for how the plot should look
            strategies (list): List of the strategies you want to plot, if None, all will be plotted
            name (str): Optional name for the plot
            store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
            ax (Matplotlib.axes): Somewhere to plot
            fig (Matplotlib.figure): Figure of the ax
            plotting_args (dict): Keyword arguments forwarded to the plotting calls

        Returns
            None
        '''
        strategies = self.strategies if strategies is None else strategies
        args = {} if args is None else args

        # make sure we have something to plot in
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        elif fig is None:
            # without a figure handle the plot cannot be stored
            store = False

        # execute the plots for all strategies
        for s in strategies:
            self.plot_thingA_per_thingB(
                s,
                thingA=thingA,
                thingB=thingB,
                recovered=recovered,
                ax=ax,
                mask=mask,
                op=op,
                **(plotting_args if plotting_args else {}),
            )

        # set the parameters
        [plt.setp(ax, k, v) for k, v in args.items()]

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
            plt.close(fig)

        return None

939 

940 def plot_recovery_thresholds( 

941 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs 

942 ): # pragma: no cover 

943 ''' 

944 Plot the recovery rate for a range of thresholds 

945 

946 Args: 

947 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

948 thresh_range (list): thresholds for deciding whether to accept as recovered 

949 ax (Matplotlib.axes): Somewhere to plot 

950 

951 Returns: 

952 None 

953 ''' 

954 # fill default values if nothing is specified 

955 strategies = self.strategies if strategies is None else strategies 

956 

957 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range 

958 if ax is None: 

959 fig, ax = plt.subplots(1, 1) 

960 

961 rec_rates = [[None] * len(thresh_range)] * len(strategies) 

962 for strategy_idx in range(len(strategies)): 

963 strategy = strategies[strategy_idx] 

964 # load the stats 

965 fault_free = self.load(strategy=strategy, faults=False) 

966 with_faults = self.load(strategy=strategy, faults=True) 

967 

968 if recoverable_only: 

969 recoverable_mask = self.get_fixable_faults_only(strategy) 

970 else: 

971 recoverable_mask = self.get_mask(strategy=strategy) 

972 

973 for thresh_idx in range(len(thresh_range)): 

974 rec_mask = self.get_mask( 

975 strategy=strategy, 

976 key='error', 

977 val=(thresh_range[thresh_idx] * fault_free['error'].mean()), 

978 op='gt', 

979 old_mask=recoverable_mask, 

980 ) 

981 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len( 

982 with_faults['error'][recoverable_mask] 

983 ) 

984 

985 ax.plot( 

986 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs} 

987 ) 

988 ax.legend(frameon=False) 

989 ax.set_ylabel('recovery rate') 

990 ax.set_xlabel('threshold as ratio to fault-free error') 

991 

992 return None 

993 

994 def analyse_adaptivity(self, mask): # pragma: no cover 

995 ''' 

996 Analyse a set of runs with adaptivity 

997 

998 Args: 

999 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1000 

1001 Returns: 

1002 None 

1003 ''' 

1004 index = self.get_index(mask) 

1005 dat = self.load() 

1006 

1007 # make a header 

1008 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ') 

1009 print('-------+-----+------+------+----------+----------+----------+----------') 

1010 for i in index: 

1011 e_em, e_glob = self.analyse_adaptivity_single(int(i)) 

1012 print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\ 

1013 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}') 

1014 

1015 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][ 

1016 Adaptivity 

1017 ]['e_tol'] 

1018 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!') 

1019 return None 

1020 

1021 def analyse_adaptivity_single(self, run): # pragma: no cover 

1022 ''' 

1023 Compute what the difference in embedded and global error are for a specific run with adaptivity 

1024 

1025 Args: 

1026 run (int): The run you want to know about 

1027 

1028 Returns: 

1029 list: Embedded error with fault and without for the last iteration in the step with a fault 

1030 list: Global error with and without fault at the end of the run 

1031 ''' 

1032 # perform one run with and one without faults 

1033 stats = [] 

1034 controllers = [] 

1035 for faults in [True, False]: 

1036 s, c, _ = self.single_run( 

1037 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1038 ) 

1039 stats += [s] 

1040 controllers += [c] 

1041 

1042 # figure out when the fault happened 

1043 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1044 

1045 # get embedded error 

1046 e_em = [ 

1047 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1048 for stat in stats 

1049 ] 

1050 

1051 # compute the global error 

1052 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1053 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1054 

1055 return [e_em[i][-1] for i in [0, 1]], e_glob 

1056 

1057 def analyse_HotRod(self, mask): # pragma: no cover 

1058 ''' 

1059 Analyse a set of runs with Hot Rod 

1060 

1061 Args: 

1062 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1063 

1064 Returns: 

1065 None 

1066 ''' 

1067 index = self.get_index(mask) 

1068 dat = self.load() 

1069 

1070 # make a header 

1071 print(' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \ 

1072| e_glob ') 

1073 print('-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\ 

1074+----------') 

1075 for i in index: 

1076 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i)) 

1077 print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\ 

1078 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \ 

1079{e_glob[1]:.2e} | {e_glob[0]:.2e}') 

1080 

1081 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][ 

1082 'HotRod_tol' 

1083 ] 

1084 print(f'We only restart when diff > tol = {tol:.2e}!') 

1085 return None 

1086 

1087 def analyse_HotRod_single(self, run): # pragma: no cover 

1088 ''' 

1089 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod 

1090 

1091 Args: 

1092 run (int): The run you want to know about 

1093 

1094 Returns: 

1095 list: Embedded error with fault and without for the last iteration in the step with a fault 

1096 list: Extrapolation error with fault and without for the last iteration in the step with a fault 

1097 list: Global error with and without fault at the end of the run 

1098 ''' 

1099 # perform one run with and one without faults 

1100 stats = [] 

1101 controllers = [] 

1102 for faults in [True, False]: 

1103 s, c, _ = self.single_run( 

1104 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1105 ) 

1106 stats += [s] 

1107 controllers += [c] 

1108 

1109 # figure out when the fault happened 

1110 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1111 

1112 # get embedded error 

1113 e_em = [ 

1114 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1115 for stat in stats 

1116 ] 

1117 # get extrapolated error 

1118 e_ex = [ 

1119 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')] 

1120 for stat in stats 

1121 ] 

1122 

1123 # compute the global error 

1124 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1125 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1126 

1127 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob 

1128 

1129 def print_faults(self, mask=None, strategy=None): # pragma: no cover 

1130 ''' 

1131 Print all faults that happened within a certain mask 

1132 

1133 Args: 

1134 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1135 strategy (Strategy): The resilience strategy you want to see the error for 

1136 

1137 Returns: 

1138 None 

1139 ''' 

1140 strategy = BaseStrategy() if strategy is None else strategy 

1141 

1142 index = self.get_index(mask) 

1143 dat = self.load(strategy=strategy) 

1144 

1145 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0])) 

1146 

1147 # make a header 

1148 print(' run | bit | node | iter | rank | rel err | space pos') 

1149 print('-------+-----+------+------+------+---------+-----------') 

1150 for i in index: 

1151 print( 

1152 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}' 

1153 ) 

1154 

1155 return None 

1156 

1157 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False): 

1158 ''' 

1159 Make a mask to apply to stored data to filter out certain things 

1160 

1161 Args: 

1162 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all 

1163 strategies so None is fine 

1164 key (str): The key in the stored statistics that you want to filter for some value 

1165 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key 

1166 op (str): Operation that is applied for filtering 

1167 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter 

1168 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs 

1169 

1170 Returns: 

1171 Numpy.ndarray with boolean entries that can be used as a mask 

1172 ''' 

1173 strategy = self.strategies[0] if strategy is None else strategy 

1174 dat = self.load(strategy=strategy, faults=True) 

1175 

1176 if compare_faults: 

1177 if val is not None: 

1178 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!') 

1179 else: 

1180 vals = self.load(strategy=strategy, faults=False)[key] 

1181 val = sum(vals) / len(vals) 

1182 

1183 if None in [key, val] and op not in ['isfinite']: 

1184 mask = dat['bit'] == dat['bit'] 

1185 else: 

1186 if op == 'uneq': 

1187 mask = dat[key] != val 

1188 elif op == 'eq': 

1189 mask = dat[key] == val 

1190 elif op == 'leq': 

1191 mask = dat[key] <= val 

1192 elif op == 'geq': 

1193 mask = dat[key] >= val 

1194 elif op == 'lt': 

1195 mask = dat[key] < val 

1196 elif op == 'gt': 

1197 mask = dat[key] > val 

1198 elif op == 'isfinite': 

1199 mask = np.isfinite(dat[key]) 

1200 else: 

1201 raise NotImplementedError(f'Please implement op={op}!') 

1202 

1203 if old_mask is not None: 

1204 return mask & old_mask 

1205 else: 

1206 return mask 

1207 

1208 def get_fixable_faults_only(self, strategy): 

1209 """ 

1210 Return a mask of only faults that can be fixed with a given strategy. 

1211 

1212 Args: 

1213 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1214 

1215 Returns: 

1216 Numpy.ndarray with boolean entries that can be used as a mask 

1217 """ 

1218 fixable = strategy.get_fixable_params( 

1219 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter'] 

1220 ) 

1221 mask = self.get_mask(strategy=strategy) 

1222 

1223 for kwargs in fixable: 

1224 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask) 

1225 

1226 return mask 

1227 

1228 def get_index(self, mask=None): 

1229 ''' 

1230 Get the indeces of all runs in mask 

1231 

1232 Args: 

1233 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1234 

1235 Returns: 

1236 Numpy.ndarray: Array of indeces 

1237 ''' 

1238 if mask is None: 

1239 dat = self.load() 

1240 return np.arange(len(dat['iteration'])) 

1241 else: 

1242 return np.arange(len(mask))[mask] 

1243 

1244 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover 

1245 ''' 

1246 Get information about how many data points for faults we have given a particular mask 

1247 

1248 Args: 

1249 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1250 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1251 strategies, so you don't need to supply this argument 

1252 print_all (bool): Whether to add information that is normally not useful to the table 

1253 ax (Matplotlib.axes): Somewhere to plot the combinations histogram 

1254 

1255 Returns: 

1256 None 

1257 ''' 

1258 

1259 # load some data from which to infer the number occurrences of some event 

1260 strategy = self.strategies[0] if strategy is None else strategy 

1261 dat = self.load(stratagy=strategy, faults=True) 

1262 

1263 # make a dummy mask in case none is supplied 

1264 if mask is None: 

1265 mask = np.ones_like(dat['error'], dtype=bool) 

1266 

1267 # print a header 

1268 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries') 

1269 print('-------------------------------------------------------------') 

1270 

1271 # make a list of all keys that you want to look at 

1272 keys = ['iteration', 'bit', 'node'] 

1273 if print_all: 

1274 keys += ['problem_pos', 'level', 'target'] 

1275 

1276 # print the table 

1277 for key in keys: 

1278 counts, dev, unique = self.count_occurrences(dat[key][mask]) 

1279 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}') 

1280 

1281 return None 

1282 

1283 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover 

1284 ''' 

1285 Make a histogram ouf of the occurrences of combinations 

1286 

1287 Args: 

1288 dat (dict): The data of the recorded statistics 

1289 keys (list): The keys in dat that you want to know the combinations of 

1290 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1291 

1292 Returns: 

1293 Matplotlib.axes: The plot 

1294 ''' 

1295 if ax is None: 

1296 fig, ax = plt.subplots(1, 1) 

1297 

1298 occurrences, bins = self.get_combination_histogram(dat, keys, mask) 

1299 

1300 ax.bar(bins[:-1], occurrences) 

1301 

1302 ax.set_xlabel('Occurrence of combinations') 

1303 

1304 return ax 

1305 

1306 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover 

1307 ''' 

1308 Check how many combinations of values we expect and how many we find to see if we need to do more experiments. 

1309 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied 

1310 

1311 Args: 

1312 dat (dict): The data of the recorded statistics 

1313 keys (list): The keys in dat that you want to know the combinations of 

1314 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1315 

1316 Returns: 

1317 Numpy.ndarray: Number of occurrences of combinations 

1318 Numpy.ndarray: Bins 

1319 ''' 

1320 

1321 # load default values 

1322 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat 

1323 keys = ['iteration', 'bit', 'node'] if keys is None else keys 

1324 if mask is None: 

1325 mask = np.ones_like(dat['error'], dtype=bool) 

1326 

1327 # get unique values and compute how many combinations you expect 

1328 num_unique = [len(np.unique(dat[key][mask])) for key in keys] 

1329 expected_number_of_combinations = np.prod(num_unique) 

1330 

1331 # test what you actually get 

1332 combination_counts = self.get_combination_counts(dat, keys, mask) 

1333 

1334 # make a histogram with the result 

1335 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1)) 

1336 occurrences[0] = expected_number_of_combinations - len(combination_counts) 

1337 

1338 return occurrences, bins 

1339 

1340 def get_max_combinations(self, dat=None, strategy=None): 

1341 ''' 

1342 Count how many combinations of parameters for faults are possible 

1343 

1344 Args: 

1345 dat (dict): The recorded statistics 

1346 keys (list): The keys in dat that you want to know the combinations of 

1347 strategy (Strategy): The resilience strategy 

1348 

1349 Returns: 

1350 int: Number of possible combinations 

1351 ''' 

1352 strategy = self.strategies[0] if strategy is None else strategy 

1353 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True) 

1354 faultHook = get_fault_injector_hook(controller) 

1355 ranges = [ 

1356 (0, faultHook.rnd_params['level_number']), 

1357 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1), 

1358 (1, faultHook.rnd_params['iteration'] + 1), 

1359 (0, faultHook.rnd_params['bit']), 

1360 (0, faultHook.rnd_params['rank']), 

1361 ] 

1362 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']] 

1363 return np.prod([me[1] - me[0] for me in ranges], dtype=int) 

1364 

1365 def get_combination_counts(self, dat, keys, mask): 

1366 ''' 

1367 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary 

1368 numbers of keys 

1369 

1370 Args: 

1371 dat (dict): The data of the recorded statistics 

1372 keys (list): The keys in dat that you want to know the combinations of 

1373 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1374 

1375 Returns: 

1376 list: Occurrences of all combinations 

1377 ''' 

1378 key = keys[0] 

1379 unique_vals = np.unique(dat[key][mask]) 

1380 res = [] 

1381 

1382 for i in range(len(unique_vals)): 

1383 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask) 

1384 if len(keys) > 1: 

1385 res += self.get_combination_counts(dat, keys[1:], inner_mask) 

1386 else: 

1387 res += [self.count_occurrences(dat[key][inner_mask])[0]] 

1388 return res 

1389 

1390 def count_occurrences(self, vals): 

1391 ''' 

1392 Count the occurrences of unique values in vals and compute average deviation from mean 

1393 

1394 Args: 

1395 vals (list): Values you want to check 

1396 

1397 Returns: 

1398 float: Mean of number of occurrences of unique values in vals 

1399 float: Average deviation from mean number of occurrences 

1400 int: Number of unique entries 

1401 ''' 

1402 unique_vals, counts = np.unique(vals, return_counts=True) 

1403 

1404 if len(counts) > 0: 

1405 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts) 

1406 else: 

1407 return None, None, 0 

1408 

1409 def bar_plot_thing( 

1410 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None 

1411 ): # pragma: no cover 

1412 ''' 

1413 Make a bar plot about something! 

1414 

1415 Args: 

1416 x (Numpy.ndarray of dimension 1): x values for bar plot 

1417 thing (str): Some key stored in the stats that will go on the y-axis 

1418 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting 

1419 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

1420 faults (bool): Whether to load stats with faults or without 

1421 name (str): Optional name for the plot 

1422 op (function): Operation that is applied to thing before plotting default is recovery rate 

1423 args (dict): Parameters for how the plot should look 

1424 

1425 Returns: 

1426 None 

1427 ''' 

1428 if ax is None: 

1429 fig, ax = plt.subplots(1, 1) 

1430 store = True 

1431 op = self.mean if op is None else op 

1432 

1433 # get the values for the bars 

1434 height = np.zeros(len(self.strategies)) 

1435 for strategy_idx in range(len(self.strategies)): 

1436 strategy = self.strategies[strategy_idx] 

1437 

1438 # load the values 

1439 dat = self.load(strategy=strategy, faults=faults) 

1440 no_faults = self.load(strategy=strategy, faults=False) 

1441 

1442 # check if we have a mask 

1443 if mask is None: 

1444 mask = np.ones_like(dat[thing], dtype=bool) 

1445 

1446 height[strategy_idx] = op(dat, no_faults, thing, mask) 

1447 

1448 # get some x values 

1449 x = np.arange(len(self.strategies)) if x is None else x 

1450 

1451 # prepare labels 

1452 ticks = [strategy.bar_plot_x_label for strategy in self.strategies] 

1453 

1454 ax.bar(x, height, tick_label=ticks) 

1455 

1456 # set the parameters 

1457 ax.set_ylabel(thing) 

1458 args = {} if args is None else args 

1459 [plt.setp(ax, k, v) for k, v in args.items()] 

1460 

1461 if store: 

1462 fig.tight_layout() 

1463 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True) 

1464 plt.close(fig) 

1465 

1466 return None 

1467 

1468 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover 

1469 func_args = locals() 

1470 func_args.pop('self', None) 

1471 if strategy is None: 

1472 for strat in self.strategies: 

1473 args = {**func_args, 'strategy': strat} 

1474 self.fault_frequency_plot(**args) 

1475 return None 

1476 

1477 # load data 

1478 all_data = {} 

1479 for me in kwargs_range['fault_frequency_iter']: 

1480 self.kwargs['fault_frequency_iter'] = me 

1481 self.get_recovered() 

1482 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular') 

1483 

1484 # get_recovery_rate 

1485 results = {} 

1486 results['frequencies'] = list(all_data.keys()) 

1487 results['recovery_rate'] = [ 

1488 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered']) 

1489 for key in all_data.keys() 

1490 ] 

1491 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()] 

1492 results['iterations'] = [ 

1493 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys() 

1494 ] 

1495 

1496 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style) 

1497 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'}) 

1498 

1499 ax.set_xscale('log') 

1500 ax.set_xlabel('iterations between fault') 

1501 ax.set_ylabel('recovery rate') 

1502 iter_ax.set_ylabel('average total iterations if not crashed (dashed)') 

1503 ax.legend(frameon=False) 

1504 

1505 return None 

1506 

1507 def get_HR_tol(self, verbose=False): 

1508 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

1509 

1510 HR_strategy = HotRodStrategy(useMPI=self.use_MPI) 

1511 

1512 description = HR_strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

1513 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2 

1514 

1515 stats, _, _ = self.single_run(HR_strategy, force_params=description) 

1516 

1517 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate') 

1518 diff = [] 

1519 for i in range(len(e_extrapolation)): 

1520 if e_extrapolation[i][1] is not None: 

1521 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0]) 

1522 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])] 

1523 

1524 max_diff = max(diff) 

1525 proposed_tol = 2 * max_diff 

1526 if verbose: 

1527 print( 

1528 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.' 

1529 ) 

1530 else: 

1531 print(f'{proposed_tol:.6e}') 

1532 

1533 

def check_local_error():  # pragma: no cover
    """
    Make a plot of the resolution over time for all problems
    """
    # removed a dead assignment that was immediately overwritten; extend this list
    # (e.g. [run_vdp, run_Lorenz, run_Schroedinger, run_quench]) to inspect more problems
    problems = [run_quench]
    strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]

    for problem in problems:
        stats_analyser = FaultStats(
            prob=problem,
            strategies=strategies,
            faults=[False],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problem, 0),
            num_procs=1,
            mode='random',
        )
        stats_analyser.compare_strategies()
    plt.show()

1555 

1556 

def parse_args():
    """
    Parse a few known options from the command line into keyword arguments.

    Returns:
        dict: Keyword arguments recognized on the command line
    """
    import sys

    kwargs = {}

    # mapping from command line value to problem function
    available_probs = {
        'run_Lorenz': run_Lorenz,
        'run_vdp': run_vdp,
        'run_Schroedinger': run_Schroedinger,
        'run_quench': run_quench,
        'run_AC': run_AC,
        'run_RBC': run_RBC,
        'run_GS': run_GS,
    }

    # flags are matched by substring; the value is always the following argument
    for i in range(len(sys.argv)):
        arg = sys.argv[i]
        if 'prob' in arg:
            prob_name = sys.argv[i + 1]
            if prob_name not in available_probs:
                raise NotImplementedError
            kwargs['prob'] = available_probs[prob_name]
        elif 'num_procs' in arg:
            kwargs['num_procs'] = int(sys.argv[i + 1])
        elif 'runs' in arg:
            kwargs['runs'] = int(sys.argv[i + 1])
        elif 'mode' in arg:
            kwargs['mode'] = str(sys.argv[i + 1])
        elif 'reload' in arg:
            kwargs['reload'] = sys.argv[i + 1] != 'False'

    return kwargs

1590 

1591 

def compare_adaptivity_modes():
    """
    Compare recovery rates of adaptivity with different restart modes.

    Makes a 2x2 grid of plots, one per problem, each showing the recovery rate
    per flipped bit for fixable faults: first for a single process, then for
    4 processes with two restart variants of adaptivity. Shows the figure.
    """
    from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep

    fig, axs = plt.subplots(2, 2, sharey=True)
    problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
    titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']

    for i in range(len(problems)):
        ax = axs.flatten()[i]

        ax.set_title(titles[i])

        # single-process reference run with plain adaptivity
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=1,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            # only plot faults the strategy can fix in principle
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate', 'label': 'bla'},
                name='fixable_recovery',
                ax=ax,
            )
        # restyle the line that was just added as the single-process reference
        single_proc = ax.lines[-1]
        single_proc.set_label('single process')
        single_proc.set_color('black')

        # 4-process comparison of the two restart modes (note the looser threshold)
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.5,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=4,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate'},
                name='fixable_recovery',
                ax=ax,
            )
    fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
    fig.tight_layout()
    plt.show()

1663 

1664 

def main():
    """
    Generate fault statistics for the problem selected on the command line and
    produce the standard set of recovery-rate and iteration-count plots.

    Stats generation may run on multiple MPI ranks; only rank 0 does the
    plotting. Figures are saved under data/ and finally shown.
    """
    # defaults, overridden by anything supplied on the command line
    kwargs = {
        'prob': run_vdp,
        'num_procs': 1,
        'mode': 'default',
        'runs': 4000,
        'reload': True,
        **parse_args(),
    }

    strategy_args = {
        'stop_at_nan': True,
    }

    from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy

    stats_analyser = FaultStats(
        strategies=[
            BaseStrategy(**strategy_args),
            AdaptivityStrategy(**strategy_args),
            kAdaptivityStrategy(**strategy_args),
            HotRodStrategy(**strategy_args),
            AdaptivityPolynomialError(**strategy_args),
        ],
        faults=[False, True],
        recovery_thresh=1.1,
        recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
        stats_path='data/stats-jusuf',
        **kwargs,
    )
    stats_analyser.get_recovered()

    stats_analyser.run_stats_generation(runs=kwargs['runs'], step=8)

    if MPI.COMM_WORLD.rank > 0:  # make sure only one rank accesses the data
        return None

    stats_analyser.get_recovered()
    mask = None

    # recovery rate per node, iteration and bit across all strategies
    # stats_analyser.compare_strategies()
    stats_analyser.plot_things_per_things(
        'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )
    stats_analyser.plot_things_per_things(
        'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    stats_analyser.plot_things_per_things(
        'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    # make a plot for only the faults that can be recovered
    fig, ax = plt.subplots(1, 1)
    for strategy in stats_analyser.strategies:
        fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
        stats_analyser.plot_things_per_things(
            'recovered',
            'bit',
            False,
            strategies=[strategy],
            op=stats_analyser.rec_rate,
            mask=fixable,
            args={'ylabel': 'recovery rate'},
            name='fixable_recovery',
            ax=ax,
        )
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)

    # recovery rate as a function of the acceptance threshold (log-spaced scan)
    fig, ax = plt.subplots(1, 1, figsize=(13, 4))
    stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
    # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
    ax.set_xscale('log')
    ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)

    # iteration counts and error per flipped bit
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.mean,
        mask=mask,
        args={'yscale': 'log', 'ylabel': 'total iterations'},
    )
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.extra_mean,
        mask=mask,
        args={'yscale': 'linear', 'ylabel': 'extra iterations'},
        name='extra_iter',
    )
    stats_analyser.plot_things_per_things(
        'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
    )

    # narrower, linearly spaced threshold scan in a separate figure
    stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
    plt.show()

1766 

1767 

if __name__ == "__main__":
    # entry point; swap in one of the alternatives below to run a different analysis
    main()
    # compare_adaptivity_modes()
    # check_local_error()