Coverage for pySDC/projects/Resilience/fault_stats.py: 56%

451 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-01 13:12 +0000

1import numpy as np 

2import matplotlib.pyplot as plt 

3from mpi4py import MPI 

4import pickle 

5 

6import pySDC.helpers.plot_helper as plot_helper 

7from pySDC.helpers.stats_helper import get_sorted 

8 

9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData 

10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook 

11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity 

13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep 

14from pySDC.implementations.hooks.log_work import LogWork 

15 

16# these problems are available for testing 

17from pySDC.projects.Resilience.advection import run_advection 

18from pySDC.projects.Resilience.vdp import run_vdp 

19from pySDC.projects.Resilience.piline import run_piline 

20from pySDC.projects.Resilience.Lorenz import run_Lorenz 

21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger 

22from pySDC.projects.Resilience.quench import run_quench 

23from pySDC.projects.Resilience.AC import run_AC 

24from pySDC.projects.Resilience.RBC import run_RBC 

25from pySDC.projects.Resilience.GS import run_GS 

26 

27from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy 

28import logging 

29 

30plot_helper.setup_mpl(reset=True) 

31 

32LOGGER_LEVEL = 40 

33 

34RECOVERY_THRESH_ABS = { 

35 # run_quench: 5e-3, 

36 # run_Schroedinger: 1.7e-6, 

37} 

38 

39 

40class FaultStats: 

41 ''' 

42 Class to generate and analyse fault statistics 

43 ''' 

44 

45 def __init__( 

46 self, 

47 prob=None, 

48 strategies=None, 

49 faults=None, 

50 reload=True, 

51 recovery_thresh=1 + 1e-3, 

52 recovery_thresh_abs=0.0, 

53 num_procs=1, 

54 mode='default', 

55 stats_path='data/stats', 

56 use_MPI=False, 

57 **kwargs, 

58 ): 

59 ''' 

60 Initialization routine 

61 

62 Args: 

63 prob: A function that runs a pySDC problem, see imports for available problems 

64 strategies (list): List of resilience strategies 

65 faults (list): List of booleans that describe whether to use faults or not 

66 reload (bool): Load previously computed statistics and continue from there or start from scratch 

67 recovery_thresh (float): Relative threshold for recovery 

68 num_procs (int): Number of processes 

69 mode (str): Mode for fault generation: Either 'random' or 'combination' 

70 ''' 

71 self.prob = prob 

72 self.strategies = [None] if strategies is None else strategies 

73 self.faults = [False, True] if faults is None else faults 

74 self.reload = reload 

75 self.recovery_thresh = recovery_thresh 

76 self.recovery_thresh_abs = recovery_thresh_abs 

77 self.num_procs = num_procs 

78 self.use_MPI = use_MPI 

79 self.stats_path = stats_path 

80 self.kwargs = { 

81 'fault_frequency_iter': 500, 

82 **kwargs, 

83 } 

84 

85 # decide mode 

86 if mode == 'default': 

87 if prob.__name__ in ['run_vdp', 'run_Lorenz']: 

88 self.mode = 'combination' 

89 else: 

90 self.mode = 'random' 

91 else: 

92 self.mode = mode 

93 

94 self.logger = logging.getLogger('FaultStats') 

95 self.logger.level = LOGGER_LEVEL 

96 

97 msg = 'Starting FaultStats with attributes' 

98 for key, val in self.__dict__.items(): 

99 if key in ['logger']: 

100 continue 

101 item = [str(me) for me in val] if type(val) == list else str(val) 

102 msg += '\n\t' f'{key}: {item}' 

103 self.logger.log(25, msg) 

104 

105 def get_Tend(self): 

106 ''' 

107 Get the final time of runs for fault stats based on the problem 

108 

109 Returns: 

110 float: Tend to put into the run 

111 ''' 

112 return self.strategies[0].get_Tend(self.prob, self.num_procs, resilience_experiment=True) 

113 

114 def run_stats_generation( 

115 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0 

116 ): 

117 ''' 

118 Run the generation of stats for all strategies in the `self.strategies` variable 

119 

120 Args: 

121 runs (int): Number of runs you want to do 

122 step (int): Number of runs you want to do between saving 

123 comm (MPI.Communicator): Communicator for distributing runs 

124 faults (bool): Whether to do stats with faults or without 

125 kw_args_range (dict): Range for the parameters 

126 _reload, _runs_partial: Variables only used for recursion. Do not change! 

127 

128 Returns: 

129 None 

130 ''' 

131 if faults is None: 

132 for f in self.faults: 

133 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f) 

134 return None 

135 

136 for key, val in kwargs_range.items() if kwargs_range is not None else {}: 

137 if type(val) == int: 

138 self.kwargs[key] = val 

139 else: 

140 for me in val: 

141 kwargs_range_me = {**kwargs_range, key: me} 

142 self.run_stats_generation( 

143 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults 

144 ) 

145 return None 

146 

147 comm = MPI.COMM_WORLD if comm is None else comm 

148 step = (runs if comm.size == 1 else comm.size) if step is None else step 

149 _runs_partial = step if _runs_partial == 0 else _runs_partial 

150 reload = self.reload or _reload 

151 

152 # see if we limit the number of runs we want to do 

153 max_runs = ( 

154 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5) 

155 ) 

156 

157 if reload: 

158 # sort the strategies to do some load balancing 

159 sorting_index = None 

160 if comm.rank == 0: 

161 already_completed = np.array( 

162 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies] 

163 ) 

164 sorting_index_ = np.argsort(already_completed) 

165 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs] 

166 _runs_partial = max(min(already_completed), _runs_partial) 

167 

168 # tell all ranks what strategies to use 

169 sorting_index = comm.bcast(sorting_index, root=0) 

170 _runs_partial = comm.bcast(_runs_partial, root=0) 

171 strategies = [self.strategies[i] for i in sorting_index] 

172 if len(strategies) == 0: # check if we are already done 

173 return None 

174 else: 

175 strategies = self.strategies 

176 

177 strategy_comm = comm.Split(comm.rank % len(strategies)) 

178 

179 for j in range(0, len(strategies), comm.size): 

180 self.generate_stats( 

181 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))], 

182 runs=min(_runs_partial, max_runs), 

183 faults=faults, 

184 reload=reload, 

185 comm=strategy_comm, 

186 ) 

187 strategy_comm.Free() 

188 self.run_stats_generation( 

189 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step 

190 ) 

191 

192 return None 

193 

194 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None): 

195 ''' 

196 Generate statistics for recovery from bit flips 

197 ----------------------------------------------- 

198 

199 Every run is given a different random seed such that we have different faults and the results are then stored 

200 

201 Args: 

202 strategy (Strategy): Resilience strategy 

203 runs (int): Number of runs you want to do 

204 reload (bool): Load previously computed statistics and continue from there or start from scratch 

205 faults (bool): Whether to do stats with faults or without 

206 comm (MPI.Communicator): Communicator for distributing runs 

207 

208 Returns: 

209 None 

210 ''' 

211 comm = MPI.COMM_WORLD if comm is None else comm 

212 

213 # initialize dictionary to store the stats in 

214 dat = { 

215 'level': np.zeros(runs), 

216 'iteration': np.zeros(runs), 

217 'node': np.zeros(runs), 

218 'problem_pos': [], 

219 'bit': np.zeros(runs), 

220 'error': np.zeros(runs), 

221 'total_iteration': np.zeros(runs), 

222 'total_newton_iteration': np.zeros(runs), 

223 'restarts': np.zeros(runs), 

224 'target': np.zeros(runs), 

225 'rank': np.zeros(runs), 

226 } 

227 

228 # store arguments for storing and loading 

229 identifier_args = { 

230 'faults': faults, 

231 'strategy': strategy, 

232 } 

233 

234 # reload previously recorded stats and write them to dat 

235 if reload: 

236 already_completed_ = None 

237 if comm.rank == 0: 

238 already_completed_ = self.load(**identifier_args) 

239 already_completed = comm.bcast(already_completed_, root=0) 

240 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0: 

241 avail_len = min([already_completed['runs'], runs]) 

242 for k in dat.keys(): 

243 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len) 

244 else: 

245 already_completed = {'runs': 0} 

246 

247 # prepare a message 

248 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0) 

249 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}' 

250 if comm.rank == 0 and already_completed['runs'] < runs: 

251 print(msg, flush=True) 

252 

253 space_comm = MPI.COMM_SELF.Split(True) 

254 

255 # perform the remaining experiments 

256 for i in range(already_completed['runs'], runs): 

257 if i % comm.size != comm.rank: 

258 continue 

259 

260 # perform a single experiment with the correct random seed 

261 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm) 

262 

263 # get the data from the stats 

264 faults_run = get_sorted(stats, type='bitflip') 

265 

266 if faults: 

267 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!" 

268 dat['level'][i] = faults_run[0][1][0] 

269 dat['iteration'][i] = faults_run[0][1][1] 

270 dat['node'][i] = faults_run[0][1][2] 

271 dat['problem_pos'] += [faults_run[0][1][3]] 

272 dat['bit'][i] = faults_run[0][1][4] 

273 dat['target'][i] = faults_run[0][1][5] 

274 dat['rank'][i] = faults_run[0][1][6] 

275 

276 if crash: 

277 print('Code crashed!') 

278 dat['error'][i] = np.inf 

279 continue 

280 

281 # record the rest of the data 

282 t, u = get_sorted(stats, type='u', recomputed=False)[-1] 

283 error = self.get_error(u, t, controller, strategy, self.get_Tend()) 

284 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')]) 

285 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')]) 

286 

287 dat['error'][i] = error 

288 dat['total_iteration'][i] = total_iteration 

289 dat['total_newton_iteration'][i] = total_newton_iteration 

290 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')]) 

291 

292 # free the space communicator 

293 space_comm.Free() 

294 

295 dat_full = {} 

296 for k in dat.keys(): 

297 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM) 

298 

299 # store the completed stats 

300 dat_full['runs'] = runs 

301 

302 if already_completed['runs'] < runs: 

303 if comm.rank == 0: 

304 self.store(dat_full, **identifier_args) 

305 if self.faults: 

306 self.get_recovered(strategy=strategy) 

307 

308 return None 

309 

310 def get_error(self, u, t, controller, strategy, Tend): 

311 """ 

312 Compute the error. 

313 

314 Args: 

315 u (dtype_u): The solution at the end of the run 

316 t (float): Time at which `u` was recorded 

317 controller (pySDC.controller.controller): The controller 

318 strategy (Strategy): The resilience strategy 

319 Tend (float): Time you want to run to 

320 

321 Returns: 

322 float: Error 

323 """ 

324 lvl = controller.MS[0].levels[0] 

325 

326 # decide if we reached Tend 

327 Tend_reached = t + lvl.params.dt_initial >= Tend 

328 

329 if Tend_reached: 

330 u_star = lvl.prob.u_exact(t=t) 

331 return abs(u - u_star) / abs(u_star) 

332 else: 

333 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}') 

334 return np.inf 

335 

336 def single_run( 

337 self, 

338 strategy, 

339 run=0, 

340 faults=False, 

341 force_params=None, 

342 hook_class=None, 

343 space_comm=None, 

344 Tend=None, 

345 time_comm=None, 

346 ): 

347 ''' 

348 Run the problem once with the specified parameters 

349 

350 Args: 

351 strategy (Strategy): The resilience strategy you plan on using 

352 run (int): Index for fault generation 

353 faults (bool): Whether or not to put faults in 

354 force_params (dict): Change parameters in the description of the problem 

355 space_comm (MPI.Communicator): A communicator for space parallelisation 

356 Tend (float): Time you want to run to 

357 time_comm (MPI.Intracomm.Communicator): Communicator in time 

358 

359 Returns: 

360 dict: Stats object containing statistics for each step, each level and each iteration 

361 pySDC.Controller: The controller of the run 

362 float: The time the problem should have run to 

363 ''' 

364 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class) 

365 force_params = {} if force_params is None else force_params 

366 

367 # build the custom description 

368 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

369 for k in force_params.keys(): 

370 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]} 

371 

372 custom_controller_params = { 

373 'logger_level': LOGGER_LEVEL, 

374 **force_params.get('controller_params', {}), 

375 } 

376 

377 if faults: 

378 fault_stuff = { 

379 'rng': None, 

380 'args': strategy.get_fault_args(self.prob, self.num_procs), 

381 'rnd_params': strategy.get_random_params(self.prob, self.num_procs), 

382 } 

383 

384 # make parameters for faults: 

385 if self.mode == 'random': 

386 fault_stuff['rng'] = np.random.RandomState(run) 

387 elif self.mode == 'combination': 

388 fault_stuff['rng'] = run 

389 elif self.mode == 'regular': 

390 fault_stuff['rng'] = np.random.RandomState(run) 

391 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter'] 

392 fault_stuff['rnd_params'] = { 

393 'bit': 12, 

394 'min_node': 1, 

395 } 

396 else: 

397 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}') 

398 

399 else: 

400 fault_stuff = None 

401 

402 return self.prob( 

403 custom_description=custom_description, 

404 num_procs=self.num_procs, 

405 hook_class=hook_class, 

406 fault_stuff=fault_stuff, 

407 Tend=self.get_Tend() if Tend is None else Tend, 

408 custom_controller_params=custom_controller_params, 

409 space_comm=space_comm, 

410 comm=time_comm, 

411 use_MPI=time_comm is not None, 

412 ) 

413 

414 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover 

415 ''' 

416 Take a closer look at how the strategies compare for a specific run 

417 

418 Args: 

419 run (int): The number of the run to get the appropriate random generator 

420 faults (bool): Whether or not to include faults 

421 ax (Matplotlib.axes): Somewhere to plot 

422 

423 Returns: 

424 None 

425 ''' 

426 if ax is None: 

427 fig, ax = plt.subplots(1, 1) 

428 store = True 

429 else: 

430 store = False 

431 

432 k_ax = ax.twinx() 

433 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies] 

434 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))] 

435 

436 # make a legend 

437 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies] 

438 k_ax.legend(frameon=True) 

439 

440 if store: 

441 fig.tight_layout() 

442 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True) 

443 

444 def scrutinize_visual( 

445 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False 

446 ): # pragma: no cover 

447 ''' 

448 Take a closer look at a specific run with a plot 

449 

450 Args: 

451 strategy (Strategy): The resilience strategy you plan on using 

452 run (int): The number of the run to get the appropriate random generator 

453 faults (bool): Whether or not to include faults 

454 ax (Matplotlib.axes): Somewhere to plot the error 

455 k_ax (Matplotlib.axes): Somewhere to plot the iterations 

456 plot_restarts (bool): Make vertical lines wherever restarts happened 

457 

458 Returns: 

459 dict: Stats from the run 

460 ''' 

461 if ax is None: 

462 fig, ax = plt.subplots(1, 1) 

463 store = True 

464 else: 

465 store = False 

466 

467 force_params = {} 

468 

469 stats, controller, Tend = self.single_run( 

470 strategy=strategy, 

471 run=run, 

472 faults=faults, 

473 force_params=force_params, 

474 hook_class=hook_collection + [LogLocalErrorPostStep, LogData], 

475 ) 

476 

477 # plot the local error 

478 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False) 

479 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls) 

480 

481 # plot the iterations 

482 k_ax = ax.twinx() if k_ax is None else k_ax 

483 k = get_sorted(stats, type='k') 

484 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--') 

485 

486 # plot the faults 

487 faults = get_sorted(stats, type='bitflip') 

488 for fault_time in [me[0] for me in faults]: 

489 ax.axvline(fault_time, color='grey', ls=':') 

490 

491 # plot restarts 

492 if plot_restarts: 

493 restarts = get_sorted(stats, type='restart') 

494 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts] 

495 

496 # decorate 

497 ax.set_yscale('log') 

498 ax.set_ylabel(r'$\epsilon$') 

499 k_ax.set_ylabel('cumulative iterations (dashed)') 

500 ax.set_xlabel(r'$t$') 

501 

502 if store: 

503 fig.tight_layout() 

504 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True) 

505 

506 def scrutinize(self, strategy, logger_level=15, **kwargs): 

507 ''' 

508 Take a closer look at a specific run 

509 

510 Args: 

511 strategy (Strategy): The resilience strategy you plan on using 

512 

513 Returns: 

514 None 

515 ''' 

516 from pySDC.projects.Resilience.fault_injection import FaultInjector 

517 

518 force_params = {} 

519 force_params['controller_params'] = {'logger_level': logger_level} 

520 force_params['sweeper_params'] = {'skip_residual_computation': ()} 

521 

522 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs) 

523 

524 u_all = get_sorted(stats, type='u', recomputed=False) 

525 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])] 

526 k = [me[1] for me in get_sorted(stats, type='k')] 

527 

528 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector] 

529 

530 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else [] 

531 

532 print(f'\nOverview for {strategy.name} strategy') 

533 # print faults 

534 faults = get_sorted(stats, type='bitflip') 

535 print('\nfaults:') 

536 print(' t | level | iter | node | bit | trgt | rank | pos') 

537 print('--------+-------+------+------+-----+------+---------------------') 

538 for f in faults: 

539 print( 

540 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |', 

541 f[1][3], 

542 ) 

543 print('--------+-------+------+------+-----+------+---------------------') 

544 for f in unhappened_faults: 

545 print( 

546 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |', 

547 f.problem_pos, 

548 ) 

549 

550 # see if we can determine if the faults where recovered 

551 no_faults = self.load(strategy=strategy, faults=False) 

552 e_star = np.mean(no_faults.get('error', [0])) 

553 error = self.get_error(u, t, controller, strategy, Tend) 

554 recovery_thresh = self.get_thresh(strategy) 

555 

556 print( 

557 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}' 

558 ) 

559 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},') 

560 

561 _newton_iter = get_sorted(stats, type='work_newton') 

562 if len(_newton_iter) > 0: 

563 newton_iter = [me[1] for me in _newton_iter] 

564 print( 

565 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},' 

566 ) 

567 

568 # checkout the step size 

569 dt = [me[1] for me in get_sorted(stats, type='dt')] 

570 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}') 

571 

572 # restarts 

573 restarts = [me[1] for me in get_sorted(stats, type='restart')] 

574 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}') 

575 

576 return stats 

577 

578 def convert_faults(self, faults): 

579 ''' 

580 Make arrays of useable data from an entry in the stats object returned by pySDC 

581 

582 Args: 

583 faults (list): The entry for faults returned by the pySDC run 

584 

585 Returns: 

586 list: The times when the faults happened 

587 list: The levels in which the faults happened 

588 list: The iterations in which the faults happened 

589 list: The nodes in which the faults happened 

590 list: The problem positions in which the faults happened 

591 list: The bits in which the faults happened 

592 ''' 

593 time = [faults[i][0] for i in range(len(faults))] 

594 level = [faults[i][1][0] for i in range(len(faults))] 

595 iteration = [faults[i][1][1] for i in range(len(faults))] 

596 node = [faults[i][1][2] for i in range(len(faults))] 

597 problem_pos = [faults[i][1][3] for i in range(len(faults))] 

598 bit = [faults[i][1][4] for i in range(len(faults))] 

599 return time, level, iteration, node, problem_pos, bit 

600 

601 def get_path(self, **kwargs): 

602 ''' 

603 Get the path to where the stats are stored 

604 

605 Args: 

606 strategy (Strategy): The resilience strategy 

607 faults (bool): Whether or not faults have been activated 

608 

609 Returns: 

610 str: The path to what you are looking for 

611 ''' 

612 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle' 

613 

614 def get_name(self, strategy=None, faults=True, mode=None): 

615 ''' 

616 Function to get a unique name for a kind of statistics based on the problem and strategy that was used 

617 

618 Args: 

619 strategy (Strategy): Resilience strategy 

620 faults (bool): Whether or not faults where inserted 

621 

622 Returns: 

623 str: The unique identifier 

624 ''' 

625 if self.prob == run_advection: 

626 prob_name = 'advection' 

627 elif self.prob == run_vdp: 

628 prob_name = 'vdp' 

629 elif self.prob == run_piline: 

630 prob_name = 'piline' 

631 elif self.prob == run_Lorenz: 

632 prob_name = 'Lorenz' 

633 elif self.prob == run_Schroedinger: 

634 prob_name = 'Schroedinger' 

635 elif self.prob == run_quench: 

636 prob_name = 'Quench' 

637 elif self.prob.__name__ == 'run_AC': 

638 prob_name = 'Allen-Cahn' 

639 elif self.prob.__name__ == 'run_RBC': 

640 prob_name = 'Rayleigh-Benard' 

641 elif self.prob.__name__ == 'run_GS': 

642 prob_name = 'Gray-Scott' 

643 else: 

644 raise NotImplementedError(f'Name not implemented for problem {self.prob}') 

645 

646 if faults: 

647 fault_name = '-faults' 

648 else: 

649 fault_name = '' 

650 

651 if strategy is not None: 

652 strategy_name = f'-{strategy.name}' 

653 else: 

654 strategy_name = '' 

655 

656 mode = self.mode if mode is None else mode 

657 if mode == 'regular': 

658 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}' 

659 else: 

660 mode_thing = '' 

661 

662 mpi_thing = '-MPI' if self.use_MPI else '' 

663 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}' 

664 

665 def store(self, dat, **kwargs): 

666 ''' 

667 Stores the data for a run at a predefined path 

668 

669 Args: 

670 dat (dict): The data of the recorded statistics 

671 

672 Returns: 

673 None 

674 ''' 

675 with open(self.get_path(**kwargs), 'wb') as f: 

676 pickle.dump(dat, f) 

677 return None 

678 

679 def load(self, **kwargs): 

680 ''' 

681 Loads the stats belonging to a specific strategy and whether or not faults where inserted. 

682 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs, 

683 which is 0 of course. 

684 

685 Returns: 

686 dict: Data from previous run or if it is not available a placeholder dictionary 

687 ''' 

688 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)]) 

689 

690 try: 

691 with open(self.get_path(**kwargs), 'rb') as f: 

692 dat = pickle.load(f) 

693 except FileNotFoundError: 

694 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!') 

695 return {'runs': 0} 

696 return dat 

697 

698 def get_thresh(self, strategy=None): 

699 """ 

700 Get recovery threshold based on relative and absolute tolerances 

701 

702 Args: 

703 strategy (Strategy): The resilience strategy 

704 """ 

705 fault_free = self.load(strategy=strategy, faults=False) 

706 assert ( 

707 fault_free['error'].std() / fault_free['error'].mean() < 1e-5 

708 ), f'Got too large variation of errors in {strategy} strategy!' 

709 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()]) 

710 

711 def get_recovered(self, **kwargs): 

712 ''' 

713 Determine the recovery rate for a specific strategy and store it to disk. 

714 

715 Returns: 

716 None 

717 ''' 

718 if 'strategy' not in kwargs.keys(): 

719 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies] 

720 else: 

721 try: 

722 with_faults = self.load(faults=True, **kwargs) 

723 with_faults['recovered'] = with_faults['error'] <= self.get_thresh(kwargs['strategy']) 

724 self.store(faults=True, dat=with_faults, **kwargs) 

725 except KeyError as error: 

726 print( 

727 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}' 

728 ) 

729 

730 return None 

731 

732 def crash_rate(self, dat, no_faults, thingA, mask): 

733 ''' 

734 Determine the rate of runs that crashed 

735 

736 Args: 

737 dat (dict): The data of the recorded statistics with faults 

738 no_faults (dict): The data of the corresponding fault-free stats 

739 thingA (str): Some key stored in the stats that will go on the y-axis 

740 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate 

741 

742 Returns: 

743 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask 

744 ''' 

745 if len(dat[thingA][mask]) > 0: 

746 crash = dat['error'] == np.inf 

747 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask]) 

748 else: 

749 return None 

750 

751 def rec_rate(self, dat, no_faults, thingA, mask): 

752 ''' 

753 Operation for plotting which returns the recovery rate for a given mask. 

754 Which thingA you apply this to actually does not matter here since we compute a rate. 

755 

756 Args: 

757 dat (dict): The recorded statistics 

758 no_faults (dict): The corresponding fault-free stats 

759 thingA (str): Some key stored in the stats 

760 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

761 

762 Returns: 

763 float: Recovery rate 

764 ''' 

765 if len(dat[thingA][mask]) > 0: 

766 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask]) 

767 else: 

768 return None 

769 

770 def mean(self, dat, no_faults, thingA, mask): 

771 ''' 

772 Operation for plotting which returns the mean of thingA after applying the mask 

773 

774 Args: 

775 dat (dict): The recorded statistics 

776 no_faults (dict): The corresponding fault-free stats 

777 thingA (str): Some key stored in the stats 

778 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

779 

780 Returns: 

781 float: Mean of thingA after applying mask 

782 ''' 

783 return np.mean(dat[thingA][mask]) 

784 

785 def extra_mean(self, dat, no_faults, thingA, mask): 

786 ''' 

787 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after 

788 applying the mask 

789 

790 Args: 

791 dat (dict): The recorded statistics 

792 no_faults (dict): The corresponding fault-free stats 

793 thingA (str): Some key stored in the stats 

794 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

795 

796 Returns: 

797 float: Difference in mean of thingA between runs with and without faults after applying mask 

798 ''' 

799 if True in mask or int in [type(me) for me in mask]: 

800 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA]) 

801 else: 

802 return None 

803 

804 def plot_thingA_per_thingB( 

805 self, 

806 strategy, 

807 thingA, 

808 thingB, 

809 ax=None, 

810 mask=None, 

811 recovered=False, 

812 op=None, 

813 **kwargs, 

814 ): # pragma: no cover 

815 ''' 

816 Plot thingA vs. thingB for a single strategy 

817 

818 Args: 

819 strategy (Strategy): The resilience strategy you want to plot 

820 thingA (str): Some key stored in the stats that will go on the y-axis 

821 thingB (str): Some key stored in the stats that will go on the x-axis 

822 ax (Matplotlib.axes): Somewhere to plot 

823 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

824 recovered (bool): Show the plot for both all runs and only the recovered ones 

825 op (function): Operation that is applied to thingA before plotting default is recovery rate 

826 

827 Returns: 

828 None 

829 ''' 

830 op = self.rec_rate if op is None else op 

831 dat = self.load(strategy=strategy, faults=True) 

832 no_faults = self.load(strategy=strategy, faults=False) 

833 

834 if mask is None: 

835 mask = np.ones_like(dat[thingB], dtype=bool) 

836 

837 admissable_thingB = np.unique(dat[thingB][mask]) 

838 me = np.zeros(len(admissable_thingB)) 

839 me_recovered = np.zeros_like(me) 

840 

841 for i in range(len(me)): 

842 _mask = np.logical_and((dat[thingB] == admissable_thingB[i]), mask) 

843 if _mask.any(): 

844 me[i] = op(dat, no_faults, thingA, _mask) 

845 me_recovered[i] = op(dat, no_faults, thingA, np.logical_and(_mask, dat['recovered'])) 

846 

847 if recovered: 

848 ax.plot( 

849 admissable_thingB, 

850 me_recovered, 

851 label=f'{strategy.label} (only recovered)', 

852 color=strategy.color, 

853 marker=strategy.marker, 

854 ls='--', 

855 linewidth=3, 

856 **kwargs, 

857 ) 

858 

859 ax.plot( 

860 admissable_thingB, 

861 me, 

862 label=f'{strategy.label}', 

863 color=strategy.color, 

864 marker=strategy.marker, 

865 linewidth=2, 

866 **kwargs, 

867 ) 

868 

869 ax.legend(frameon=False) 

870 ax.set_xlabel(thingB) 

871 ax.set_ylabel(thingA) 

872 return None 

873 

874 def plot_things_per_things( 

875 self, 

876 thingA='bit', 

877 thingB='bit', 

878 recovered=False, 

879 mask=None, 

880 op=None, 

881 args=None, 

882 strategies=None, 

883 name=None, 

884 store=False, 

885 ax=None, 

886 fig=None, 

887 plotting_args=None, 

888 ): # pragma: no cover 

889 ''' 

890 Plot thingA vs thingB for multiple strategies 

891 

892 Args: 

893 thingA (str): Some key stored in the stats that will go on the y-axis 

894 thingB (str): Some key stored in the stats that will go on the x-axis 

895 recovered (bool): Show the plot for both all runs and only the recovered ones 

896 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

897 op (function): Operation that is applied to thingA before plotting default is recovery rate 

898 args (dict): Parameters for how the plot should look 

899 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

900 name (str): Optional name for the plot 

901 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

902 ax (Matplotlib.axes): Somewhere to plot 

903 fig (Matplotlib.figure): Figure of the ax 

904 

905 Returns 

906 None 

907 ''' 

908 strategies = self.strategies if strategies is None else strategies 

909 args = {} if args is None else args 

910 

911 # make sure we have something to plot in 

912 if ax is None: 

913 fig, ax = plt.subplots(1, 1) 

914 elif fig is None: 

915 store = False 

916 

917 # execute the plots for all strategies 

918 for s in strategies: 

919 self.plot_thingA_per_thingB( 

920 s, 

921 thingA=thingA, 

922 thingB=thingB, 

923 recovered=recovered, 

924 ax=ax, 

925 mask=mask, 

926 op=op, 

927 **(plotting_args if plotting_args else {}), 

928 ) 

929 

930 # set the parameters 

931 [plt.setp(ax, k, v) for k, v in args.items()] 

932 

933 if store: 

934 fig.tight_layout() 

935 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True) 

936 plt.close(fig) 

937 

938 return None 

939 

940 def plot_recovery_thresholds( 

941 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs 

942 ): # pragma: no cover 

943 ''' 

944 Plot the recovery rate for a range of thresholds 

945 

946 Args: 

947 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

948 thresh_range (list): thresholds for deciding whether to accept as recovered 

949 ax (Matplotlib.axes): Somewhere to plot 

950 

951 Returns: 

952 None 

953 ''' 

954 # fill default values if nothing is specified 

955 strategies = self.strategies if strategies is None else strategies 

956 

957 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range 

958 if ax is None: 

959 fig, ax = plt.subplots(1, 1) 

960 

961 rec_rates = [[None] * len(thresh_range)] * len(strategies) 

962 for strategy_idx in range(len(strategies)): 

963 strategy = strategies[strategy_idx] 

964 # load the stats 

965 fault_free = self.load(strategy=strategy, faults=False) 

966 with_faults = self.load(strategy=strategy, faults=True) 

967 

968 if recoverable_only: 

969 recoverable_mask = self.get_fixable_faults_only(strategy) 

970 else: 

971 recoverable_mask = self.get_mask(strategy=strategy) 

972 

973 for thresh_idx in range(len(thresh_range)): 

974 rec_mask = self.get_mask( 

975 strategy=strategy, 

976 key='error', 

977 val=(thresh_range[thresh_idx] * fault_free['error'].mean()), 

978 op='gt', 

979 old_mask=recoverable_mask, 

980 ) 

981 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len( 

982 with_faults['error'][recoverable_mask] 

983 ) 

984 

985 ax.plot( 

986 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs} 

987 ) 

988 ax.legend(frameon=False) 

989 ax.set_ylabel('recovery rate') 

990 ax.set_xlabel('threshold as ratio to fault-free error') 

991 

992 return None 

993 

994 def analyse_adaptivity(self, mask): # pragma: no cover 

995 ''' 

996 Analyse a set of runs with adaptivity 

997 

998 Args: 

999 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1000 

1001 Returns: 

1002 None 

1003 ''' 

1004 index = self.get_index(mask) 

1005 dat = self.load() 

1006 

1007 # make a header 

1008 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ') 

1009 print('-------+-----+------+------+----------+----------+----------+----------') 

1010 for i in index: 

1011 e_em, e_glob = self.analyse_adaptivity_single(int(i)) 

1012 print( 

1013 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\ 

1014 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1015 ) 

1016 

1017 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][ 

1018 Adaptivity 

1019 ]['e_tol'] 

1020 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!') 

1021 return None 

1022 

1023 def analyse_adaptivity_single(self, run): # pragma: no cover 

1024 ''' 

1025 Compute what the difference in embedded and global error are for a specific run with adaptivity 

1026 

1027 Args: 

1028 run (int): The run you want to know about 

1029 

1030 Returns: 

1031 list: Embedded error with fault and without for the last iteration in the step with a fault 

1032 list: Global error with and without fault at the end of the run 

1033 ''' 

1034 # perform one run with and one without faults 

1035 stats = [] 

1036 controllers = [] 

1037 for faults in [True, False]: 

1038 s, c, _ = self.single_run( 

1039 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1040 ) 

1041 stats += [s] 

1042 controllers += [c] 

1043 

1044 # figure out when the fault happened 

1045 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1046 

1047 # get embedded error 

1048 e_em = [ 

1049 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1050 for stat in stats 

1051 ] 

1052 

1053 # compute the global error 

1054 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1055 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1056 

1057 return [e_em[i][-1] for i in [0, 1]], e_glob 

1058 

1059 def analyse_HotRod(self, mask): # pragma: no cover 

1060 ''' 

1061 Analyse a set of runs with Hot Rod 

1062 

1063 Args: 

1064 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1065 

1066 Returns: 

1067 None 

1068 ''' 

1069 index = self.get_index(mask) 

1070 dat = self.load() 

1071 

1072 # make a header 

1073 print( 

1074 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \ 

1075| e_glob ' 

1076 ) 

1077 print( 

1078 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\ 

1079+----------' 

1080 ) 

1081 for i in index: 

1082 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i)) 

1083 print( 

1084 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\ 

1085 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \ 

1086{e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1087 ) 

1088 

1089 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][ 

1090 'HotRod_tol' 

1091 ] 

1092 print(f'We only restart when diff > tol = {tol:.2e}!') 

1093 return None 

1094 

1095 def analyse_HotRod_single(self, run): # pragma: no cover 

1096 ''' 

1097 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod 

1098 

1099 Args: 

1100 run (int): The run you want to know about 

1101 

1102 Returns: 

1103 list: Embedded error with fault and without for the last iteration in the step with a fault 

1104 list: Extrapolation error with fault and without for the last iteration in the step with a fault 

1105 list: Global error with and without fault at the end of the run 

1106 ''' 

1107 # perform one run with and one without faults 

1108 stats = [] 

1109 controllers = [] 

1110 for faults in [True, False]: 

1111 s, c, _ = self.single_run( 

1112 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1113 ) 

1114 stats += [s] 

1115 controllers += [c] 

1116 

1117 # figure out when the fault happened 

1118 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1119 

1120 # get embedded error 

1121 e_em = [ 

1122 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1123 for stat in stats 

1124 ] 

1125 # get extrapolated error 

1126 e_ex = [ 

1127 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')] 

1128 for stat in stats 

1129 ] 

1130 

1131 # compute the global error 

1132 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1133 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1134 

1135 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob 

1136 

1137 def print_faults(self, mask=None, strategy=None): # pragma: no cover 

1138 ''' 

1139 Print all faults that happened within a certain mask 

1140 

1141 Args: 

1142 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1143 strategy (Strategy): The resilience strategy you want to see the error for 

1144 

1145 Returns: 

1146 None 

1147 ''' 

1148 strategy = BaseStrategy() if strategy is None else strategy 

1149 

1150 index = self.get_index(mask) 

1151 dat = self.load(strategy=strategy) 

1152 

1153 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0])) 

1154 

1155 # make a header 

1156 print(' run | bit | node | iter | rank | rel err | space pos') 

1157 print('-------+-----+------+------+------+---------+-----------') 

1158 for i in index: 

1159 print( 

1160 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}' 

1161 ) 

1162 

1163 return None 

1164 

1165 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False): 

1166 ''' 

1167 Make a mask to apply to stored data to filter out certain things 

1168 

1169 Args: 

1170 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all 

1171 strategies so None is fine 

1172 key (str): The key in the stored statistics that you want to filter for some value 

1173 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key 

1174 op (str): Operation that is applied for filtering 

1175 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter 

1176 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs 

1177 

1178 Returns: 

1179 Numpy.ndarray with boolean entries that can be used as a mask 

1180 ''' 

1181 strategy = self.strategies[0] if strategy is None else strategy 

1182 dat = self.load(strategy=strategy, faults=True) 

1183 

1184 if compare_faults: 

1185 if val is not None: 

1186 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!') 

1187 else: 

1188 vals = self.load(strategy=strategy, faults=False)[key] 

1189 val = sum(vals) / len(vals) 

1190 

1191 if None in [key, val] and op not in ['isfinite']: 

1192 mask = dat['bit'] == dat['bit'] 

1193 else: 

1194 if op == 'uneq': 

1195 mask = dat[key] != val 

1196 elif op == 'eq': 

1197 mask = dat[key] == val 

1198 elif op == 'leq': 

1199 mask = dat[key] <= val 

1200 elif op == 'geq': 

1201 mask = dat[key] >= val 

1202 elif op == 'lt': 

1203 mask = dat[key] < val 

1204 elif op == 'gt': 

1205 mask = dat[key] > val 

1206 elif op == 'isfinite': 

1207 mask = np.isfinite(dat[key]) 

1208 else: 

1209 raise NotImplementedError(f'Please implement op={op}!') 

1210 

1211 if old_mask is not None: 

1212 return mask & old_mask 

1213 else: 

1214 return mask 

1215 

1216 def get_fixable_faults_only(self, strategy): 

1217 """ 

1218 Return a mask of only faults that can be fixed with a given strategy. 

1219 

1220 Args: 

1221 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1222 

1223 Returns: 

1224 Numpy.ndarray with boolean entries that can be used as a mask 

1225 """ 

1226 fixable = strategy.get_fixable_params( 

1227 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter'] 

1228 ) 

1229 mask = self.get_mask(strategy=strategy) 

1230 

1231 for kwargs in fixable: 

1232 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask) 

1233 

1234 return mask 

1235 

1236 def get_index(self, mask=None): 

1237 ''' 

1238 Get the indeces of all runs in mask 

1239 

1240 Args: 

1241 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1242 

1243 Returns: 

1244 Numpy.ndarray: Array of indeces 

1245 ''' 

1246 if mask is None: 

1247 dat = self.load() 

1248 return np.arange(len(dat['iteration'])) 

1249 else: 

1250 return np.arange(len(mask))[mask] 

1251 

1252 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover 

1253 ''' 

1254 Get information about how many data points for faults we have given a particular mask 

1255 

1256 Args: 

1257 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1258 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1259 strategies, so you don't need to supply this argument 

1260 print_all (bool): Whether to add information that is normally not useful to the table 

1261 ax (Matplotlib.axes): Somewhere to plot the combinations histogram 

1262 

1263 Returns: 

1264 None 

1265 ''' 

1266 

1267 # load some data from which to infer the number occurrences of some event 

1268 strategy = self.strategies[0] if strategy is None else strategy 

1269 dat = self.load(stratagy=strategy, faults=True) 

1270 

1271 # make a dummy mask in case none is supplied 

1272 if mask is None: 

1273 mask = np.ones_like(dat['error'], dtype=bool) 

1274 

1275 # print a header 

1276 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries') 

1277 print('-------------------------------------------------------------') 

1278 

1279 # make a list of all keys that you want to look at 

1280 keys = ['iteration', 'bit', 'node'] 

1281 if print_all: 

1282 keys += ['problem_pos', 'level', 'target'] 

1283 

1284 # print the table 

1285 for key in keys: 

1286 counts, dev, unique = self.count_occurrences(dat[key][mask]) 

1287 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}') 

1288 

1289 return None 

1290 

1291 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover 

1292 ''' 

1293 Make a histogram ouf of the occurrences of combinations 

1294 

1295 Args: 

1296 dat (dict): The data of the recorded statistics 

1297 keys (list): The keys in dat that you want to know the combinations of 

1298 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1299 

1300 Returns: 

1301 Matplotlib.axes: The plot 

1302 ''' 

1303 if ax is None: 

1304 fig, ax = plt.subplots(1, 1) 

1305 

1306 occurrences, bins = self.get_combination_histogram(dat, keys, mask) 

1307 

1308 ax.bar(bins[:-1], occurrences) 

1309 

1310 ax.set_xlabel('Occurrence of combinations') 

1311 

1312 return ax 

1313 

1314 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover 

1315 ''' 

1316 Check how many combinations of values we expect and how many we find to see if we need to do more experiments. 

1317 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied 

1318 

1319 Args: 

1320 dat (dict): The data of the recorded statistics 

1321 keys (list): The keys in dat that you want to know the combinations of 

1322 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1323 

1324 Returns: 

1325 Numpy.ndarray: Number of occurrences of combinations 

1326 Numpy.ndarray: Bins 

1327 ''' 

1328 

1329 # load default values 

1330 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat 

1331 keys = ['iteration', 'bit', 'node'] if keys is None else keys 

1332 if mask is None: 

1333 mask = np.ones_like(dat['error'], dtype=bool) 

1334 

1335 # get unique values and compute how many combinations you expect 

1336 num_unique = [len(np.unique(dat[key][mask])) for key in keys] 

1337 expected_number_of_combinations = np.prod(num_unique) 

1338 

1339 # test what you actually get 

1340 combination_counts = self.get_combination_counts(dat, keys, mask) 

1341 

1342 # make a histogram with the result 

1343 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1)) 

1344 occurrences[0] = expected_number_of_combinations - len(combination_counts) 

1345 

1346 return occurrences, bins 

1347 

1348 def get_max_combinations(self, dat=None, strategy=None): 

1349 ''' 

1350 Count how many combinations of parameters for faults are possible 

1351 

1352 Args: 

1353 dat (dict): The recorded statistics 

1354 keys (list): The keys in dat that you want to know the combinations of 

1355 strategy (Strategy): The resilience strategy 

1356 

1357 Returns: 

1358 int: Number of possible combinations 

1359 ''' 

1360 strategy = self.strategies[0] if strategy is None else strategy 

1361 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True) 

1362 faultHook = get_fault_injector_hook(controller) 

1363 ranges = [ 

1364 (0, faultHook.rnd_params['level_number']), 

1365 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1), 

1366 (1, faultHook.rnd_params['iteration'] + 1), 

1367 (0, faultHook.rnd_params['bit']), 

1368 (0, faultHook.rnd_params['rank']), 

1369 ] 

1370 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']] 

1371 return np.prod([me[1] - me[0] for me in ranges], dtype=int) 

1372 

1373 def get_combination_counts(self, dat, keys, mask): 

1374 ''' 

1375 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary 

1376 numbers of keys 

1377 

1378 Args: 

1379 dat (dict): The data of the recorded statistics 

1380 keys (list): The keys in dat that you want to know the combinations of 

1381 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1382 

1383 Returns: 

1384 list: Occurrences of all combinations 

1385 ''' 

1386 key = keys[0] 

1387 unique_vals = np.unique(dat[key][mask]) 

1388 res = [] 

1389 

1390 for i in range(len(unique_vals)): 

1391 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask) 

1392 if len(keys) > 1: 

1393 res += self.get_combination_counts(dat, keys[1:], inner_mask) 

1394 else: 

1395 res += [self.count_occurrences(dat[key][inner_mask])[0]] 

1396 return res 

1397 

1398 def count_occurrences(self, vals): 

1399 ''' 

1400 Count the occurrences of unique values in vals and compute average deviation from mean 

1401 

1402 Args: 

1403 vals (list): Values you want to check 

1404 

1405 Returns: 

1406 float: Mean of number of occurrences of unique values in vals 

1407 float: Average deviation from mean number of occurrences 

1408 int: Number of unique entries 

1409 ''' 

1410 unique_vals, counts = np.unique(vals, return_counts=True) 

1411 

1412 if len(counts) > 0: 

1413 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts) 

1414 else: 

1415 return None, None, 0 

1416 

1417 def bar_plot_thing( 

1418 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None 

1419 ): # pragma: no cover 

1420 ''' 

1421 Make a bar plot about something! 

1422 

1423 Args: 

1424 x (Numpy.ndarray of dimension 1): x values for bar plot 

1425 thing (str): Some key stored in the stats that will go on the y-axis 

1426 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting 

1427 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

1428 faults (bool): Whether to load stats with faults or without 

1429 name (str): Optional name for the plot 

1430 op (function): Operation that is applied to thing before plotting default is recovery rate 

1431 args (dict): Parameters for how the plot should look 

1432 

1433 Returns: 

1434 None 

1435 ''' 

1436 if ax is None: 

1437 fig, ax = plt.subplots(1, 1) 

1438 store = True 

1439 op = self.mean if op is None else op 

1440 

1441 # get the values for the bars 

1442 height = np.zeros(len(self.strategies)) 

1443 for strategy_idx in range(len(self.strategies)): 

1444 strategy = self.strategies[strategy_idx] 

1445 

1446 # load the values 

1447 dat = self.load(strategy=strategy, faults=faults) 

1448 no_faults = self.load(strategy=strategy, faults=False) 

1449 

1450 # check if we have a mask 

1451 if mask is None: 

1452 mask = np.ones_like(dat[thing], dtype=bool) 

1453 

1454 height[strategy_idx] = op(dat, no_faults, thing, mask) 

1455 

1456 # get some x values 

1457 x = np.arange(len(self.strategies)) if x is None else x 

1458 

1459 # prepare labels 

1460 ticks = [strategy.bar_plot_x_label for strategy in self.strategies] 

1461 

1462 ax.bar(x, height, tick_label=ticks) 

1463 

1464 # set the parameters 

1465 ax.set_ylabel(thing) 

1466 args = {} if args is None else args 

1467 [plt.setp(ax, k, v) for k, v in args.items()] 

1468 

1469 if store: 

1470 fig.tight_layout() 

1471 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True) 

1472 plt.close(fig) 

1473 

1474 return None 

1475 

1476 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover 

1477 func_args = locals() 

1478 func_args.pop('self', None) 

1479 if strategy is None: 

1480 for strat in self.strategies: 

1481 args = {**func_args, 'strategy': strat} 

1482 self.fault_frequency_plot(**args) 

1483 return None 

1484 

1485 # load data 

1486 all_data = {} 

1487 for me in kwargs_range['fault_frequency_iter']: 

1488 self.kwargs['fault_frequency_iter'] = me 

1489 self.get_recovered() 

1490 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular') 

1491 

1492 # get_recovery_rate 

1493 results = {} 

1494 results['frequencies'] = list(all_data.keys()) 

1495 results['recovery_rate'] = [ 

1496 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered']) 

1497 for key in all_data.keys() 

1498 ] 

1499 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()] 

1500 results['iterations'] = [ 

1501 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys() 

1502 ] 

1503 

1504 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style) 

1505 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'}) 

1506 

1507 ax.set_xscale('log') 

1508 ax.set_xlabel('iterations between fault') 

1509 ax.set_ylabel('recovery rate') 

1510 iter_ax.set_ylabel('average total iterations if not crashed (dashed)') 

1511 ax.legend(frameon=False) 

1512 

1513 return None 

1514 

1515 def get_HR_tol(self, verbose=False): 

1516 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

1517 

1518 HR_strategy = HotRodStrategy(useMPI=self.use_MPI) 

1519 

1520 description = HR_strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

1521 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2 

1522 

1523 stats, _, _ = self.single_run(HR_strategy, force_params=description) 

1524 

1525 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate') 

1526 diff = [] 

1527 for i in range(len(e_extrapolation)): 

1528 if e_extrapolation[i][1] is not None: 

1529 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0]) 

1530 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])] 

1531 

1532 max_diff = max(diff) 

1533 proposed_tol = 2 * max_diff 

1534 if verbose: 

1535 print( 

1536 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.' 

1537 ) 

1538 else: 

1539 print(f'{proposed_tol:.6e}') 

1540 

1541 

1542def check_local_error(): # pragma: no cover 

1543 """ 

1544 Make a plot of the resolution over time for all problems 

1545 """ 

1546 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1547 problems = [run_quench] 

1548 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()] 

1549 

1550 for i in range(len(problems)): 

1551 stats_analyser = FaultStats( 

1552 prob=problems[i], 

1553 strategies=strategies, 

1554 faults=[False], 

1555 reload=True, 

1556 recovery_thresh=1.1, 

1557 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1558 num_procs=1, 

1559 mode='random', 

1560 ) 

1561 stats_analyser.compare_strategies() 

1562 plt.show() 

1563 

1564 

1565def parse_args(): 

1566 import sys 

1567 

1568 kwargs = {} 

1569 

1570 for i in range(len(sys.argv)): 

1571 if 'prob' in sys.argv[i]: 

1572 if sys.argv[i + 1] == 'run_Lorenz': 

1573 kwargs['prob'] = run_Lorenz 

1574 elif sys.argv[i + 1] == 'run_vdp': 

1575 kwargs['prob'] = run_vdp 

1576 elif sys.argv[i + 1] == 'run_Schroedinger': 

1577 kwargs['prob'] = run_Schroedinger 

1578 elif sys.argv[i + 1] == 'run_quench': 

1579 kwargs['prob'] = run_quench 

1580 elif sys.argv[i + 1] == 'run_AC': 

1581 kwargs['prob'] = run_AC 

1582 elif sys.argv[i + 1] == 'run_RBC': 

1583 kwargs['prob'] = run_RBC 

1584 elif sys.argv[i + 1] == 'run_GS': 

1585 kwargs['prob'] = run_GS 

1586 else: 

1587 raise NotImplementedError 

1588 elif 'num_procs' in sys.argv[i]: 

1589 kwargs['num_procs'] = int(sys.argv[i + 1]) 

1590 elif 'runs' in sys.argv[i]: 

1591 kwargs['runs'] = int(sys.argv[i + 1]) 

1592 elif 'mode' in sys.argv[i]: 

1593 kwargs['mode'] = str(sys.argv[i + 1]) 

1594 elif 'reload' in sys.argv[i]: 

1595 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True 

1596 

1597 return kwargs 

1598 

1599 

1600def compare_adaptivity_modes(): 

1601 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep 

1602 

1603 fig, axs = plt.subplots(2, 2, sharey=True) 

1604 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1605 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench'] 

1606 

1607 for i in range(len(problems)): 

1608 ax = axs.flatten()[i] 

1609 

1610 ax.set_title(titles[i]) 

1611 

1612 stats_analyser = FaultStats( 

1613 prob=problems[i], 

1614 strategies=[AdaptivityStrategy()], 

1615 faults=[False, True], 

1616 reload=True, 

1617 recovery_thresh=1.1, 

1618 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1619 num_procs=1, 

1620 mode='default', 

1621 stats_path='data/stats-jusuf', 

1622 ) 

1623 stats_analyser.get_recovered() 

1624 

1625 for strategy in stats_analyser.strategies: 

1626 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1627 stats_analyser.plot_things_per_things( 

1628 'recovered', 

1629 'bit', 

1630 False, 

1631 strategies=[strategy], 

1632 op=stats_analyser.rec_rate, 

1633 mask=fixable, 

1634 args={'ylabel': 'recovery rate', 'label': 'bla'}, 

1635 name='fixable_recovery', 

1636 ax=ax, 

1637 ) 

1638 single_proc = ax.lines[-1] 

1639 single_proc.set_label('single process') 

1640 single_proc.set_color('black') 

1641 

1642 stats_analyser = FaultStats( 

1643 prob=problems[i], 

1644 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()], 

1645 faults=[False, True], 

1646 reload=True, 

1647 recovery_thresh=1.5, 

1648 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1649 num_procs=4, 

1650 mode='default', 

1651 stats_path='data/stats-jusuf', 

1652 ) 

1653 stats_analyser.get_recovered() 

1654 

1655 for strategy in stats_analyser.strategies: 

1656 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1657 stats_analyser.plot_things_per_things( 

1658 'recovered', 

1659 'bit', 

1660 False, 

1661 strategies=[strategy], 

1662 op=stats_analyser.rec_rate, 

1663 mask=fixable, 

1664 args={'ylabel': 'recovery rate'}, 

1665 name='fixable_recovery', 

1666 ax=ax, 

1667 ) 

1668 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks') 

1669 fig.tight_layout() 

1670 plt.show() 

1671 

1672 

1673def main(): 

1674 kwargs = { 

1675 'prob': run_vdp, 

1676 'num_procs': 1, 

1677 'mode': 'default', 

1678 'runs': 4000, 

1679 'reload': True, 

1680 **parse_args(), 

1681 } 

1682 

1683 strategy_args = { 

1684 'stop_at_nan': True, 

1685 } 

1686 

1687 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy 

1688 

1689 stats_analyser = FaultStats( 

1690 strategies=[ 

1691 BaseStrategy(**strategy_args), 

1692 AdaptivityStrategy(**strategy_args), 

1693 kAdaptivityStrategy(**strategy_args), 

1694 HotRodStrategy(**strategy_args), 

1695 AdaptivityPolynomialError(**strategy_args), 

1696 ], 

1697 faults=[False, True], 

1698 recovery_thresh=1.1, 

1699 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0), 

1700 stats_path='data/stats-jusuf', 

1701 **kwargs, 

1702 ) 

1703 stats_analyser.get_recovered() 

1704 

1705 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=8) 

1706 

1707 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data 

1708 return None 

1709 

1710 stats_analyser.get_recovered() 

1711 mask = None 

1712 

1713 # stats_analyser.compare_strategies() 

1714 stats_analyser.plot_things_per_things( 

1715 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1716 ) 

1717 stats_analyser.plot_things_per_things( 

1718 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1719 ) 

1720 

1721 stats_analyser.plot_things_per_things( 

1722 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1723 ) 

1724 

1725 # make a plot for only the faults that can be recovered 

1726 fig, ax = plt.subplots(1, 1) 

1727 for strategy in stats_analyser.strategies: 

1728 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1729 stats_analyser.plot_things_per_things( 

1730 'recovered', 

1731 'bit', 

1732 False, 

1733 strategies=[strategy], 

1734 op=stats_analyser.rec_rate, 

1735 mask=fixable, 

1736 args={'ylabel': 'recovery rate'}, 

1737 name='fixable_recovery', 

1738 ax=ax, 

1739 ) 

1740 fig.tight_layout() 

1741 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True) 

1742 

1743 fig, ax = plt.subplots(1, 1, figsize=(13, 4)) 

1744 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000)) 

1745 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold') 

1746 ax.set_xscale('log') 

1747 ax.legend(frameon=False) 

1748 fig.tight_layout() 

1749 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True) 

1750 

1751 stats_analyser.plot_things_per_things( 

1752 'total_iteration', 

1753 'bit', 

1754 True, 

1755 op=stats_analyser.mean, 

1756 mask=mask, 

1757 args={'yscale': 'log', 'ylabel': 'total iterations'}, 

1758 ) 

1759 stats_analyser.plot_things_per_things( 

1760 'total_iteration', 

1761 'bit', 

1762 True, 

1763 op=stats_analyser.extra_mean, 

1764 mask=mask, 

1765 args={'yscale': 'linear', 'ylabel': 'extra iterations'}, 

1766 name='extra_iter', 

1767 ) 

1768 stats_analyser.plot_things_per_things( 

1769 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'} 

1770 ) 

1771 

1772 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100)) 

1773 plt.show() 

1774 

1775 

1776if __name__ == "__main__": 

1777 main() 

1778 # compare_adaptivity_modes() 

1779 # check_local_error()