Coverage for pySDC/projects/Resilience/fault_stats.py: 57%

445 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-16 14:51 +0000

1import numpy as np 

2import matplotlib.pyplot as plt 

3from mpi4py import MPI 

4import pickle 

5 

6import pySDC.helpers.plot_helper as plot_helper 

7from pySDC.helpers.stats_helper import get_sorted 

8 

9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData 

10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook 

11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity 

13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep 

14from pySDC.implementations.hooks.log_work import LogWork 

15 

16# these problems are available for testing 

17from pySDC.projects.Resilience.advection import run_advection 

18from pySDC.projects.Resilience.vdp import run_vdp 

19from pySDC.projects.Resilience.piline import run_piline 

20from pySDC.projects.Resilience.Lorenz import run_Lorenz 

21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger 

22from pySDC.projects.Resilience.quench import run_quench 

23from pySDC.projects.Resilience.AC import run_AC 

24from pySDC.projects.Resilience.RBC import run_RBC 

25 

26from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy 

27import logging 

28 

29plot_helper.setup_mpl(reset=True) 

30 

31LOGGER_LEVEL = 40 

32 

33RECOVERY_THRESH_ABS = { 

34 # run_quench: 5e-3, 

35 # run_Schroedinger: 1.7e-6, 

36} 

37 

38 

39class FaultStats: 

40 ''' 

41 Class to generate and analyse fault statistics 

42 ''' 

43 

44 def __init__( 

45 self, 

46 prob=None, 

47 strategies=None, 

48 faults=None, 

49 reload=True, 

50 recovery_thresh=1 + 1e-3, 

51 recovery_thresh_abs=0.0, 

52 num_procs=1, 

53 mode='default', 

54 stats_path='data/stats', 

55 use_MPI=False, 

56 **kwargs, 

57 ): 

58 ''' 

59 Initialization routine 

60 

61 Args: 

62 prob: A function that runs a pySDC problem, see imports for available problems 

63 strategies (list): List of resilience strategies 

64 faults (list): List of booleans that describe whether to use faults or not 

65 reload (bool): Load previously computed statistics and continue from there or start from scratch 

66 recovery_thresh (float): Relative threshold for recovery 

67 num_procs (int): Number of processes 

68 mode (str): Mode for fault generation: Either 'random' or 'combination' 

69 ''' 

70 self.prob = prob 

71 self.strategies = [None] if strategies is None else strategies 

72 self.faults = [False, True] if faults is None else faults 

73 self.reload = reload 

74 self.recovery_thresh = recovery_thresh 

75 self.recovery_thresh_abs = recovery_thresh_abs 

76 self.num_procs = num_procs 

77 self.use_MPI = use_MPI 

78 self.stats_path = stats_path 

79 self.kwargs = { 

80 'fault_frequency_iter': 500, 

81 **kwargs, 

82 } 

83 

84 # decide mode 

85 if mode == 'default': 

86 if prob.__name__ in ['run_vdp', 'run_Lorenz']: 

87 self.mode = 'combination' 

88 else: 

89 self.mode = 'random' 

90 else: 

91 self.mode = mode 

92 

93 self.logger = logging.getLogger('FaultStats') 

94 self.logger.level = LOGGER_LEVEL 

95 

96 msg = 'Starting FaultStats with attributes' 

97 for key, val in self.__dict__.items(): 

98 if key in ['logger']: 

99 continue 

100 item = [str(me) for me in val] if type(val) == list else str(val) 

101 msg += '\n\t' f'{key}: {item}' 

102 self.logger.log(25, msg) 

103 

104 def get_Tend(self): 

105 ''' 

106 Get the final time of runs for fault stats based on the problem 

107 

108 Returns: 

109 float: Tend to put into the run 

110 ''' 

111 return self.strategies[0].get_Tend(self.prob, self.num_procs) 

112 

113 def run_stats_generation( 

114 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0 

115 ): 

116 ''' 

117 Run the generation of stats for all strategies in the `self.strategies` variable 

118 

119 Args: 

120 runs (int): Number of runs you want to do 

121 step (int): Number of runs you want to do between saving 

122 comm (MPI.Communicator): Communicator for distributing runs 

123 faults (bool): Whether to do stats with faults or without 

124 kw_args_range (dict): Range for the parameters 

125 _reload, _runs_partial: Variables only used for recursion. Do not change! 

126 

127 Returns: 

128 None 

129 ''' 

130 if faults is None: 

131 for f in self.faults: 

132 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f) 

133 return None 

134 

135 for key, val in kwargs_range.items() if kwargs_range is not None else {}: 

136 if type(val) == int: 

137 self.kwargs[key] = val 

138 else: 

139 for me in val: 

140 kwargs_range_me = {**kwargs_range, key: me} 

141 self.run_stats_generation( 

142 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults 

143 ) 

144 return None 

145 

146 comm = MPI.COMM_WORLD if comm is None else comm 

147 step = (runs if comm.size == 1 else comm.size) if step is None else step 

148 _runs_partial = step if _runs_partial == 0 else _runs_partial 

149 reload = self.reload or _reload 

150 

151 # see if we limit the number of runs we want to do 

152 max_runs = ( 

153 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5) 

154 ) 

155 

156 if reload: 

157 # sort the strategies to do some load balancing 

158 sorting_index = None 

159 if comm.rank == 0: 

160 already_completed = np.array( 

161 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies] 

162 ) 

163 sorting_index_ = np.argsort(already_completed) 

164 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs] 

165 _runs_partial = max(min(already_completed), _runs_partial) 

166 

167 # tell all ranks what strategies to use 

168 sorting_index = comm.bcast(sorting_index, root=0) 

169 _runs_partial = comm.bcast(_runs_partial, root=0) 

170 strategies = [self.strategies[i] for i in sorting_index] 

171 if len(strategies) == 0: # check if we are already done 

172 return None 

173 else: 

174 strategies = self.strategies 

175 

176 strategy_comm = comm.Split(comm.rank % len(strategies)) 

177 

178 for j in range(0, len(strategies), comm.size): 

179 self.generate_stats( 

180 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))], 

181 runs=min(_runs_partial, max_runs), 

182 faults=faults, 

183 reload=reload, 

184 comm=strategy_comm, 

185 ) 

186 strategy_comm.Free() 

187 self.run_stats_generation( 

188 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step 

189 ) 

190 

191 return None 

192 

193 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None): 

194 ''' 

195 Generate statistics for recovery from bit flips 

196 ----------------------------------------------- 

197 

198 Every run is given a different random seed such that we have different faults and the results are then stored 

199 

200 Args: 

201 strategy (Strategy): Resilience strategy 

202 runs (int): Number of runs you want to do 

203 reload (bool): Load previously computed statistics and continue from there or start from scratch 

204 faults (bool): Whether to do stats with faults or without 

205 comm (MPI.Communicator): Communicator for distributing runs 

206 

207 Returns: 

208 None 

209 ''' 

210 comm = MPI.COMM_WORLD if comm is None else comm 

211 

212 # initialize dictionary to store the stats in 

213 dat = { 

214 'level': np.zeros(runs), 

215 'iteration': np.zeros(runs), 

216 'node': np.zeros(runs), 

217 'problem_pos': [], 

218 'bit': np.zeros(runs), 

219 'error': np.zeros(runs), 

220 'total_iteration': np.zeros(runs), 

221 'total_newton_iteration': np.zeros(runs), 

222 'restarts': np.zeros(runs), 

223 'target': np.zeros(runs), 

224 'rank': np.zeros(runs), 

225 } 

226 

227 # store arguments for storing and loading 

228 identifier_args = { 

229 'faults': faults, 

230 'strategy': strategy, 

231 } 

232 

233 # reload previously recorded stats and write them to dat 

234 if reload: 

235 already_completed_ = None 

236 if comm.rank == 0: 

237 already_completed_ = self.load(**identifier_args) 

238 already_completed = comm.bcast(already_completed_, root=0) 

239 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0: 

240 avail_len = min([already_completed['runs'], runs]) 

241 for k in dat.keys(): 

242 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len) 

243 else: 

244 already_completed = {'runs': 0} 

245 

246 # prepare a message 

247 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0) 

248 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}' 

249 if comm.rank == 0 and already_completed['runs'] < runs: 

250 print(msg, flush=True) 

251 

252 space_comm = MPI.COMM_SELF.Split(True) 

253 

254 # perform the remaining experiments 

255 for i in range(already_completed['runs'], runs): 

256 if i % comm.size != comm.rank: 

257 continue 

258 

259 # perform a single experiment with the correct random seed 

260 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm) 

261 

262 # get the data from the stats 

263 faults_run = get_sorted(stats, type='bitflip') 

264 

265 if faults: 

266 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!" 

267 dat['level'][i] = faults_run[0][1][0] 

268 dat['iteration'][i] = faults_run[0][1][1] 

269 dat['node'][i] = faults_run[0][1][2] 

270 dat['problem_pos'] += [faults_run[0][1][3]] 

271 dat['bit'][i] = faults_run[0][1][4] 

272 dat['target'][i] = faults_run[0][1][5] 

273 dat['rank'][i] = faults_run[0][1][6] 

274 

275 if crash: 

276 print('Code crashed!') 

277 dat['error'][i] = np.inf 

278 continue 

279 

280 # record the rest of the data 

281 t, u = get_sorted(stats, type='u', recomputed=False)[-1] 

282 error = self.get_error(u, t, controller, strategy, self.get_Tend()) 

283 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')]) 

284 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')]) 

285 

286 dat['error'][i] = error 

287 dat['total_iteration'][i] = total_iteration 

288 dat['total_newton_iteration'][i] = total_newton_iteration 

289 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')]) 

290 

291 # free the space communicator 

292 space_comm.Free() 

293 

294 dat_full = {} 

295 for k in dat.keys(): 

296 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM) 

297 

298 # store the completed stats 

299 dat_full['runs'] = runs 

300 

301 if already_completed['runs'] < runs: 

302 if comm.rank == 0: 

303 self.store(dat_full, **identifier_args) 

304 if self.faults: 

305 self.get_recovered(strategy=strategy) 

306 

307 return None 

308 

309 def get_error(self, u, t, controller, strategy, Tend): 

310 """ 

311 Compute the error. 

312 

313 Args: 

314 u (dtype_u): The solution at the end of the run 

315 t (float): Time at which `u` was recorded 

316 controller (pySDC.controller.controller): The controller 

317 strategy (Strategy): The resilience strategy 

318 Tend (float): Time you want to run to 

319 

320 Returns: 

321 float: Error 

322 """ 

323 lvl = controller.MS[0].levels[0] 

324 

325 # decide if we reached Tend 

326 Tend_reached = t + lvl.params.dt_initial >= Tend 

327 

328 if Tend_reached: 

329 u_star = lvl.prob.u_exact(t=t) 

330 return abs(u - u_star) / abs(u_star) 

331 else: 

332 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}') 

333 return np.inf 

334 

335 def single_run( 

336 self, 

337 strategy, 

338 run=0, 

339 faults=False, 

340 force_params=None, 

341 hook_class=None, 

342 space_comm=None, 

343 Tend=None, 

344 time_comm=None, 

345 ): 

346 ''' 

347 Run the problem once with the specified parameters 

348 

349 Args: 

350 strategy (Strategy): The resilience strategy you plan on using 

351 run (int): Index for fault generation 

352 faults (bool): Whether or not to put faults in 

353 force_params (dict): Change parameters in the description of the problem 

354 space_comm (MPI.Communicator): A communicator for space parallelisation 

355 Tend (float): Time you want to run to 

356 time_comm (MPI.Intracomm.Communicator): Communicator in time 

357 

358 Returns: 

359 dict: Stats object containing statistics for each step, each level and each iteration 

360 pySDC.Controller: The controller of the run 

361 float: The time the problem should have run to 

362 ''' 

363 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class) 

364 force_params = {} if force_params is None else force_params 

365 

366 # build the custom description 

367 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

368 for k in force_params.keys(): 

369 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]} 

370 

371 custom_controller_params = { 

372 'logger_level': LOGGER_LEVEL, 

373 **force_params.get('controller_params', {}), 

374 } 

375 

376 if faults: 

377 fault_stuff = { 

378 'rng': None, 

379 'args': strategy.get_fault_args(self.prob, self.num_procs), 

380 'rnd_params': strategy.get_random_params(self.prob, self.num_procs), 

381 } 

382 

383 # make parameters for faults: 

384 if self.mode == 'random': 

385 fault_stuff['rng'] = np.random.RandomState(run) 

386 elif self.mode == 'combination': 

387 fault_stuff['rng'] = run 

388 elif self.mode == 'regular': 

389 fault_stuff['rng'] = np.random.RandomState(run) 

390 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter'] 

391 fault_stuff['rnd_params'] = { 

392 'bit': 12, 

393 'min_node': 1, 

394 } 

395 else: 

396 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}') 

397 

398 else: 

399 fault_stuff = None 

400 

401 return self.prob( 

402 custom_description=custom_description, 

403 num_procs=self.num_procs, 

404 hook_class=hook_class, 

405 fault_stuff=fault_stuff, 

406 Tend=self.get_Tend() if Tend is None else Tend, 

407 custom_controller_params=custom_controller_params, 

408 space_comm=space_comm, 

409 comm=time_comm, 

410 use_MPI=time_comm is not None, 

411 ) 

412 

413 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover 

414 ''' 

415 Take a closer look at how the strategies compare for a specific run 

416 

417 Args: 

418 run (int): The number of the run to get the appropriate random generator 

419 faults (bool): Whether or not to include faults 

420 ax (Matplotlib.axes): Somewhere to plot 

421 

422 Returns: 

423 None 

424 ''' 

425 if ax is None: 

426 fig, ax = plt.subplots(1, 1) 

427 store = True 

428 else: 

429 store = False 

430 

431 k_ax = ax.twinx() 

432 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies] 

433 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))] 

434 

435 # make a legend 

436 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies] 

437 k_ax.legend(frameon=True) 

438 

439 if store: 

440 fig.tight_layout() 

441 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True) 

442 

443 def scrutinize_visual( 

444 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False 

445 ): # pragma: no cover 

446 ''' 

447 Take a closer look at a specific run with a plot 

448 

449 Args: 

450 strategy (Strategy): The resilience strategy you plan on using 

451 run (int): The number of the run to get the appropriate random generator 

452 faults (bool): Whether or not to include faults 

453 ax (Matplotlib.axes): Somewhere to plot the error 

454 k_ax (Matplotlib.axes): Somewhere to plot the iterations 

455 plot_restarts (bool): Make vertical lines wherever restarts happened 

456 

457 Returns: 

458 dict: Stats from the run 

459 ''' 

460 if ax is None: 

461 fig, ax = plt.subplots(1, 1) 

462 store = True 

463 else: 

464 store = False 

465 

466 force_params = {} 

467 

468 stats, controller, Tend = self.single_run( 

469 strategy=strategy, 

470 run=run, 

471 faults=faults, 

472 force_params=force_params, 

473 hook_class=hook_collection + [LogLocalErrorPostStep, LogData], 

474 ) 

475 

476 # plot the local error 

477 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False) 

478 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls) 

479 

480 # plot the iterations 

481 k_ax = ax.twinx() if k_ax is None else k_ax 

482 k = get_sorted(stats, type='k') 

483 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--') 

484 

485 # plot the faults 

486 faults = get_sorted(stats, type='bitflip') 

487 for fault_time in [me[0] for me in faults]: 

488 ax.axvline(fault_time, color='grey', ls=':') 

489 

490 # plot restarts 

491 if plot_restarts: 

492 restarts = get_sorted(stats, type='restart') 

493 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts] 

494 

495 # decorate 

496 ax.set_yscale('log') 

497 ax.set_ylabel(r'$\epsilon$') 

498 k_ax.set_ylabel('cumulative iterations (dashed)') 

499 ax.set_xlabel(r'$t$') 

500 

501 if store: 

502 fig.tight_layout() 

503 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True) 

504 

505 def scrutinize(self, strategy, logger_level=15, **kwargs): 

506 ''' 

507 Take a closer look at a specific run 

508 

509 Args: 

510 strategy (Strategy): The resilience strategy you plan on using 

511 

512 Returns: 

513 None 

514 ''' 

515 from pySDC.projects.Resilience.fault_injection import FaultInjector 

516 

517 force_params = {} 

518 force_params['controller_params'] = {'logger_level': logger_level} 

519 force_params['sweeper_params'] = {'skip_residual_computation': ()} 

520 

521 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs) 

522 

523 u_all = get_sorted(stats, type='u', recomputed=False) 

524 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])] 

525 k = [me[1] for me in get_sorted(stats, type='k')] 

526 

527 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector] 

528 

529 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else [] 

530 

531 print(f'\nOverview for {strategy.name} strategy') 

532 # print faults 

533 faults = get_sorted(stats, type='bitflip') 

534 print('\nfaults:') 

535 print(' t | level | iter | node | bit | trgt | rank | pos') 

536 print('--------+-------+------+------+-----+------+---------------------') 

537 for f in faults: 

538 print( 

539 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |', 

540 f[1][3], 

541 ) 

542 print('--------+-------+------+------+-----+------+---------------------') 

543 for f in unhappened_faults: 

544 print( 

545 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |', 

546 f.problem_pos, 

547 ) 

548 

549 # see if we can determine if the faults where recovered 

550 no_faults = self.load(strategy=strategy, faults=False) 

551 e_star = np.mean(no_faults.get('error', [0])) 

552 error = self.get_error(u, t, controller, strategy, Tend) 

553 recovery_thresh = self.get_thresh(strategy) 

554 

555 print( 

556 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}' 

557 ) 

558 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},') 

559 

560 _newton_iter = get_sorted(stats, type='work_newton') 

561 if len(_newton_iter) > 0: 

562 newton_iter = [me[1] for me in _newton_iter] 

563 print( 

564 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},' 

565 ) 

566 

567 # checkout the step size 

568 dt = [me[1] for me in get_sorted(stats, type='dt')] 

569 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}') 

570 

571 # restarts 

572 restarts = [me[1] for me in get_sorted(stats, type='restart')] 

573 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}') 

574 

575 return stats 

576 

577 def convert_faults(self, faults): 

578 ''' 

579 Make arrays of useable data from an entry in the stats object returned by pySDC 

580 

581 Args: 

582 faults (list): The entry for faults returned by the pySDC run 

583 

584 Returns: 

585 list: The times when the faults happened 

586 list: The levels in which the faults happened 

587 list: The iterations in which the faults happened 

588 list: The nodes in which the faults happened 

589 list: The problem positions in which the faults happened 

590 list: The bits in which the faults happened 

591 ''' 

592 time = [faults[i][0] for i in range(len(faults))] 

593 level = [faults[i][1][0] for i in range(len(faults))] 

594 iteration = [faults[i][1][1] for i in range(len(faults))] 

595 node = [faults[i][1][2] for i in range(len(faults))] 

596 problem_pos = [faults[i][1][3] for i in range(len(faults))] 

597 bit = [faults[i][1][4] for i in range(len(faults))] 

598 return time, level, iteration, node, problem_pos, bit 

599 

600 def get_path(self, **kwargs): 

601 ''' 

602 Get the path to where the stats are stored 

603 

604 Args: 

605 strategy (Strategy): The resilience strategy 

606 faults (bool): Whether or not faults have been activated 

607 

608 Returns: 

609 str: The path to what you are looking for 

610 ''' 

611 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle' 

612 

613 def get_name(self, strategy=None, faults=True, mode=None): 

614 ''' 

615 Function to get a unique name for a kind of statistics based on the problem and strategy that was used 

616 

617 Args: 

618 strategy (Strategy): Resilience strategy 

619 faults (bool): Whether or not faults where inserted 

620 

621 Returns: 

622 str: The unique identifier 

623 ''' 

624 if self.prob == run_advection: 

625 prob_name = 'advection' 

626 elif self.prob == run_vdp: 

627 prob_name = 'vdp' 

628 elif self.prob == run_piline: 

629 prob_name = 'piline' 

630 elif self.prob == run_Lorenz: 

631 prob_name = 'Lorenz' 

632 elif self.prob == run_Schroedinger: 

633 prob_name = 'Schroedinger' 

634 elif self.prob == run_quench: 

635 prob_name = 'Quench' 

636 elif self.prob.__name__ == 'run_AC': 

637 prob_name = 'Allen-Cahn' 

638 elif self.prob.__name__ == 'run_RBC': 

639 prob_name = 'Rayleigh-Benard' 

640 else: 

641 raise NotImplementedError(f'Name not implemented for problem {self.prob}') 

642 

643 if faults: 

644 fault_name = '-faults' 

645 else: 

646 fault_name = '' 

647 

648 if strategy is not None: 

649 strategy_name = f'-{strategy.name}' 

650 else: 

651 strategy_name = '' 

652 

653 mode = self.mode if mode is None else mode 

654 if mode == 'regular': 

655 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}' 

656 else: 

657 mode_thing = '' 

658 

659 mpi_thing = '-MPI' if self.use_MPI else '' 

660 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}' 

661 

662 def store(self, dat, **kwargs): 

663 ''' 

664 Stores the data for a run at a predefined path 

665 

666 Args: 

667 dat (dict): The data of the recorded statistics 

668 

669 Returns: 

670 None 

671 ''' 

672 with open(self.get_path(**kwargs), 'wb') as f: 

673 pickle.dump(dat, f) 

674 return None 

675 

676 def load(self, **kwargs): 

677 ''' 

678 Loads the stats belonging to a specific strategy and whether or not faults where inserted. 

679 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs, 

680 which is 0 of course. 

681 

682 Returns: 

683 dict: Data from previous run or if it is not available a placeholder dictionary 

684 ''' 

685 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)]) 

686 

687 try: 

688 with open(self.get_path(**kwargs), 'rb') as f: 

689 dat = pickle.load(f) 

690 except FileNotFoundError: 

691 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!') 

692 return {'runs': 0} 

693 return dat 

694 

695 def get_thresh(self, strategy=None): 

696 """ 

697 Get recovery threshold based on relative and absolute tolerances 

698 

699 Args: 

700 strategy (Strategy): The resilience strategy 

701 """ 

702 fault_free = self.load(strategy=strategy, faults=False) 

703 assert ( 

704 fault_free['error'].std() / fault_free['error'].mean() < 1e-5 

705 ), f'Got too large variation of errors in {strategy} strategy!' 

706 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()]) 

707 

708 def get_recovered(self, **kwargs): 

709 ''' 

710 Determine the recovery rate for a specific strategy and store it to disk. 

711 

712 Returns: 

713 None 

714 ''' 

715 if 'strategy' not in kwargs.keys(): 

716 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies] 

717 else: 

718 try: 

719 with_faults = self.load(faults=True, **kwargs) 

720 with_faults['recovered'] = with_faults['error'] < self.get_thresh(kwargs['strategy']) 

721 self.store(faults=True, dat=with_faults, **kwargs) 

722 except KeyError as error: 

723 print( 

724 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}' 

725 ) 

726 

727 return None 

728 

729 def crash_rate(self, dat, no_faults, thingA, mask): 

730 ''' 

731 Determine the rate of runs that crashed 

732 

733 Args: 

734 dat (dict): The data of the recorded statistics with faults 

735 no_faults (dict): The data of the corresponding fault-free stats 

736 thingA (str): Some key stored in the stats that will go on the y-axis 

737 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate 

738 

739 Returns: 

740 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask 

741 ''' 

742 if len(dat[thingA][mask]) > 0: 

743 crash = dat['error'] == np.inf 

744 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask]) 

745 else: 

746 return None 

747 

748 def rec_rate(self, dat, no_faults, thingA, mask): 

749 ''' 

750 Operation for plotting which returns the recovery rate for a given mask. 

751 Which thingA you apply this to actually does not matter here since we compute a rate. 

752 

753 Args: 

754 dat (dict): The recorded statistics 

755 no_faults (dict): The corresponding fault-free stats 

756 thingA (str): Some key stored in the stats 

757 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

758 

759 Returns: 

760 float: Recovery rate 

761 ''' 

762 if len(dat[thingA][mask]) > 0: 

763 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask]) 

764 else: 

765 return None 

766 

767 def mean(self, dat, no_faults, thingA, mask): 

768 ''' 

769 Operation for plotting which returns the mean of thingA after applying the mask 

770 

771 Args: 

772 dat (dict): The recorded statistics 

773 no_faults (dict): The corresponding fault-free stats 

774 thingA (str): Some key stored in the stats 

775 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

776 

777 Returns: 

778 float: Mean of thingA after applying mask 

779 ''' 

780 return np.mean(dat[thingA][mask]) 

781 

782 def extra_mean(self, dat, no_faults, thingA, mask): 

783 ''' 

784 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after 

785 applying the mask 

786 

787 Args: 

788 dat (dict): The recorded statistics 

789 no_faults (dict): The corresponding fault-free stats 

790 thingA (str): Some key stored in the stats 

791 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

792 

793 Returns: 

794 float: Difference in mean of thingA between runs with and without faults after applying mask 

795 ''' 

796 if True in mask or int in [type(me) for me in mask]: 

797 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA]) 

798 else: 

799 return None 

800 

801 def plot_thingA_per_thingB( 

802 self, 

803 strategy, 

804 thingA, 

805 thingB, 

806 ax=None, 

807 mask=None, 

808 recovered=False, 

809 op=None, 

810 **kwargs, 

811 ): # pragma: no cover 

812 ''' 

813 Plot thingA vs. thingB for a single strategy 

814 

815 Args: 

816 strategy (Strategy): The resilience strategy you want to plot 

817 thingA (str): Some key stored in the stats that will go on the y-axis 

818 thingB (str): Some key stored in the stats that will go on the x-axis 

819 ax (Matplotlib.axes): Somewhere to plot 

820 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

821 recovered (bool): Show the plot for both all runs and only the recovered ones 

822 op (function): Operation that is applied to thingA before plotting default is recovery rate 

823 

824 Returns: 

825 None 

826 ''' 

827 op = self.rec_rate if op is None else op 

828 dat = self.load(strategy=strategy, faults=True) 

829 no_faults = self.load(strategy=strategy, faults=False) 

830 

831 if mask is None: 

832 mask = np.ones_like(dat[thingB], dtype=bool) 

833 

834 admissable_thingB = np.unique(dat[thingB][mask]) 

835 me = np.zeros(len(admissable_thingB)) 

836 me_recovered = np.zeros_like(me) 

837 

838 for i in range(len(me)): 

839 _mask = (dat[thingB] == admissable_thingB[i]) & mask 

840 if _mask.any(): 

841 me[i] = op(dat, no_faults, thingA, _mask) 

842 me_recovered[i] = op(dat, no_faults, thingA, _mask & dat['recovered']) 

843 

844 if recovered: 

845 ax.plot( 

846 admissable_thingB, 

847 me_recovered, 

848 label=f'{strategy.label} (only recovered)', 

849 color=strategy.color, 

850 marker=strategy.marker, 

851 ls='--', 

852 linewidth=3, 

853 **kwargs, 

854 ) 

855 

856 ax.plot( 

857 admissable_thingB, 

858 me, 

859 label=f'{strategy.label}', 

860 color=strategy.color, 

861 marker=strategy.marker, 

862 linewidth=2, 

863 **kwargs, 

864 ) 

865 

866 ax.legend(frameon=False) 

867 ax.set_xlabel(thingB) 

868 ax.set_ylabel(thingA) 

869 return None 

870 

871 def plot_things_per_things( 

872 self, 

873 thingA='bit', 

874 thingB='bit', 

875 recovered=False, 

876 mask=None, 

877 op=None, 

878 args=None, 

879 strategies=None, 

880 name=None, 

881 store=False, 

882 ax=None, 

883 fig=None, 

884 plotting_args=None, 

885 ): # pragma: no cover 

886 ''' 

887 Plot thingA vs thingB for multiple strategies 

888 

889 Args: 

890 thingA (str): Some key stored in the stats that will go on the y-axis 

891 thingB (str): Some key stored in the stats that will go on the x-axis 

892 recovered (bool): Show the plot for both all runs and only the recovered ones 

893 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

894 op (function): Operation that is applied to thingA before plotting default is recovery rate 

895 args (dict): Parameters for how the plot should look 

896 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

897 name (str): Optional name for the plot 

898 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

899 ax (Matplotlib.axes): Somewhere to plot 

900 fig (Matplotlib.figure): Figure of the ax 

901 

902 Returns 

903 None 

904 ''' 

905 strategies = self.strategies if strategies is None else strategies 

906 args = {} if args is None else args 

907 

908 # make sure we have something to plot in 

909 if ax is None: 

910 fig, ax = plt.subplots(1, 1) 

911 elif fig is None: 

912 store = False 

913 

914 # execute the plots for all strategies 

915 for s in strategies: 

916 self.plot_thingA_per_thingB( 

917 s, 

918 thingA=thingA, 

919 thingB=thingB, 

920 recovered=recovered, 

921 ax=ax, 

922 mask=mask, 

923 op=op, 

924 **(plotting_args if plotting_args else {}), 

925 ) 

926 

927 # set the parameters 

928 [plt.setp(ax, k, v) for k, v in args.items()] 

929 

930 if store: 

931 fig.tight_layout() 

932 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True) 

933 plt.close(fig) 

934 

935 return None 

936 

937 def plot_recovery_thresholds( 

938 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs 

939 ): # pragma: no cover 

940 ''' 

941 Plot the recovery rate for a range of thresholds 

942 

943 Args: 

944 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

945 thresh_range (list): thresholds for deciding whether to accept as recovered 

946 ax (Matplotlib.axes): Somewhere to plot 

947 

948 Returns: 

949 None 

950 ''' 

951 # fill default values if nothing is specified 

952 strategies = self.strategies if strategies is None else strategies 

953 

954 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range 

955 if ax is None: 

956 fig, ax = plt.subplots(1, 1) 

957 

958 rec_rates = [[None] * len(thresh_range)] * len(strategies) 

959 for strategy_idx in range(len(strategies)): 

960 strategy = strategies[strategy_idx] 

961 # load the stats 

962 fault_free = self.load(strategy=strategy, faults=False) 

963 with_faults = self.load(strategy=strategy, faults=True) 

964 

965 if recoverable_only: 

966 recoverable_mask = self.get_fixable_faults_only(strategy) 

967 else: 

968 recoverable_mask = self.get_mask() 

969 

970 for thresh_idx in range(len(thresh_range)): 

971 rec_mask = self.get_mask( 

972 strategy=strategy, 

973 key='error', 

974 val=(thresh_range[thresh_idx] * fault_free['error'].mean()), 

975 op='gt', 

976 old_mask=recoverable_mask, 

977 ) 

978 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len( 

979 with_faults['error'][recoverable_mask] 

980 ) 

981 

982 ax.plot( 

983 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs} 

984 ) 

985 ax.legend(frameon=False) 

986 ax.set_ylabel('recovery rate') 

987 ax.set_xlabel('threshold as ratio to fault-free error') 

988 

989 return None 

990 

991 def analyse_adaptivity(self, mask): # pragma: no cover 

992 ''' 

993 Analyse a set of runs with adaptivity 

994 

995 Args: 

996 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

997 

998 Returns: 

999 None 

1000 ''' 

1001 index = self.get_index(mask) 

1002 dat = self.load() 

1003 

1004 # make a header 

1005 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ') 

1006 print('-------+-----+------+------+----------+----------+----------+----------') 

1007 for i in index: 

1008 e_em, e_glob = self.analyse_adaptivity_single(int(i)) 

1009 print( 

1010 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\ 

1011 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1012 ) 

1013 

1014 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][ 

1015 Adaptivity 

1016 ]['e_tol'] 

1017 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!') 

1018 return None 

1019 

1020 def analyse_adaptivity_single(self, run): # pragma: no cover 

1021 ''' 

1022 Compute what the difference in embedded and global error are for a specific run with adaptivity 

1023 

1024 Args: 

1025 run (int): The run you want to know about 

1026 

1027 Returns: 

1028 list: Embedded error with fault and without for the last iteration in the step with a fault 

1029 list: Global error with and without fault at the end of the run 

1030 ''' 

1031 # perform one run with and one without faults 

1032 stats = [] 

1033 controllers = [] 

1034 for faults in [True, False]: 

1035 s, c, _ = self.single_run( 

1036 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1037 ) 

1038 stats += [s] 

1039 controllers += [c] 

1040 

1041 # figure out when the fault happened 

1042 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1043 

1044 # get embedded error 

1045 e_em = [ 

1046 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1047 for stat in stats 

1048 ] 

1049 

1050 # compute the global error 

1051 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1052 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1053 

1054 return [e_em[i][-1] for i in [0, 1]], e_glob 

1055 

1056 def analyse_HotRod(self, mask): # pragma: no cover 

1057 ''' 

1058 Analyse a set of runs with Hot Rod 

1059 

1060 Args: 

1061 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1062 

1063 Returns: 

1064 None 

1065 ''' 

1066 index = self.get_index(mask) 

1067 dat = self.load() 

1068 

1069 # make a header 

1070 print( 

1071 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \ 

1072| e_glob ' 

1073 ) 

1074 print( 

1075 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\ 

1076+----------' 

1077 ) 

1078 for i in index: 

1079 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i)) 

1080 print( 

1081 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\ 

1082 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \ 

1083{e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1084 ) 

1085 

1086 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][ 

1087 'HotRod_tol' 

1088 ] 

1089 print(f'We only restart when diff > tol = {tol:.2e}!') 

1090 return None 

1091 

1092 def analyse_HotRod_single(self, run): # pragma: no cover 

1093 ''' 

1094 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod 

1095 

1096 Args: 

1097 run (int): The run you want to know about 

1098 

1099 Returns: 

1100 list: Embedded error with fault and without for the last iteration in the step with a fault 

1101 list: Extrapolation error with fault and without for the last iteration in the step with a fault 

1102 list: Global error with and without fault at the end of the run 

1103 ''' 

1104 # perform one run with and one without faults 

1105 stats = [] 

1106 controllers = [] 

1107 for faults in [True, False]: 

1108 s, c, _ = self.single_run( 

1109 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1110 ) 

1111 stats += [s] 

1112 controllers += [c] 

1113 

1114 # figure out when the fault happened 

1115 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1116 

1117 # get embedded error 

1118 e_em = [ 

1119 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1120 for stat in stats 

1121 ] 

1122 # get extrapolated error 

1123 e_ex = [ 

1124 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')] 

1125 for stat in stats 

1126 ] 

1127 

1128 # compute the global error 

1129 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1130 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1131 

1132 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob 

1133 

1134 def print_faults(self, mask=None, strategy=None): # pragma: no cover 

1135 ''' 

1136 Print all faults that happened within a certain mask 

1137 

1138 Args: 

1139 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1140 strategy (Strategy): The resilience strategy you want to see the error for 

1141 

1142 Returns: 

1143 None 

1144 ''' 

1145 strategy = BaseStrategy() if strategy is None else strategy 

1146 

1147 index = self.get_index(mask) 

1148 dat = self.load(strategy=strategy) 

1149 

1150 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0])) 

1151 

1152 # make a header 

1153 print(' run | bit | node | iter | rank | rel err | space pos') 

1154 print('-------+-----+------+------+------+---------+-----------') 

1155 for i in index: 

1156 print( 

1157 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}' 

1158 ) 

1159 

1160 return None 

1161 

1162 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False): 

1163 ''' 

1164 Make a mask to apply to stored data to filter out certain things 

1165 

1166 Args: 

1167 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all 

1168 strategies so None is fine 

1169 key (str): The key in the stored statistics that you want to filter for some value 

1170 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key 

1171 op (str): Operation that is applied for filtering 

1172 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter 

1173 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs 

1174 

1175 Returns: 

1176 Numpy.ndarray with boolean entries that can be used as a mask 

1177 ''' 

1178 strategy = self.strategies[0] if strategy is None else strategy 

1179 dat = self.load(strategy=strategy, faults=True) 

1180 

1181 if compare_faults: 

1182 if val is not None: 

1183 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!') 

1184 else: 

1185 vals = self.load(strategy=strategy, faults=False)[key] 

1186 val = sum(vals) / len(vals) 

1187 

1188 if None in [key, val] and op not in ['isfinite']: 

1189 mask = dat['bit'] == dat['bit'] 

1190 else: 

1191 if op == 'uneq': 

1192 mask = dat[key] != val 

1193 elif op == 'eq': 

1194 mask = dat[key] == val 

1195 elif op == 'leq': 

1196 mask = dat[key] <= val 

1197 elif op == 'geq': 

1198 mask = dat[key] >= val 

1199 elif op == 'lt': 

1200 mask = dat[key] < val 

1201 elif op == 'gt': 

1202 mask = dat[key] > val 

1203 elif op == 'isfinite': 

1204 mask = np.isfinite(dat[key]) 

1205 else: 

1206 raise NotImplementedError(f'Please implement op={op}!') 

1207 

1208 if old_mask is not None: 

1209 return mask & old_mask 

1210 else: 

1211 return mask 

1212 

1213 def get_fixable_faults_only(self, strategy): 

1214 """ 

1215 Return a mask of only faults that can be fixed with a given strategy. 

1216 

1217 Args: 

1218 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1219 

1220 Returns: 

1221 Numpy.ndarray with boolean entries that can be used as a mask 

1222 """ 

1223 fixable = strategy.get_fixable_params( 

1224 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter'] 

1225 ) 

1226 mask = self.get_mask(strategy=strategy) 

1227 

1228 for kwargs in fixable: 

1229 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask) 

1230 

1231 return mask 

1232 

1233 def get_index(self, mask=None): 

1234 ''' 

1235 Get the indeces of all runs in mask 

1236 

1237 Args: 

1238 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1239 

1240 Returns: 

1241 Numpy.ndarray: Array of indeces 

1242 ''' 

1243 if mask is None: 

1244 dat = self.load() 

1245 return np.arange(len(dat['iteration'])) 

1246 else: 

1247 return np.arange(len(mask))[mask] 

1248 

1249 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover 

1250 ''' 

1251 Get information about how many data points for faults we have given a particular mask 

1252 

1253 Args: 

1254 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1255 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1256 strategies, so you don't need to supply this argument 

1257 print_all (bool): Whether to add information that is normally not useful to the table 

1258 ax (Matplotlib.axes): Somewhere to plot the combinations histogram 

1259 

1260 Returns: 

1261 None 

1262 ''' 

1263 

1264 # load some data from which to infer the number occurrences of some event 

1265 strategy = self.strategies[0] if strategy is None else strategy 

1266 dat = self.load(stratagy=strategy, faults=True) 

1267 

1268 # make a dummy mask in case none is supplied 

1269 if mask is None: 

1270 mask = np.ones_like(dat['error'], dtype=bool) 

1271 

1272 # print a header 

1273 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries') 

1274 print('-------------------------------------------------------------') 

1275 

1276 # make a list of all keys that you want to look at 

1277 keys = ['iteration', 'bit', 'node'] 

1278 if print_all: 

1279 keys += ['problem_pos', 'level', 'target'] 

1280 

1281 # print the table 

1282 for key in keys: 

1283 counts, dev, unique = self.count_occurrences(dat[key][mask]) 

1284 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}') 

1285 

1286 return None 

1287 

1288 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover 

1289 ''' 

1290 Make a histogram ouf of the occurrences of combinations 

1291 

1292 Args: 

1293 dat (dict): The data of the recorded statistics 

1294 keys (list): The keys in dat that you want to know the combinations of 

1295 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1296 

1297 Returns: 

1298 Matplotlib.axes: The plot 

1299 ''' 

1300 if ax is None: 

1301 fig, ax = plt.subplots(1, 1) 

1302 

1303 occurrences, bins = self.get_combination_histogram(dat, keys, mask) 

1304 

1305 ax.bar(bins[:-1], occurrences) 

1306 

1307 ax.set_xlabel('Occurrence of combinations') 

1308 

1309 return ax 

1310 

1311 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover 

1312 ''' 

1313 Check how many combinations of values we expect and how many we find to see if we need to do more experiments. 

1314 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied 

1315 

1316 Args: 

1317 dat (dict): The data of the recorded statistics 

1318 keys (list): The keys in dat that you want to know the combinations of 

1319 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1320 

1321 Returns: 

1322 Numpy.ndarray: Number of occurrences of combinations 

1323 Numpy.ndarray: Bins 

1324 ''' 

1325 

1326 # load default values 

1327 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat 

1328 keys = ['iteration', 'bit', 'node'] if keys is None else keys 

1329 if mask is None: 

1330 mask = np.ones_like(dat['error'], dtype=bool) 

1331 

1332 # get unique values and compute how many combinations you expect 

1333 num_unique = [len(np.unique(dat[key][mask])) for key in keys] 

1334 expected_number_of_combinations = np.prod(num_unique) 

1335 

1336 # test what you actually get 

1337 combination_counts = self.get_combination_counts(dat, keys, mask) 

1338 

1339 # make a histogram with the result 

1340 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1)) 

1341 occurrences[0] = expected_number_of_combinations - len(combination_counts) 

1342 

1343 return occurrences, bins 

1344 

1345 def get_max_combinations(self, dat=None, strategy=None): 

1346 ''' 

1347 Count how many combinations of parameters for faults are possible 

1348 

1349 Args: 

1350 dat (dict): The recorded statistics 

1351 keys (list): The keys in dat that you want to know the combinations of 

1352 strategy (Strategy): The resilience strategy 

1353 

1354 Returns: 

1355 int: Number of possible combinations 

1356 ''' 

1357 strategy = self.strategies[0] if strategy is None else strategy 

1358 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True) 

1359 faultHook = get_fault_injector_hook(controller) 

1360 ranges = [ 

1361 (0, faultHook.rnd_params['level_number']), 

1362 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1), 

1363 (1, faultHook.rnd_params['iteration'] + 1), 

1364 (0, faultHook.rnd_params['bit']), 

1365 (0, faultHook.rnd_params['rank']), 

1366 ] 

1367 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']] 

1368 return np.prod([me[1] - me[0] for me in ranges], dtype=int) 

1369 

1370 def get_combination_counts(self, dat, keys, mask): 

1371 ''' 

1372 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary 

1373 numbers of keys 

1374 

1375 Args: 

1376 dat (dict): The data of the recorded statistics 

1377 keys (list): The keys in dat that you want to know the combinations of 

1378 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1379 

1380 Returns: 

1381 list: Occurrences of all combinations 

1382 ''' 

1383 key = keys[0] 

1384 unique_vals = np.unique(dat[key][mask]) 

1385 res = [] 

1386 

1387 for i in range(len(unique_vals)): 

1388 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask) 

1389 if len(keys) > 1: 

1390 res += self.get_combination_counts(dat, keys[1:], inner_mask) 

1391 else: 

1392 res += [self.count_occurrences(dat[key][inner_mask])[0]] 

1393 return res 

1394 

1395 def count_occurrences(self, vals): 

1396 ''' 

1397 Count the occurrences of unique values in vals and compute average deviation from mean 

1398 

1399 Args: 

1400 vals (list): Values you want to check 

1401 

1402 Returns: 

1403 float: Mean of number of occurrences of unique values in vals 

1404 float: Average deviation from mean number of occurrences 

1405 int: Number of unique entries 

1406 ''' 

1407 unique_vals, counts = np.unique(vals, return_counts=True) 

1408 

1409 if len(counts) > 0: 

1410 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts) 

1411 else: 

1412 return None, None, 0 

1413 

1414 def bar_plot_thing( 

1415 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None 

1416 ): # pragma: no cover 

1417 ''' 

1418 Make a bar plot about something! 

1419 

1420 Args: 

1421 x (Numpy.ndarray of dimension 1): x values for bar plot 

1422 thing (str): Some key stored in the stats that will go on the y-axis 

1423 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting 

1424 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

1425 faults (bool): Whether to load stats with faults or without 

1426 name (str): Optional name for the plot 

1427 op (function): Operation that is applied to thing before plotting default is recovery rate 

1428 args (dict): Parameters for how the plot should look 

1429 

1430 Returns: 

1431 None 

1432 ''' 

1433 if ax is None: 

1434 fig, ax = plt.subplots(1, 1) 

1435 store = True 

1436 op = self.mean if op is None else op 

1437 

1438 # get the values for the bars 

1439 height = np.zeros(len(self.strategies)) 

1440 for strategy_idx in range(len(self.strategies)): 

1441 strategy = self.strategies[strategy_idx] 

1442 

1443 # load the values 

1444 dat = self.load(strategy=strategy, faults=faults) 

1445 no_faults = self.load(strategy=strategy, faults=False) 

1446 

1447 # check if we have a mask 

1448 if mask is None: 

1449 mask = np.ones_like(dat[thing], dtype=bool) 

1450 

1451 height[strategy_idx] = op(dat, no_faults, thing, mask) 

1452 

1453 # get some x values 

1454 x = np.arange(len(self.strategies)) if x is None else x 

1455 

1456 # prepare labels 

1457 ticks = [strategy.bar_plot_x_label for strategy in self.strategies] 

1458 

1459 ax.bar(x, height, tick_label=ticks) 

1460 

1461 # set the parameters 

1462 ax.set_ylabel(thing) 

1463 args = {} if args is None else args 

1464 [plt.setp(ax, k, v) for k, v in args.items()] 

1465 

1466 if store: 

1467 fig.tight_layout() 

1468 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True) 

1469 plt.close(fig) 

1470 

1471 return None 

1472 

1473 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover 

1474 func_args = locals() 

1475 func_args.pop('self', None) 

1476 if strategy is None: 

1477 for strat in self.strategies: 

1478 args = {**func_args, 'strategy': strat} 

1479 self.fault_frequency_plot(**args) 

1480 return None 

1481 

1482 # load data 

1483 all_data = {} 

1484 for me in kwargs_range['fault_frequency_iter']: 

1485 self.kwargs['fault_frequency_iter'] = me 

1486 self.get_recovered() 

1487 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular') 

1488 

1489 # get_recovery_rate 

1490 results = {} 

1491 results['frequencies'] = list(all_data.keys()) 

1492 results['recovery_rate'] = [ 

1493 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered']) 

1494 for key in all_data.keys() 

1495 ] 

1496 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()] 

1497 results['iterations'] = [ 

1498 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys() 

1499 ] 

1500 

1501 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style) 

1502 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'}) 

1503 

1504 ax.set_xscale('log') 

1505 ax.set_xlabel('iterations between fault') 

1506 ax.set_ylabel('recovery rate') 

1507 iter_ax.set_ylabel('average total iterations if not crashed (dashed)') 

1508 ax.legend(frameon=False) 

1509 

1510 return None 

1511 

1512 def get_HR_tol(self, verbose=False): 

1513 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

1514 

1515 HR_strategy = HotRodStrategy(useMPI=self.use_MPI) 

1516 

1517 description = HR_strategy.get_custom_description(self.prob, self.num_procs) 

1518 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2 

1519 

1520 stats, _, _ = self.single_run(HR_strategy, force_params=description) 

1521 

1522 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate') 

1523 diff = [] 

1524 for i in range(len(e_extrapolation)): 

1525 if e_extrapolation[i][1] is not None: 

1526 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0]) 

1527 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])] 

1528 

1529 max_diff = max(diff) 

1530 proposed_tol = 2 * max_diff 

1531 if verbose: 

1532 print( 

1533 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.' 

1534 ) 

1535 else: 

1536 print(f'{proposed_tol:.6e}') 

1537 

1538 

1539def check_local_error(): # pragma: no cover 

1540 """ 

1541 Make a plot of the resolution over time for all problems 

1542 """ 

1543 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1544 problems = [run_quench] 

1545 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()] 

1546 

1547 for i in range(len(problems)): 

1548 stats_analyser = FaultStats( 

1549 prob=problems[i], 

1550 strategies=strategies, 

1551 faults=[False], 

1552 reload=True, 

1553 recovery_thresh=1.1, 

1554 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1555 num_procs=1, 

1556 mode='random', 

1557 ) 

1558 stats_analyser.compare_strategies() 

1559 plt.show() 

1560 

1561 

1562def parse_args(): 

1563 import sys 

1564 

1565 kwargs = {} 

1566 

1567 for i in range(len(sys.argv)): 

1568 if 'prob' in sys.argv[i]: 

1569 if sys.argv[i + 1] == 'run_Lorenz': 

1570 kwargs['prob'] = run_Lorenz 

1571 elif sys.argv[i + 1] == 'run_vdp': 

1572 kwargs['prob'] = run_vdp 

1573 elif sys.argv[i + 1] == 'run_Schroedinger': 

1574 kwargs['prob'] = run_Schroedinger 

1575 elif sys.argv[i + 1] == 'run_quench': 

1576 kwargs['prob'] = run_quench 

1577 elif sys.argv[i + 1] == 'run_AC': 

1578 kwargs['prob'] = run_AC 

1579 elif sys.argv[i + 1] == 'run_RBC': 

1580 kwargs['prob'] = run_RBC 

1581 else: 

1582 raise NotImplementedError 

1583 elif 'num_procs' in sys.argv[i]: 

1584 kwargs['num_procs'] = int(sys.argv[i + 1]) 

1585 elif 'runs' in sys.argv[i]: 

1586 kwargs['runs'] = int(sys.argv[i + 1]) 

1587 elif 'mode' in sys.argv[i]: 

1588 kwargs['mode'] = str(sys.argv[i + 1]) 

1589 elif 'reload' in sys.argv[i]: 

1590 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True 

1591 

1592 return kwargs 

1593 

1594 

1595def compare_adaptivity_modes(): 

1596 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep 

1597 

1598 fig, axs = plt.subplots(2, 2, sharey=True) 

1599 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1600 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench'] 

1601 

1602 for i in range(len(problems)): 

1603 ax = axs.flatten()[i] 

1604 

1605 ax.set_title(titles[i]) 

1606 

1607 stats_analyser = FaultStats( 

1608 prob=problems[i], 

1609 strategies=[AdaptivityStrategy()], 

1610 faults=[False, True], 

1611 reload=True, 

1612 recovery_thresh=1.1, 

1613 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1614 num_procs=1, 

1615 mode='default', 

1616 stats_path='data/stats-jusuf', 

1617 ) 

1618 stats_analyser.get_recovered() 

1619 

1620 for strategy in stats_analyser.strategies: 

1621 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1622 stats_analyser.plot_things_per_things( 

1623 'recovered', 

1624 'bit', 

1625 False, 

1626 strategies=[strategy], 

1627 op=stats_analyser.rec_rate, 

1628 mask=fixable, 

1629 args={'ylabel': 'recovery rate', 'label': 'bla'}, 

1630 name='fixable_recovery', 

1631 ax=ax, 

1632 ) 

1633 single_proc = ax.lines[-1] 

1634 single_proc.set_label('single process') 

1635 single_proc.set_color('black') 

1636 

1637 stats_analyser = FaultStats( 

1638 prob=problems[i], 

1639 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()], 

1640 faults=[False, True], 

1641 reload=True, 

1642 recovery_thresh=1.5, 

1643 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1644 num_procs=4, 

1645 mode='default', 

1646 stats_path='data/stats-jusuf', 

1647 ) 

1648 stats_analyser.get_recovered() 

1649 

1650 for strategy in stats_analyser.strategies: 

1651 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1652 stats_analyser.plot_things_per_things( 

1653 'recovered', 

1654 'bit', 

1655 False, 

1656 strategies=[strategy], 

1657 op=stats_analyser.rec_rate, 

1658 mask=fixable, 

1659 args={'ylabel': 'recovery rate'}, 

1660 name='fixable_recovery', 

1661 ax=ax, 

1662 ) 

1663 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks') 

1664 fig.tight_layout() 

1665 plt.show() 

1666 

1667 

1668def main(): 

1669 kwargs = { 

1670 'prob': run_RBC, 

1671 'num_procs': 1, 

1672 'mode': 'default', 

1673 'runs': 2000, 

1674 'reload': False, 

1675 **parse_args(), 

1676 } 

1677 

1678 strategy_args = { 

1679 'stop_at_nan': True, 

1680 } 

1681 

1682 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy 

1683 

1684 stats_analyser = FaultStats( 

1685 strategies=[ 

1686 BaseStrategy(**strategy_args), 

1687 AdaptivityStrategy(**strategy_args), 

1688 kAdaptivityStrategy(**strategy_args), 

1689 HotRodStrategy(**strategy_args), 

1690 AdaptivityPolynomialError(**strategy_args), 

1691 ], 

1692 faults=[False, True], 

1693 recovery_thresh=1.15, 

1694 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0), 

1695 stats_path='data/stats-jusuf', 

1696 **kwargs, 

1697 ) 

1698 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=25) 

1699 

1700 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data 

1701 return None 

1702 

1703 stats_analyser.get_recovered() 

1704 mask = None 

1705 

1706 # stats_analyser.compare_strategies() 

1707 stats_analyser.plot_things_per_things( 

1708 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1709 ) 

1710 stats_analyser.plot_things_per_things( 

1711 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1712 ) 

1713 

1714 stats_analyser.plot_things_per_things( 

1715 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1716 ) 

1717 

1718 # make a plot for only the faults that can be recovered 

1719 fig, ax = plt.subplots(1, 1) 

1720 for strategy in stats_analyser.strategies: 

1721 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1722 stats_analyser.plot_things_per_things( 

1723 'recovered', 

1724 'bit', 

1725 False, 

1726 strategies=[strategy], 

1727 op=stats_analyser.rec_rate, 

1728 mask=fixable, 

1729 args={'ylabel': 'recovery rate'}, 

1730 name='fixable_recovery', 

1731 ax=ax, 

1732 ) 

1733 fig.tight_layout() 

1734 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True) 

1735 

1736 fig, ax = plt.subplots(1, 1, figsize=(13, 4)) 

1737 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000)) 

1738 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold') 

1739 ax.set_xscale('log') 

1740 ax.legend(frameon=False) 

1741 fig.tight_layout() 

1742 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True) 

1743 

1744 stats_analyser.plot_things_per_things( 

1745 'total_iteration', 

1746 'bit', 

1747 True, 

1748 op=stats_analyser.mean, 

1749 mask=mask, 

1750 args={'yscale': 'log', 'ylabel': 'total iterations'}, 

1751 ) 

1752 stats_analyser.plot_things_per_things( 

1753 'total_iteration', 

1754 'bit', 

1755 True, 

1756 op=stats_analyser.extra_mean, 

1757 mask=mask, 

1758 args={'yscale': 'linear', 'ylabel': 'extra iterations'}, 

1759 name='extra_iter', 

1760 ) 

1761 stats_analyser.plot_things_per_things( 

1762 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'} 

1763 ) 

1764 

1765 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100)) 

1766 plt.show() 

1767 

1768 

1769if __name__ == "__main__": 

1770 main() 

1771 # compare_adaptivity_modes() 

1772 # check_local_error()