Coverage for pySDC/projects/Resilience/fault_stats.py: 59%

437 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-29 09:02 +0000

1import numpy as np 

2import matplotlib.pyplot as plt 

3from mpi4py import MPI 

4import pickle 

5 

6import pySDC.helpers.plot_helper as plot_helper 

7from pySDC.helpers.stats_helper import get_sorted 

8 

9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData 

10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook 

11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity 

13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep 

14from pySDC.implementations.hooks.log_work import LogWork 

15 

16# these problems are available for testing 

17from pySDC.projects.Resilience.advection import run_advection 

18from pySDC.projects.Resilience.vdp import run_vdp 

19from pySDC.projects.Resilience.piline import run_piline 

20from pySDC.projects.Resilience.Lorenz import run_Lorenz 

21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger 

22from pySDC.projects.Resilience.quench import run_quench 

23from pySDC.projects.Resilience.AC import run_AC 

24 

25from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy 

26import logging 

27 

28plot_helper.setup_mpl(reset=True) 

29 

30LOGGER_LEVEL = 40 

31 

32RECOVERY_THRESH_ABS = { 

33 # run_quench: 5e-3, 

34 # run_Schroedinger: 1.7e-6, 

35} 

36 

37 

38class FaultStats: 

39 ''' 

40 Class to generate and analyse fault statistics 

41 ''' 

42 

43 def __init__( 

44 self, 

45 prob=None, 

46 strategies=None, 

47 faults=None, 

48 reload=True, 

49 recovery_thresh=1 + 1e-3, 

50 recovery_thresh_abs=0.0, 

51 num_procs=1, 

52 mode='default', 

53 stats_path='data/stats', 

54 use_MPI=False, 

55 **kwargs, 

56 ): 

57 ''' 

58 Initialization routine 

59 

60 Args: 

61 prob: A function that runs a pySDC problem, see imports for available problems 

62 strategies (list): List of resilience strategies 

63 faults (list): List of booleans that describe whether to use faults or not 

64 reload (bool): Load previously computed statistics and continue from there or start from scratch 

65 recovery_thresh (float): Relative threshold for recovery 

66 num_procs (int): Number of processes 

67 mode (str): Mode for fault generation: Either 'random' or 'combination' 

68 ''' 

69 self.prob = prob 

70 self.strategies = [None] if strategies is None else strategies 

71 self.faults = [False, True] if faults is None else faults 

72 self.reload = reload 

73 self.recovery_thresh = recovery_thresh 

74 self.recovery_thresh_abs = recovery_thresh_abs 

75 self.num_procs = num_procs 

76 self.use_MPI = use_MPI 

77 self.stats_path = stats_path 

78 self.kwargs = { 

79 'fault_frequency_iter': 500, 

80 **kwargs, 

81 } 

82 

83 # decide mode 

84 if mode == 'default': 

85 if prob.__name__ in ['run_vdp', 'run_Lorenz']: 

86 self.mode = 'combination' 

87 else: 

88 self.mode = 'random' 

89 else: 

90 self.mode = mode 

91 

92 self.logger = logging.getLogger('FaultStats') 

93 self.logger.level = LOGGER_LEVEL 

94 

95 msg = 'Starting FaultStats with attributes' 

96 for key, val in self.__dict__.items(): 

97 if key in ['logger']: 

98 continue 

99 item = [str(me) for me in val] if type(val) == list else str(val) 

100 msg += '\n\t' f'{key}: {item}' 

101 self.logger.log(25, msg) 

102 

103 def get_Tend(self): 

104 ''' 

105 Get the final time of runs for fault stats based on the problem 

106 

107 Returns: 

108 float: Tend to put into the run 

109 ''' 

110 return self.strategies[0].get_Tend(self.prob, self.num_procs) 

111 

112 def run_stats_generation( 

113 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0 

114 ): 

115 ''' 

116 Run the generation of stats for all strategies in the `self.strategies` variable 

117 

118 Args: 

119 runs (int): Number of runs you want to do 

120 step (int): Number of runs you want to do between saving 

121 comm (MPI.Communicator): Communicator for distributing runs 

122 faults (bool): Whether to do stats with faults or without 

123 kw_args_range (dict): Range for the parameters 

124 _reload, _runs_partial: Variables only used for recursion. Do not change! 

125 

126 Returns: 

127 None 

128 ''' 

129 if faults is None: 

130 for f in self.faults: 

131 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f) 

132 return None 

133 

134 for key, val in kwargs_range.items() if kwargs_range is not None else {}: 

135 if type(val) == int: 

136 self.kwargs[key] = val 

137 else: 

138 for me in val: 

139 kwargs_range_me = {**kwargs_range, key: me} 

140 self.run_stats_generation( 

141 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults 

142 ) 

143 return None 

144 

145 comm = MPI.COMM_WORLD if comm is None else comm 

146 step = (runs if comm.size == 1 else comm.size) if step is None else step 

147 _runs_partial = step if _runs_partial == 0 else _runs_partial 

148 reload = self.reload or _reload 

149 

150 # see if we limit the number of runs we want to do 

151 max_runs = ( 

152 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5) 

153 ) 

154 

155 if reload: 

156 # sort the strategies to do some load balancing 

157 sorting_index = None 

158 if comm.rank == 0: 

159 already_completed = np.array( 

160 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies] 

161 ) 

162 sorting_index_ = np.argsort(already_completed) 

163 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs] 

164 _runs_partial = max(min(already_completed), _runs_partial) 

165 

166 # tell all ranks what strategies to use 

167 sorting_index = comm.bcast(sorting_index, root=0) 

168 _runs_partial = comm.bcast(_runs_partial, root=0) 

169 strategies = [self.strategies[i] for i in sorting_index] 

170 if len(strategies) == 0: # check if we are already done 

171 return None 

172 else: 

173 strategies = self.strategies 

174 

175 strategy_comm = comm.Split(comm.rank % len(strategies)) 

176 

177 for j in range(0, len(strategies), comm.size): 

178 self.generate_stats( 

179 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))], 

180 runs=min(_runs_partial, max_runs), 

181 faults=faults, 

182 reload=reload, 

183 comm=strategy_comm, 

184 ) 

185 strategy_comm.Free() 

186 self.run_stats_generation( 

187 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step 

188 ) 

189 

190 return None 

191 

192 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None): 

193 ''' 

194 Generate statistics for recovery from bit flips 

195 ----------------------------------------------- 

196 

197 Every run is given a different random seed such that we have different faults and the results are then stored 

198 

199 Args: 

200 strategy (Strategy): Resilience strategy 

201 runs (int): Number of runs you want to do 

202 reload (bool): Load previously computed statistics and continue from there or start from scratch 

203 faults (bool): Whether to do stats with faults or without 

204 comm (MPI.Communicator): Communicator for distributing runs 

205 

206 Returns: 

207 None 

208 ''' 

209 comm = MPI.COMM_WORLD if comm is None else comm 

210 

211 # initialize dictionary to store the stats in 

212 dat = { 

213 'level': np.zeros(runs), 

214 'iteration': np.zeros(runs), 

215 'node': np.zeros(runs), 

216 'problem_pos': [], 

217 'bit': np.zeros(runs), 

218 'error': np.zeros(runs), 

219 'total_iteration': np.zeros(runs), 

220 'total_newton_iteration': np.zeros(runs), 

221 'restarts': np.zeros(runs), 

222 'target': np.zeros(runs), 

223 'rank': np.zeros(runs), 

224 } 

225 

226 # store arguments for storing and loading 

227 identifier_args = { 

228 'faults': faults, 

229 'strategy': strategy, 

230 } 

231 

232 # reload previously recorded stats and write them to dat 

233 if reload: 

234 already_completed_ = None 

235 if comm.rank == 0: 

236 already_completed_ = self.load(**identifier_args) 

237 already_completed = comm.bcast(already_completed_, root=0) 

238 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0: 

239 avail_len = min([already_completed['runs'], runs]) 

240 for k in dat.keys(): 

241 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len) 

242 else: 

243 already_completed = {'runs': 0} 

244 

245 # prepare a message 

246 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0) 

247 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}' 

248 if comm.rank == 0 and already_completed['runs'] < runs: 

249 print(msg, flush=True) 

250 

251 space_comm = MPI.COMM_SELF.Split(True) 

252 

253 # perform the remaining experiments 

254 for i in range(already_completed['runs'], runs): 

255 if i % comm.size != comm.rank: 

256 continue 

257 

258 # perform a single experiment with the correct random seed 

259 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm) 

260 

261 # get the data from the stats 

262 faults_run = get_sorted(stats, type='bitflip') 

263 

264 if faults: 

265 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!" 

266 dat['level'][i] = faults_run[0][1][0] 

267 dat['iteration'][i] = faults_run[0][1][1] 

268 dat['node'][i] = faults_run[0][1][2] 

269 dat['problem_pos'] += [faults_run[0][1][3]] 

270 dat['bit'][i] = faults_run[0][1][4] 

271 dat['target'][i] = faults_run[0][1][5] 

272 dat['rank'][i] = faults_run[0][1][6] 

273 if crash: 

274 print('Code crashed!') 

275 continue 

276 

277 # record the rest of the data 

278 t, u = get_sorted(stats, type='u', recomputed=False)[-1] 

279 error = self.get_error(u, t, controller, strategy, self.get_Tend()) 

280 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')]) 

281 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')]) 

282 

283 dat['error'][i] = error 

284 dat['total_iteration'][i] = total_iteration 

285 dat['total_newton_iteration'][i] = total_newton_iteration 

286 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')]) 

287 

288 # free the space communicator 

289 space_comm.Free() 

290 

291 dat_full = {} 

292 for k in dat.keys(): 

293 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM) 

294 

295 # store the completed stats 

296 dat_full['runs'] = runs 

297 

298 if already_completed['runs'] < runs: 

299 if comm.rank == 0: 

300 self.store(dat_full, **identifier_args) 

301 if self.faults: 

302 self.get_recovered(strategy=strategy) 

303 

304 return None 

305 

306 def get_error(self, u, t, controller, strategy, Tend): 

307 """ 

308 Compute the error. 

309 

310 Args: 

311 u (dtype_u): The solution at the end of the run 

312 t (float): Time at which `u` was recorded 

313 controller (pySDC.controller.controller): The controller 

314 strategy (Strategy): The resilience strategy 

315 Tend (float): Time you want to run to 

316 

317 Returns: 

318 float: Error 

319 """ 

320 lvl = controller.MS[0].levels[0] 

321 

322 # decide if we reached Tend 

323 Tend_reached = t + lvl.params.dt_initial >= Tend 

324 

325 if Tend_reached: 

326 u_star = lvl.prob.u_exact(t=t) 

327 return abs(u - u_star) / abs(u_star) 

328 else: 

329 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}') 

330 return np.inf 

331 

332 def single_run( 

333 self, 

334 strategy, 

335 run=0, 

336 faults=False, 

337 force_params=None, 

338 hook_class=None, 

339 space_comm=None, 

340 Tend=None, 

341 time_comm=None, 

342 ): 

343 ''' 

344 Run the problem once with the specified parameters 

345 

346 Args: 

347 strategy (Strategy): The resilience strategy you plan on using 

348 run (int): Index for fault generation 

349 faults (bool): Whether or not to put faults in 

350 force_params (dict): Change parameters in the description of the problem 

351 space_comm (MPI.Communicator): A communicator for space parallelisation 

352 Tend (float): Time you want to run to 

353 time_comm (MPI.Intracomm.Communicator): Communicator in time 

354 

355 Returns: 

356 dict: Stats object containing statistics for each step, each level and each iteration 

357 pySDC.Controller: The controller of the run 

358 float: The time the problem should have run to 

359 ''' 

360 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class) 

361 force_params = {} if force_params is None else force_params 

362 

363 # build the custom description 

364 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs) 

365 for k in force_params.keys(): 

366 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]} 

367 

368 custom_controller_params = { 

369 'logger_level': LOGGER_LEVEL, 

370 **force_params.get('controller_params', {}), 

371 } 

372 

373 if faults: 

374 fault_stuff = { 

375 'rng': None, 

376 'args': strategy.get_fault_args(self.prob, self.num_procs), 

377 'rnd_params': strategy.get_random_params(self.prob, self.num_procs), 

378 } 

379 

380 # make parameters for faults: 

381 if self.mode == 'random': 

382 fault_stuff['rng'] = np.random.RandomState(run) 

383 elif self.mode == 'combination': 

384 fault_stuff['rng'] = run 

385 elif self.mode == 'regular': 

386 fault_stuff['rng'] = np.random.RandomState(run) 

387 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter'] 

388 fault_stuff['rnd_params'] = { 

389 'bit': 12, 

390 'min_node': 1, 

391 } 

392 else: 

393 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}') 

394 

395 else: 

396 fault_stuff = None 

397 

398 return self.prob( 

399 custom_description=custom_description, 

400 num_procs=self.num_procs, 

401 hook_class=hook_class, 

402 fault_stuff=fault_stuff, 

403 Tend=self.get_Tend() if Tend is None else Tend, 

404 custom_controller_params=custom_controller_params, 

405 space_comm=space_comm, 

406 comm=time_comm, 

407 use_MPI=time_comm is not None, 

408 ) 

409 

410 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover 

411 ''' 

412 Take a closer look at how the strategies compare for a specific run 

413 

414 Args: 

415 run (int): The number of the run to get the appropriate random generator 

416 faults (bool): Whether or not to include faults 

417 ax (Matplotlib.axes): Somewhere to plot 

418 

419 Returns: 

420 None 

421 ''' 

422 if ax is None: 

423 fig, ax = plt.subplots(1, 1) 

424 store = True 

425 else: 

426 store = False 

427 

428 k_ax = ax.twinx() 

429 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies] 

430 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))] 

431 

432 # make a legend 

433 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies] 

434 k_ax.legend(frameon=True) 

435 

436 if store: 

437 fig.tight_layout() 

438 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True) 

439 

440 def scrutinize_visual( 

441 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False 

442 ): # pragma: no cover 

443 ''' 

444 Take a closer look at a specific run with a plot 

445 

446 Args: 

447 strategy (Strategy): The resilience strategy you plan on using 

448 run (int): The number of the run to get the appropriate random generator 

449 faults (bool): Whether or not to include faults 

450 ax (Matplotlib.axes): Somewhere to plot the error 

451 k_ax (Matplotlib.axes): Somewhere to plot the iterations 

452 plot_restarts (bool): Make vertical lines wherever restarts happened 

453 

454 Returns: 

455 dict: Stats from the run 

456 ''' 

457 if ax is None: 

458 fig, ax = plt.subplots(1, 1) 

459 store = True 

460 else: 

461 store = False 

462 

463 force_params = {} 

464 

465 stats, controller, Tend = self.single_run( 

466 strategy=strategy, 

467 run=run, 

468 faults=faults, 

469 force_params=force_params, 

470 hook_class=hook_collection + [LogLocalErrorPostStep, LogData], 

471 ) 

472 

473 # plot the local error 

474 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False) 

475 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls) 

476 

477 # plot the iterations 

478 k_ax = ax.twinx() if k_ax is None else k_ax 

479 k = get_sorted(stats, type='k') 

480 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--') 

481 

482 # plot the faults 

483 faults = get_sorted(stats, type='bitflip') 

484 for fault_time in [me[0] for me in faults]: 

485 ax.axvline(fault_time, color='grey', ls=':') 

486 

487 # plot restarts 

488 if plot_restarts: 

489 restarts = get_sorted(stats, type='restart') 

490 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts] 

491 

492 # decorate 

493 ax.set_yscale('log') 

494 ax.set_ylabel(r'$\epsilon$') 

495 k_ax.set_ylabel('cumulative iterations (dashed)') 

496 ax.set_xlabel(r'$t$') 

497 

498 if store: 

499 fig.tight_layout() 

500 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True) 

501 

502 def scrutinize(self, strategy, logger_level=15, **kwargs): 

503 ''' 

504 Take a closer look at a specific run 

505 

506 Args: 

507 strategy (Strategy): The resilience strategy you plan on using 

508 

509 Returns: 

510 None 

511 ''' 

512 from pySDC.projects.Resilience.fault_injection import FaultInjector 

513 

514 force_params = {} 

515 force_params['controller_params'] = {'logger_level': logger_level} 

516 force_params['sweeper_params'] = {'skip_residual_computation': ()} 

517 

518 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs) 

519 

520 u_all = get_sorted(stats, type='u', recomputed=False) 

521 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])] 

522 k = [me[1] for me in get_sorted(stats, type='k')] 

523 

524 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector] 

525 

526 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else [] 

527 

528 print(f'\nOverview for {strategy.name} strategy') 

529 # print faults 

530 faults = get_sorted(stats, type='bitflip') 

531 print('\nfaults:') 

532 print(' t | level | iter | node | bit | trgt | rank | pos') 

533 print('--------+-------+------+------+-----+------+---------------------') 

534 for f in faults: 

535 print( 

536 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |', 

537 f[1][3], 

538 ) 

539 print('--------+-------+------+------+-----+------+---------------------') 

540 for f in unhappened_faults: 

541 print( 

542 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |', 

543 f.problem_pos, 

544 ) 

545 

546 # see if we can determine if the faults where recovered 

547 no_faults = self.load(strategy=strategy, faults=False) 

548 e_star = np.mean(no_faults.get('error', [0])) 

549 error = self.get_error(u, t, controller, strategy, Tend) 

550 recovery_thresh = self.get_thresh(strategy) 

551 

552 print( 

553 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}' 

554 ) 

555 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},') 

556 

557 _newton_iter = get_sorted(stats, type='work_newton') 

558 if len(_newton_iter) > 0: 

559 newton_iter = [me[1] for me in _newton_iter] 

560 print( 

561 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},' 

562 ) 

563 

564 # checkout the step size 

565 dt = [me[1] for me in get_sorted(stats, type='dt')] 

566 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}') 

567 

568 # restarts 

569 restarts = [me[1] for me in get_sorted(stats, type='restart')] 

570 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}') 

571 

572 return stats 

573 

574 def convert_faults(self, faults): 

575 ''' 

576 Make arrays of useable data from an entry in the stats object returned by pySDC 

577 

578 Args: 

579 faults (list): The entry for faults returned by the pySDC run 

580 

581 Returns: 

582 list: The times when the faults happened 

583 list: The levels in which the faults happened 

584 list: The iterations in which the faults happened 

585 list: The nodes in which the faults happened 

586 list: The problem positions in which the faults happened 

587 list: The bits in which the faults happened 

588 ''' 

589 time = [faults[i][0] for i in range(len(faults))] 

590 level = [faults[i][1][0] for i in range(len(faults))] 

591 iteration = [faults[i][1][1] for i in range(len(faults))] 

592 node = [faults[i][1][2] for i in range(len(faults))] 

593 problem_pos = [faults[i][1][3] for i in range(len(faults))] 

594 bit = [faults[i][1][4] for i in range(len(faults))] 

595 return time, level, iteration, node, problem_pos, bit 

596 

597 def get_path(self, **kwargs): 

598 ''' 

599 Get the path to where the stats are stored 

600 

601 Args: 

602 strategy (Strategy): The resilience strategy 

603 faults (bool): Whether or not faults have been activated 

604 

605 Returns: 

606 str: The path to what you are looking for 

607 ''' 

608 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle' 

609 

610 def get_name(self, strategy=None, faults=True, mode=None): 

611 ''' 

612 Function to get a unique name for a kind of statistics based on the problem and strategy that was used 

613 

614 Args: 

615 strategy (Strategy): Resilience strategy 

616 faults (bool): Whether or not faults where inserted 

617 

618 Returns: 

619 str: The unique identifier 

620 ''' 

621 if self.prob == run_advection: 

622 prob_name = 'advection' 

623 elif self.prob == run_vdp: 

624 prob_name = 'vdp' 

625 elif self.prob == run_piline: 

626 prob_name = 'piline' 

627 elif self.prob == run_Lorenz: 

628 prob_name = 'Lorenz' 

629 elif self.prob == run_Schroedinger: 

630 prob_name = 'Schroedinger' 

631 elif self.prob == run_quench: 

632 prob_name = 'Quench' 

633 elif self.prob.__name__ == 'run_AC': 

634 prob_name = 'Allen-Cahn' 

635 else: 

636 raise NotImplementedError(f'Name not implemented for problem {self.prob}') 

637 

638 if faults: 

639 fault_name = '-faults' 

640 else: 

641 fault_name = '' 

642 

643 if strategy is not None: 

644 strategy_name = f'-{strategy.name}' 

645 else: 

646 strategy_name = '' 

647 

648 mode = self.mode if mode is None else mode 

649 if mode == 'regular': 

650 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}' 

651 else: 

652 mode_thing = '' 

653 

654 mpi_thing = '-MPI' if self.use_MPI else '' 

655 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}' 

656 

657 def store(self, dat, **kwargs): 

658 ''' 

659 Stores the data for a run at a predefined path 

660 

661 Args: 

662 dat (dict): The data of the recorded statistics 

663 

664 Returns: 

665 None 

666 ''' 

667 with open(self.get_path(**kwargs), 'wb') as f: 

668 pickle.dump(dat, f) 

669 return None 

670 

671 def load(self, **kwargs): 

672 ''' 

673 Loads the stats belonging to a specific strategy and whether or not faults where inserted. 

674 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs, 

675 which is 0 of course. 

676 

677 Returns: 

678 dict: Data from previous run or if it is not available a placeholder dictionary 

679 ''' 

680 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)]) 

681 

682 try: 

683 with open(self.get_path(**kwargs), 'rb') as f: 

684 dat = pickle.load(f) 

685 except FileNotFoundError: 

686 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!') 

687 return {'runs': 0} 

688 return dat 

689 

690 def get_thresh(self, strategy=None): 

691 """ 

692 Get recovery threshold based on relative and absolute tolerances 

693 

694 Args: 

695 strategy (Strategy): The resilience strategy 

696 """ 

697 fault_free = self.load(strategy=strategy, faults=False) 

698 assert ( 

699 fault_free['error'].std() / fault_free['error'].mean() < 1e-5 

700 ), f'Got too large variation of errors in {strategy} strategy!' 

701 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()]) 

702 

703 def get_recovered(self, **kwargs): 

704 ''' 

705 Determine the recovery rate for a specific strategy and store it to disk. 

706 

707 Returns: 

708 None 

709 ''' 

710 if 'strategy' not in kwargs.keys(): 

711 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies] 

712 else: 

713 try: 

714 with_faults = self.load(faults=True, **kwargs) 

715 with_faults['recovered'] = with_faults['error'] < self.get_thresh(kwargs['strategy']) 

716 self.store(faults=True, dat=with_faults, **kwargs) 

717 except KeyError as error: 

718 print( 

719 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}' 

720 ) 

721 

722 return None 

723 

724 def crash_rate(self, dat, no_faults, thingA, mask): 

725 ''' 

726 Determine the rate of runs that crashed 

727 

728 Args: 

729 dat (dict): The data of the recorded statistics with faults 

730 no_faults (dict): The data of the corresponding fault-free stats 

731 thingA (str): Some key stored in the stats that will go on the y-axis 

732 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate 

733 

734 Returns: 

735 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask 

736 ''' 

737 if len(dat[thingA][mask]) > 0: 

738 crash = dat['error'] == np.inf 

739 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask]) 

740 else: 

741 return None 

742 

743 def rec_rate(self, dat, no_faults, thingA, mask): 

744 ''' 

745 Operation for plotting which returns the recovery rate for a given mask. 

746 Which thingA you apply this to actually does not matter here since we compute a rate. 

747 

748 Args: 

749 dat (dict): The recorded statistics 

750 no_faults (dict): The corresponding fault-free stats 

751 thingA (str): Some key stored in the stats 

752 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

753 

754 Returns: 

755 float: Recovery rate 

756 ''' 

757 if len(dat[thingA][mask]) > 0: 

758 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask]) 

759 else: 

760 return None 

761 

762 def mean(self, dat, no_faults, thingA, mask): 

763 ''' 

764 Operation for plotting which returns the mean of thingA after applying the mask 

765 

766 Args: 

767 dat (dict): The recorded statistics 

768 no_faults (dict): The corresponding fault-free stats 

769 thingA (str): Some key stored in the stats 

770 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

771 

772 Returns: 

773 float: Mean of thingA after applying mask 

774 ''' 

775 return np.mean(dat[thingA][mask]) 

776 

777 def extra_mean(self, dat, no_faults, thingA, mask): 

778 ''' 

779 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after 

780 applying the mask 

781 

782 Args: 

783 dat (dict): The recorded statistics 

784 no_faults (dict): The corresponding fault-free stats 

785 thingA (str): Some key stored in the stats 

786 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering 

787 

788 Returns: 

789 float: Difference in mean of thingA between runs with and without faults after applying mask 

790 ''' 

791 if True in mask or int in [type(me) for me in mask]: 

792 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA]) 

793 else: 

794 return None 

795 

796 def plot_thingA_per_thingB( 

797 self, 

798 strategy, 

799 thingA, 

800 thingB, 

801 ax=None, 

802 mask=None, 

803 recovered=False, 

804 op=None, 

805 **kwargs, 

806 ): # pragma: no cover 

807 ''' 

808 Plot thingA vs. thingB for a single strategy 

809 

810 Args: 

811 strategy (Strategy): The resilience strategy you want to plot 

812 thingA (str): Some key stored in the stats that will go on the y-axis 

813 thingB (str): Some key stored in the stats that will go on the x-axis 

814 ax (Matplotlib.axes): Somewhere to plot 

815 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

816 recovered (bool): Show the plot for both all runs and only the recovered ones 

817 op (function): Operation that is applied to thingA before plotting default is recovery rate 

818 

819 Returns: 

820 None 

821 ''' 

822 op = self.rec_rate if op is None else op 

823 dat = self.load(strategy=strategy, faults=True) 

824 no_faults = self.load(strategy=strategy, faults=False) 

825 

826 if mask is None: 

827 mask = np.ones_like(dat[thingB], dtype=bool) 

828 

829 admissable_thingB = np.unique(dat[thingB][mask]) 

830 me = np.zeros(len(admissable_thingB)) 

831 me_recovered = np.zeros_like(me) 

832 

833 for i in range(len(me)): 

834 _mask = (dat[thingB] == admissable_thingB[i]) & mask 

835 if _mask.any(): 

836 me[i] = op(dat, no_faults, thingA, _mask) 

837 me_recovered[i] = op(dat, no_faults, thingA, _mask & dat['recovered']) 

838 

839 if recovered: 

840 ax.plot( 

841 admissable_thingB, 

842 me_recovered, 

843 label=f'{strategy.label} (only recovered)', 

844 color=strategy.color, 

845 marker=strategy.marker, 

846 ls='--', 

847 linewidth=3, 

848 **kwargs, 

849 ) 

850 

851 ax.plot( 

852 admissable_thingB, 

853 me, 

854 label=f'{strategy.label}', 

855 color=strategy.color, 

856 marker=strategy.marker, 

857 linewidth=2, 

858 **kwargs, 

859 ) 

860 

861 ax.legend(frameon=False) 

862 ax.set_xlabel(thingB) 

863 ax.set_ylabel(thingA) 

864 return None 

865 

866 def plot_things_per_things( 

867 self, 

868 thingA='bit', 

869 thingB='bit', 

870 recovered=False, 

871 mask=None, 

872 op=None, 

873 args=None, 

874 strategies=None, 

875 name=None, 

876 store=True, 

877 ax=None, 

878 fig=None, 

879 plotting_args=None, 

880 ): # pragma: no cover 

881 ''' 

882 Plot thingA vs thingB for multiple strategies 

883 

884 Args: 

885 thingA (str): Some key stored in the stats that will go on the y-axis 

886 thingB (str): Some key stored in the stats that will go on the x-axis 

887 recovered (bool): Show the plot for both all runs and only the recovered ones 

888 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes 

889 op (function): Operation that is applied to thingA before plotting default is recovery rate 

890 args (dict): Parameters for how the plot should look 

891 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

892 name (str): Optional name for the plot 

893 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

894 ax (Matplotlib.axes): Somewhere to plot 

895 fig (Matplotlib.figure): Figure of the ax 

896 

897 Returns 

898 None 

899 ''' 

900 strategies = self.strategies if strategies is None else strategies 

901 args = {} if args is None else args 

902 

903 # make sure we have something to plot in 

904 if ax is None: 

905 fig, ax = plt.subplots(1, 1) 

906 elif fig is None: 

907 store = False 

908 

909 # execute the plots for all strategies 

910 for s in strategies: 

911 self.plot_thingA_per_thingB( 

912 s, 

913 thingA=thingA, 

914 thingB=thingB, 

915 recovered=recovered, 

916 ax=ax, 

917 mask=mask, 

918 op=op, 

919 **(plotting_args if plotting_args else {}), 

920 ) 

921 

922 # set the parameters 

923 [plt.setp(ax, k, v) for k, v in args.items()] 

924 

925 if store: 

926 fig.tight_layout() 

927 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True) 

928 plt.close(fig) 

929 

930 return None 

931 

932 def plot_recovery_thresholds( 

933 self, strategies=None, thresh_range=None, ax=None, mask=None, **kwargs 

934 ): # pragma: no cover 

935 ''' 

936 Plot the recovery rate for a range of thresholds 

937 

938 Args: 

939 strategies (list): List of the strategies you want to plot, if None, all will be plotted 

940 thresh_range (list): thresholds for deciding whether to accept as recovered 

941 ax (Matplotlib.axes): Somewhere to plot 

942 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

943 

944 Returns: 

945 None 

946 ''' 

947 # fill default values if nothing is specified 

948 strategies = self.strategies if strategies is None else strategies 

949 

950 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range 

951 if ax is None: 

952 fig, ax = plt.subplots(1, 1) 

953 

954 rec_rates = [[None] * len(thresh_range)] * len(strategies) 

955 for strategy_idx in range(len(strategies)): 

956 strategy = strategies[strategy_idx] 

957 # load the stats 

958 fault_free = self.load(strategy=strategy, faults=False) 

959 with_faults = self.load(strategy=strategy, faults=True) 

960 

961 for thresh_idx in range(len(thresh_range)): 

962 rec_mask = self.get_mask( 

963 strategy=strategy, 

964 key='error', 

965 val=(thresh_range[thresh_idx] * fault_free['error'].mean()), 

966 op='gt', 

967 old_mask=mask, 

968 ) 

969 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len( 

970 with_faults['error'] 

971 ) 

972 

973 ax.plot( 

974 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs} 

975 ) 

976 ax.legend(frameon=False) 

977 ax.set_ylabel('recovery rate') 

978 ax.set_xlabel('threshold as ratio to fault-free error') 

979 

980 return None 

981 

982 def analyse_adaptivity(self, mask): # pragma: no cover 

983 ''' 

984 Analyse a set of runs with adaptivity 

985 

986 Args: 

987 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

988 

989 Returns: 

990 None 

991 ''' 

992 index = self.get_index(mask) 

993 dat = self.load() 

994 

995 # make a header 

996 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ') 

997 print('-------+-----+------+------+----------+----------+----------+----------') 

998 for i in index: 

999 e_em, e_glob = self.analyse_adaptivity_single(int(i)) 

1000 print( 

1001 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\ 

1002 \ 

1003 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1004 ) 

1005 

1006 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][ 

1007 Adaptivity 

1008 ]['e_tol'] 

1009 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!') 

1010 return None 

1011 

1012 def analyse_adaptivity_single(self, run): # pragma: no cover 

1013 ''' 

1014 Compute what the difference in embedded and global error are for a specific run with adaptivity 

1015 

1016 Args: 

1017 run (int): The run you want to know about 

1018 

1019 Returns: 

1020 list: Embedded error with fault and without for the last iteration in the step with a fault 

1021 list: Global error with and without fault at the end of the run 

1022 ''' 

1023 # perform one run with and one without faults 

1024 stats = [] 

1025 controllers = [] 

1026 for faults in [True, False]: 

1027 s, c, _ = self.single_run( 

1028 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1029 ) 

1030 stats += [s] 

1031 controllers += [c] 

1032 

1033 # figure out when the fault happened 

1034 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1035 

1036 # get embedded error 

1037 e_em = [ 

1038 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1039 for stat in stats 

1040 ] 

1041 

1042 # compute the global error 

1043 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1044 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1045 

1046 return [e_em[i][-1] for i in [0, 1]], e_glob 

1047 

1048 def analyse_HotRod(self, mask): # pragma: no cover 

1049 ''' 

1050 Analyse a set of runs with Hot Rod 

1051 

1052 Args: 

1053 mask (Numpy.ndarray of shape (n)): The mask you want to know about 

1054 

1055 Returns: 

1056 None 

1057 ''' 

1058 index = self.get_index(mask) 

1059 dat = self.load() 

1060 

1061 # make a header 

1062 print( 

1063 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \ 

1064| e_glob ' 

1065 ) 

1066 print( 

1067 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\ 

1068+----------' 

1069 ) 

1070 for i in index: 

1071 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i)) 

1072 print( 

1073 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\ 

1074 \ 

1075 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} \ 

1076 | \ 

1077{e_glob[1]:.2e} | {e_glob[0]:.2e}' 

1078 ) 

1079 

1080 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][ 

1081 'HotRod_tol' 

1082 ] 

1083 print(f'We only restart when diff > tol = {tol:.2e}!') 

1084 return None 

1085 

1086 def analyse_HotRod_single(self, run): # pragma: no cover 

1087 ''' 

1088 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod 

1089 

1090 Args: 

1091 run (int): The run you want to know about 

1092 

1093 Returns: 

1094 list: Embedded error with fault and without for the last iteration in the step with a fault 

1095 list: Extrapolation error with fault and without for the last iteration in the step with a fault 

1096 list: Global error with and without fault at the end of the run 

1097 ''' 

1098 # perform one run with and one without faults 

1099 stats = [] 

1100 controllers = [] 

1101 for faults in [True, False]: 

1102 s, c, _ = self.single_run( 

1103 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter] 

1104 ) 

1105 stats += [s] 

1106 controllers += [c] 

1107 

1108 # figure out when the fault happened 

1109 t_fault = get_sorted(stats[0], type='bitflip')[0][0] 

1110 

1111 # get embedded error 

1112 e_em = [ 

1113 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')] 

1114 for stat in stats 

1115 ] 

1116 # get extrapolated error 

1117 e_ex = [ 

1118 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')] 

1119 for stat in stats 

1120 ] 

1121 

1122 # compute the global error 

1123 u_end = [get_sorted(stat, type='u')[-1] for stat in stats] 

1124 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]] 

1125 

1126 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob 

1127 

1128 def print_faults(self, mask=None, strategy=None): # pragma: no cover 

1129 ''' 

1130 Print all faults that happened within a certain mask 

1131 

1132 Args: 

1133 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1134 strategy (Strategy): The resilience strategy you want to see the error for 

1135 

1136 Returns: 

1137 None 

1138 ''' 

1139 strategy = BaseStrategy() if strategy is None else strategy 

1140 

1141 index = self.get_index(mask) 

1142 dat = self.load(strategy=strategy) 

1143 

1144 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0])) 

1145 

1146 # make a header 

1147 print(' run | bit | node | iter | rank | rel err | space pos') 

1148 print('-------+-----+------+------+------+---------+-----------') 

1149 for i in index: 

1150 print( 

1151 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}' 

1152 ) 

1153 

1154 return None 

1155 

1156 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False): 

1157 ''' 

1158 Make a mask to apply to stored data to filter out certain things 

1159 

1160 Args: 

1161 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all 

1162 strategies so None is fine 

1163 key (str): The key in the stored statistics that you want to filter for some value 

1164 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key 

1165 op (str): Operation that is applied for filtering 

1166 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter 

1167 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs 

1168 

1169 Returns: 

1170 Numpy.ndarray with boolean entries that can be used as a mask 

1171 ''' 

1172 strategy = self.strategies[0] if strategy is None else strategy 

1173 dat = self.load(strategy=strategy, faults=True) 

1174 

1175 if compare_faults: 

1176 if val is not None: 

1177 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!') 

1178 else: 

1179 vals = self.load(strategy=strategy, faults=False)[key] 

1180 val = sum(vals) / len(vals) 

1181 

1182 if None in [key, val] and op not in ['isfinite']: 

1183 mask = dat['bit'] == dat['bit'] 

1184 else: 

1185 if op == 'uneq': 

1186 mask = dat[key] != val 

1187 elif op == 'eq': 

1188 mask = dat[key] == val 

1189 elif op == 'leq': 

1190 mask = dat[key] <= val 

1191 elif op == 'geq': 

1192 mask = dat[key] >= val 

1193 elif op == 'lt': 

1194 mask = dat[key] < val 

1195 elif op == 'gt': 

1196 mask = dat[key] > val 

1197 elif op == 'isfinite': 

1198 mask = np.isfinite(dat[key]) 

1199 else: 

1200 raise NotImplementedError(f'Please implement op={op}!') 

1201 

1202 if old_mask is not None: 

1203 return mask & old_mask 

1204 else: 

1205 return mask 

1206 

1207 def get_fixable_faults_only(self, strategy): 

1208 """ 

1209 Return a mask of only faults that can be fixed with a given strategy. 

1210 

1211 Args: 

1212 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1213 

1214 Returns: 

1215 Numpy.ndarray with boolean entries that can be used as a mask 

1216 """ 

1217 fixable = strategy.get_fixable_params( 

1218 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter'] 

1219 ) 

1220 mask = self.get_mask(strategy=strategy) 

1221 

1222 for kwargs in fixable: 

1223 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask) 

1224 

1225 return mask 

1226 

1227 def get_index(self, mask=None): 

1228 ''' 

1229 Get the indeces of all runs in mask 

1230 

1231 Args: 

1232 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of 

1233 

1234 Returns: 

1235 Numpy.ndarray: Array of indeces 

1236 ''' 

1237 if mask is None: 

1238 dat = self.load() 

1239 return np.arange(len(dat['iteration'])) 

1240 else: 

1241 return np.arange(len(mask))[mask] 

1242 

1243 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover 

1244 ''' 

1245 Get information about how many data points for faults we have given a particular mask 

1246 

1247 Args: 

1248 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1249 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all 

1250 strategies, so you don't need to supply this argument 

1251 print_all (bool): Whether to add information that is normally not useful to the table 

1252 ax (Matplotlib.axes): Somewhere to plot the combinations histogram 

1253 

1254 Returns: 

1255 None 

1256 ''' 

1257 

1258 # load some data from which to infer the number occurrences of some event 

1259 strategy = self.strategies[0] if strategy is None else strategy 

1260 dat = self.load(stratagy=strategy, faults=True) 

1261 

1262 # make a dummy mask in case none is supplied 

1263 if mask is None: 

1264 mask = np.ones_like(dat['error'], dtype=bool) 

1265 

1266 # print a header 

1267 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries') 

1268 print('-------------------------------------------------------------') 

1269 

1270 # make a list of all keys that you want to look at 

1271 keys = ['iteration', 'bit', 'node'] 

1272 if print_all: 

1273 keys += ['problem_pos', 'level', 'target'] 

1274 

1275 # print the table 

1276 for key in keys: 

1277 counts, dev, unique = self.count_occurrences(dat[key][mask]) 

1278 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}') 

1279 

1280 return None 

1281 

1282 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover 

1283 ''' 

1284 Make a histogram ouf of the occurrences of combinations 

1285 

1286 Args: 

1287 dat (dict): The data of the recorded statistics 

1288 keys (list): The keys in dat that you want to know the combinations of 

1289 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1290 

1291 Returns: 

1292 Matplotlib.axes: The plot 

1293 ''' 

1294 if ax is None: 

1295 fig, ax = plt.subplots(1, 1) 

1296 

1297 occurrences, bins = self.get_combination_histogram(dat, keys, mask) 

1298 

1299 ax.bar(bins[:-1], occurrences) 

1300 

1301 ax.set_xlabel('Occurrence of combinations') 

1302 

1303 return ax 

1304 

1305 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover 

1306 ''' 

1307 Check how many combinations of values we expect and how many we find to see if we need to do more experiments. 

1308 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied 

1309 

1310 Args: 

1311 dat (dict): The data of the recorded statistics 

1312 keys (list): The keys in dat that you want to know the combinations of 

1313 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1314 

1315 Returns: 

1316 Numpy.ndarray: Number of occurrences of combinations 

1317 Numpy.ndarray: Bins 

1318 ''' 

1319 

1320 # load default values 

1321 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat 

1322 keys = ['iteration', 'bit', 'node'] if keys is None else keys 

1323 if mask is None: 

1324 mask = np.ones_like(dat['error'], dtype=bool) 

1325 

1326 # get unique values and compute how many combinations you expect 

1327 num_unique = [len(np.unique(dat[key][mask])) for key in keys] 

1328 expected_number_of_combinations = np.prod(num_unique) 

1329 

1330 # test what you actually get 

1331 combination_counts = self.get_combination_counts(dat, keys, mask) 

1332 

1333 # make a histogram with the result 

1334 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1)) 

1335 occurrences[0] = expected_number_of_combinations - len(combination_counts) 

1336 

1337 return occurrences, bins 

1338 

1339 def get_max_combinations(self, dat=None, strategy=None): 

1340 ''' 

1341 Count how many combinations of parameters for faults are possible 

1342 

1343 Args: 

1344 dat (dict): The recorded statistics 

1345 keys (list): The keys in dat that you want to know the combinations of 

1346 strategy (Strategy): The resilience strategy 

1347 

1348 Returns: 

1349 int: Number of possible combinations 

1350 ''' 

1351 strategy = self.strategies[0] if strategy is None else strategy 

1352 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True) 

1353 faultHook = get_fault_injector_hook(controller) 

1354 ranges = [ 

1355 (0, faultHook.rnd_params['level_number']), 

1356 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1), 

1357 (1, faultHook.rnd_params['iteration'] + 1), 

1358 (0, faultHook.rnd_params['bit']), 

1359 (0, faultHook.rnd_params['rank']), 

1360 ] 

1361 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']] 

1362 return np.prod([me[1] - me[0] for me in ranges], dtype=int) 

1363 

1364 def get_combination_counts(self, dat, keys, mask): 

1365 ''' 

1366 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary 

1367 numbers of keys 

1368 

1369 Args: 

1370 dat (dict): The data of the recorded statistics 

1371 keys (list): The keys in dat that you want to know the combinations of 

1372 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting 

1373 

1374 Returns: 

1375 list: Occurrences of all combinations 

1376 ''' 

1377 key = keys[0] 

1378 unique_vals = np.unique(dat[key][mask]) 

1379 res = [] 

1380 

1381 for i in range(len(unique_vals)): 

1382 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask) 

1383 if len(keys) > 1: 

1384 res += self.get_combination_counts(dat, keys[1:], inner_mask) 

1385 else: 

1386 res += [self.count_occurrences(dat[key][inner_mask])[0]] 

1387 return res 

1388 

1389 def count_occurrences(self, vals): 

1390 ''' 

1391 Count the occurrences of unique values in vals and compute average deviation from mean 

1392 

1393 Args: 

1394 vals (list): Values you want to check 

1395 

1396 Returns: 

1397 float: Mean of number of occurrences of unique values in vals 

1398 float: Average deviation from mean number of occurrences 

1399 int: Number of unique entries 

1400 ''' 

1401 unique_vals, counts = np.unique(vals, return_counts=True) 

1402 

1403 if len(counts) > 0: 

1404 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts) 

1405 else: 

1406 return None, None, 0 

1407 

1408 def bar_plot_thing( 

1409 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None 

1410 ): # pragma: no cover 

1411 ''' 

1412 Make a bar plot about something! 

1413 

1414 Args: 

1415 x (Numpy.ndarray of dimension 1): x values for bar plot 

1416 thing (str): Some key stored in the stats that will go on the y-axis 

1417 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting 

1418 store (bool): Store the plot at a predefined path or not (for jupyter notebooks) 

1419 faults (bool): Whether to load stats with faults or without 

1420 name (str): Optional name for the plot 

1421 op (function): Operation that is applied to thing before plotting default is recovery rate 

1422 args (dict): Parameters for how the plot should look 

1423 

1424 Returns: 

1425 None 

1426 ''' 

1427 if ax is None: 

1428 fig, ax = plt.subplots(1, 1) 

1429 store = True 

1430 op = self.mean if op is None else op 

1431 

1432 # get the values for the bars 

1433 height = np.zeros(len(self.strategies)) 

1434 for strategy_idx in range(len(self.strategies)): 

1435 strategy = self.strategies[strategy_idx] 

1436 

1437 # load the values 

1438 dat = self.load(strategy=strategy, faults=faults) 

1439 no_faults = self.load(strategy=strategy, faults=False) 

1440 

1441 # check if we have a mask 

1442 if mask is None: 

1443 mask = np.ones_like(dat[thing], dtype=bool) 

1444 

1445 height[strategy_idx] = op(dat, no_faults, thing, mask) 

1446 

1447 # get some x values 

1448 x = np.arange(len(self.strategies)) if x is None else x 

1449 

1450 # prepare labels 

1451 ticks = [strategy.bar_plot_x_label for strategy in self.strategies] 

1452 

1453 ax.bar(x, height, tick_label=ticks) 

1454 

1455 # set the parameters 

1456 ax.set_ylabel(thing) 

1457 args = {} if args is None else args 

1458 [plt.setp(ax, k, v) for k, v in args.items()] 

1459 

1460 if store: 

1461 fig.tight_layout() 

1462 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True) 

1463 plt.close(fig) 

1464 

1465 return None 

1466 

1467 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover 

1468 func_args = locals() 

1469 func_args.pop('self', None) 

1470 if strategy is None: 

1471 for strat in self.strategies: 

1472 args = {**func_args, 'strategy': strat} 

1473 self.fault_frequency_plot(**args) 

1474 return None 

1475 

1476 # load data 

1477 all_data = {} 

1478 for me in kwargs_range['fault_frequency_iter']: 

1479 self.kwargs['fault_frequency_iter'] = me 

1480 self.get_recovered() 

1481 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular') 

1482 

1483 # get_recovery_rate 

1484 results = {} 

1485 results['frequencies'] = list(all_data.keys()) 

1486 results['recovery_rate'] = [ 

1487 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered']) 

1488 for key in all_data.keys() 

1489 ] 

1490 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()] 

1491 results['iterations'] = [ 

1492 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys() 

1493 ] 

1494 

1495 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style) 

1496 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'}) 

1497 

1498 ax.set_xscale('log') 

1499 ax.set_xlabel('iterations between fault') 

1500 ax.set_ylabel('recovery rate') 

1501 iter_ax.set_ylabel('average total iterations if not crashed (dashed)') 

1502 ax.legend(frameon=False) 

1503 

1504 return None 

1505 

1506 def get_HR_tol(self, verbose=False): 

1507 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod 

1508 

1509 HR_strategy = HotRodStrategy(useMPI=self.use_MPI) 

1510 

1511 description = HR_strategy.get_custom_description(self.prob, self.num_procs) 

1512 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2 

1513 

1514 stats, _, _ = self.single_run(HR_strategy, force_params=description) 

1515 

1516 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate') 

1517 diff = [] 

1518 for i in range(len(e_extrapolation)): 

1519 if e_extrapolation[i][1] is not None: 

1520 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0]) 

1521 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])] 

1522 

1523 max_diff = max(diff) 

1524 proposed_tol = 2 * max_diff 

1525 if verbose: 

1526 print( 

1527 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.' 

1528 ) 

1529 else: 

1530 print(f'{proposed_tol:.6e}') 

1531 

1532 

1533def check_local_error(): # pragma: no cover 

1534 """ 

1535 Make a plot of the resolution over time for all problems 

1536 """ 

1537 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1538 problems = [run_quench] 

1539 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()] 

1540 

1541 for i in range(len(problems)): 

1542 stats_analyser = FaultStats( 

1543 prob=problems[i], 

1544 strategies=strategies, 

1545 faults=[False], 

1546 reload=True, 

1547 recovery_thresh=1.1, 

1548 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1549 num_procs=1, 

1550 mode='random', 

1551 ) 

1552 stats_analyser.compare_strategies() 

1553 plt.show() 

1554 

1555 

1556def parse_args(): 

1557 import sys 

1558 

1559 kwargs = {} 

1560 

1561 for i in range(len(sys.argv)): 

1562 if 'prob' in sys.argv[i]: 

1563 if sys.argv[i + 1] == 'run_Lorenz': 

1564 kwargs['prob'] = run_Lorenz 

1565 elif sys.argv[i + 1] == 'run_vdp': 

1566 kwargs['prob'] = run_vdp 

1567 elif sys.argv[i + 1] == 'run_Schroedinger': 

1568 kwargs['prob'] = run_Schroedinger 

1569 elif sys.argv[i + 1] == 'run_quench': 

1570 kwargs['prob'] = run_quench 

1571 else: 

1572 raise NotImplementedError 

1573 elif 'num_procs' in sys.argv[i]: 

1574 kwargs['num_procs'] = int(sys.argv[i + 1]) 

1575 elif 'runs' in sys.argv[i]: 

1576 kwargs['runs'] = int(sys.argv[i + 1]) 

1577 elif 'mode' in sys.argv[i]: 

1578 kwargs['mode'] = str(sys.argv[i + 1]) 

1579 elif 'reload' in sys.argv[i]: 

1580 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True 

1581 

1582 return kwargs 

1583 

1584 

1585def compare_adaptivity_modes(): 

1586 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep 

1587 

1588 fig, axs = plt.subplots(2, 2, sharey=True) 

1589 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench] 

1590 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench'] 

1591 

1592 for i in range(len(problems)): 

1593 ax = axs.flatten()[i] 

1594 

1595 ax.set_title(titles[i]) 

1596 

1597 stats_analyser = FaultStats( 

1598 prob=problems[i], 

1599 strategies=[AdaptivityStrategy()], 

1600 faults=[False, True], 

1601 reload=True, 

1602 recovery_thresh=1.1, 

1603 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1604 num_procs=1, 

1605 mode='default', 

1606 stats_path='data/stats-jusuf', 

1607 ) 

1608 stats_analyser.get_recovered() 

1609 

1610 for strategy in stats_analyser.strategies: 

1611 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1612 stats_analyser.plot_things_per_things( 

1613 'recovered', 

1614 'bit', 

1615 False, 

1616 strategies=[strategy], 

1617 op=stats_analyser.rec_rate, 

1618 mask=fixable, 

1619 args={'ylabel': 'recovery rate', 'label': 'bla'}, 

1620 name='fixable_recovery', 

1621 ax=ax, 

1622 ) 

1623 single_proc = ax.lines[-1] 

1624 single_proc.set_label('single process') 

1625 single_proc.set_color('black') 

1626 

1627 stats_analyser = FaultStats( 

1628 prob=problems[i], 

1629 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()], 

1630 faults=[False, True], 

1631 reload=True, 

1632 recovery_thresh=1.5, 

1633 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0), 

1634 num_procs=4, 

1635 mode='default', 

1636 stats_path='data/stats-jusuf', 

1637 ) 

1638 stats_analyser.get_recovered() 

1639 

1640 for strategy in stats_analyser.strategies: 

1641 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1642 stats_analyser.plot_things_per_things( 

1643 'recovered', 

1644 'bit', 

1645 False, 

1646 strategies=[strategy], 

1647 op=stats_analyser.rec_rate, 

1648 mask=fixable, 

1649 args={'ylabel': 'recovery rate'}, 

1650 name='fixable_recovery', 

1651 ax=ax, 

1652 ) 

1653 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks') 

1654 fig.tight_layout() 

1655 plt.show() 

1656 

1657 

1658def main(): 

1659 kwargs = { 

1660 'prob': run_AC, 

1661 'num_procs': 1, 

1662 'mode': 'default', 

1663 'runs': 2000, 

1664 'reload': True, 

1665 **parse_args(), 

1666 } 

1667 

1668 strategy_args = { 

1669 'stop_at_nan': True, 

1670 } 

1671 

1672 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy 

1673 

1674 stats_analyser = FaultStats( 

1675 strategies=[ 

1676 BaseStrategy(**strategy_args), 

1677 AdaptivityStrategy(**strategy_args), 

1678 kAdaptivityStrategy(**strategy_args), 

1679 HotRodStrategy(**strategy_args), 

1680 AdaptivityPolynomialError(**strategy_args), 

1681 ], 

1682 faults=[False, True], 

1683 recovery_thresh=1.15, 

1684 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0), 

1685 stats_path='data/stats-jusuf', 

1686 **kwargs, 

1687 ) 

1688 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=12) 

1689 

1690 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data 

1691 return None 

1692 

1693 stats_analyser.get_recovered() 

1694 mask = None 

1695 

1696 # stats_analyser.compare_strategies() 

1697 stats_analyser.plot_things_per_things( 

1698 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1699 ) 

1700 stats_analyser.plot_things_per_things( 

1701 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1702 ) 

1703 

1704 stats_analyser.plot_things_per_things( 

1705 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'} 

1706 ) 

1707 

1708 # make a plot for only the faults that can be recovered 

1709 fig, ax = plt.subplots(1, 1) 

1710 for strategy in stats_analyser.strategies: 

1711 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy) 

1712 stats_analyser.plot_things_per_things( 

1713 'recovered', 

1714 'bit', 

1715 False, 

1716 strategies=[strategy], 

1717 op=stats_analyser.rec_rate, 

1718 mask=fixable, 

1719 args={'ylabel': 'recovery rate'}, 

1720 name='fixable_recovery', 

1721 ax=ax, 

1722 ) 

1723 fig.tight_layout() 

1724 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True) 

1725 

1726 fig, ax = plt.subplots(1, 1, figsize=(13, 4)) 

1727 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000)) 

1728 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold') 

1729 ax.set_xscale('log') 

1730 ax.legend(frameon=False) 

1731 fig.tight_layout() 

1732 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True) 

1733 

1734 stats_analyser.plot_things_per_things( 

1735 'total_iteration', 

1736 'bit', 

1737 True, 

1738 op=stats_analyser.mean, 

1739 mask=mask, 

1740 args={'yscale': 'log', 'ylabel': 'total iterations'}, 

1741 ) 

1742 stats_analyser.plot_things_per_things( 

1743 'total_iteration', 

1744 'bit', 

1745 True, 

1746 op=stats_analyser.extra_mean, 

1747 mask=mask, 

1748 args={'yscale': 'linear', 'ylabel': 'extra iterations'}, 

1749 name='extra_iter', 

1750 ) 

1751 stats_analyser.plot_things_per_things( 

1752 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'} 

1753 ) 

1754 

1755 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100)) 

1756 plt.show() 

1757 

1758 

1759if __name__ == "__main__": 

1760 main() 

1761 # compare_adaptivity_modes() 

1762 # check_local_error()