Coverage for pySDC / projects / Resilience / fault_stats.py: 56%
451 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-27 07:06 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-27 07:06 +0000
1import numpy as np
2import matplotlib.pyplot as plt
3from mpi4py import MPI
4import pickle
6import pySDC.helpers.plot_helper as plot_helper
7from pySDC.helpers.stats_helper import get_sorted
9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData
10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook
11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep
14from pySDC.implementations.hooks.log_work import LogWork
16# these problems are available for testing
17from pySDC.projects.Resilience.advection import run_advection
18from pySDC.projects.Resilience.vdp import run_vdp
19from pySDC.projects.Resilience.piline import run_piline
20from pySDC.projects.Resilience.Lorenz import run_Lorenz
21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger
22from pySDC.projects.Resilience.quench import run_quench
23from pySDC.projects.Resilience.AC import run_AC
24from pySDC.projects.Resilience.RBC import run_RBC
25from pySDC.projects.Resilience.GS import run_GS
27from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy
28import logging
# apply the project's default matplotlib configuration
plot_helper.setup_mpl(reset=True)

# logger level handed to the controllers of the runs; 40 corresponds to logging.ERROR,
# which keeps the many runs quiet
LOGGER_LEVEL = 40

# problem-specific absolute recovery thresholds; all entries are currently disabled
RECOVERY_THRESH_ABS = {
    # run_quench: 5e-3,
    # run_Schroedinger: 1.7e-6,
}
40class FaultStats:
41 '''
42 Class to generate and analyse fault statistics
43 '''
45 def __init__(
46 self,
47 prob=None,
48 strategies=None,
49 faults=None,
50 reload=True,
51 recovery_thresh=1 + 1e-3,
52 recovery_thresh_abs=0.0,
53 num_procs=1,
54 mode='default',
55 stats_path='data/stats',
56 use_MPI=False,
57 **kwargs,
58 ):
59 '''
60 Initialization routine
62 Args:
63 prob: A function that runs a pySDC problem, see imports for available problems
64 strategies (list): List of resilience strategies
65 faults (list): List of booleans that describe whether to use faults or not
66 reload (bool): Load previously computed statistics and continue from there or start from scratch
67 recovery_thresh (float): Relative threshold for recovery
68 num_procs (int): Number of processes
69 mode (str): Mode for fault generation: Either 'random' or 'combination'
70 '''
71 self.prob = prob
72 self.strategies = [None] if strategies is None else strategies
73 self.faults = [False, True] if faults is None else faults
74 self.reload = reload
75 self.recovery_thresh = recovery_thresh
76 self.recovery_thresh_abs = recovery_thresh_abs
77 self.num_procs = num_procs
78 self.use_MPI = use_MPI
79 self.stats_path = stats_path
80 self.kwargs = {
81 'fault_frequency_iter': 500,
82 **kwargs,
83 }
85 # decide mode
86 if mode == 'default':
87 if prob.__name__ in ['run_vdp', 'run_Lorenz']:
88 self.mode = 'combination'
89 else:
90 self.mode = 'random'
91 else:
92 self.mode = mode
94 self.logger = logging.getLogger('FaultStats')
95 self.logger.level = LOGGER_LEVEL
97 msg = 'Starting FaultStats with attributes'
98 for key, val in self.__dict__.items():
99 if key in ['logger']:
100 continue
101 item = [str(me) for me in val] if type(val) == list else str(val)
102 msg += '\n\t' f'{key}: {item}'
103 self.logger.log(25, msg)
105 def get_Tend(self):
106 '''
107 Get the final time of runs for fault stats based on the problem
109 Returns:
110 float: Tend to put into the run
111 '''
112 return self.strategies[0].get_Tend(self.prob, self.num_procs, resilience_experiment=True)
    def run_stats_generation(
        self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
    ):
        '''
        Run the generation of stats for all strategies in the `self.strategies` variable

        Args:
            runs (int): Number of runs you want to do
            step (int): Number of runs you want to do between saving
            comm (MPI.Communicator): Communicator for distributing runs
            faults (bool): Whether to do stats with faults or without
            kwargs_range (dict): Range for the parameters
            _reload, _runs_partial: Variables only used for recursion. Do not change!

        Returns:
            None
        '''
        # first recursion level: fan out over the with-/without-faults variants
        if faults is None:
            for f in self.faults:
                self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
            return None

        # recurse over every non-scalar parameter range until only scalars remain
        # (iterating `{}` when no range is given simply yields nothing)
        for key, val in kwargs_range.items() if kwargs_range is not None else {}:
            if type(val) == int:
                self.kwargs[key] = val
            else:
                for me in val:
                    kwargs_range_me = {**kwargs_range, key: me}
                    self.run_stats_generation(
                        runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
                    )
                return None

        comm = MPI.COMM_WORLD if comm is None else comm
        step = (runs if comm.size == 1 else comm.size) if step is None else step
        _runs_partial = step if _runs_partial == 0 else _runs_partial
        reload = self.reload or _reload

        # see if we limit the number of runs we want to do
        max_runs = (
            (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
        )

        if reload:
            # sort the strategies to do some load balancing
            sorting_index = None
            if comm.rank == 0:
                # number of already-finished runs per strategy, read from disk
                already_completed = np.array(
                    [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
                )
                sorting_index_ = np.argsort(already_completed)
                # keep only strategies that still need more runs
                sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
                _runs_partial = max(min(already_completed), _runs_partial)

            # tell all ranks what strategies to use
            sorting_index = comm.bcast(sorting_index, root=0)
            _runs_partial = comm.bcast(_runs_partial, root=0)
            strategies = [self.strategies[i] for i in sorting_index]
            if len(strategies) == 0:  # check if we are already done
                return None
        else:
            strategies = self.strategies

        # sub-communicator: ranks working on the same strategy share one communicator
        strategy_comm = comm.Split(comm.rank % len(strategies))

        for j in range(0, len(strategies), comm.size):
            self.generate_stats(
                strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
                runs=min(_runs_partial, max_runs),
                faults=faults,
                reload=reload,
                comm=strategy_comm,
            )
        strategy_comm.Free()
        # recurse with a larger partial-run target until `runs` is reached
        self.run_stats_generation(
            runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
        )

        return None
    def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
        '''
        Generate statistics for recovery from bit flips
        -----------------------------------------------

        Every run is given a different random seed such that we have different faults and the results are then stored

        Args:
            strategy (Strategy): Resilience strategy
            runs (int): Number of runs you want to do
            reload (bool): Load previously computed statistics and continue from there or start from scratch
            faults (bool): Whether to do stats with faults or without
            comm (MPI.Communicator): Communicator for distributing runs

        Returns:
            None
        '''
        comm = MPI.COMM_WORLD if comm is None else comm

        # initialize dictionary to store the stats in
        dat = {
            'level': np.zeros(runs),
            'iteration': np.zeros(runs),
            'node': np.zeros(runs),
            'problem_pos': [],
            'bit': np.zeros(runs),
            'error': np.zeros(runs),
            'total_iteration': np.zeros(runs),
            'total_newton_iteration': np.zeros(runs),
            'restarts': np.zeros(runs),
            'target': np.zeros(runs),
            'rank': np.zeros(runs),
        }

        # store arguments for storing and loading
        identifier_args = {
            'faults': faults,
            'strategy': strategy,
        }

        # reload previously recorded stats and write them to dat
        if reload:
            already_completed_ = None
            if comm.rank == 0:
                already_completed_ = self.load(**identifier_args)
            already_completed = comm.bcast(already_completed_, root=0)
            # only rank 0 seeds `dat` with the old values; the final comm.reduce sums contributions
            if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
                avail_len = min([already_completed['runs'], runs])
                for k in dat.keys():
                    dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
        else:
            already_completed = {'runs': 0}

        # prepare a message
        involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
        msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
        if comm.rank == 0 and already_completed['runs'] < runs:
            print(msg, flush=True)

        space_comm = MPI.COMM_SELF.Split(True)

        # perform the remaining experiments
        for i in range(already_completed['runs'], runs):
            # round-robin distribution of the runs over the ranks of `comm`
            if i % comm.size != comm.rank:
                continue

            # perform a single experiment with the correct random seed
            stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)

            # get the data from the stats
            faults_run = get_sorted(stats, type='bitflip')

            if faults:
                assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
                # record where/when the (first) fault was injected
                dat['level'][i] = faults_run[0][1][0]
                dat['iteration'][i] = faults_run[0][1][1]
                dat['node'][i] = faults_run[0][1][2]
                dat['problem_pos'] += [faults_run[0][1][3]]
                dat['bit'][i] = faults_run[0][1][4]
                dat['target'][i] = faults_run[0][1][5]
                dat['rank'][i] = faults_run[0][1][6]

            if crash:
                print('Code crashed!')
                # crashed runs are marked with an infinite error
                dat['error'][i] = np.inf
                continue

            # record the rest of the data
            t, u = get_sorted(stats, type='u', recomputed=False)[-1]
            error = self.get_error(u, t, controller, strategy, self.get_Tend())
            total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
            total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])

            dat['error'][i] = error
            dat['total_iteration'][i] = total_iteration
            dat['total_newton_iteration'][i] = total_newton_iteration
            dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])

        # free the space communicator
        space_comm.Free()

        # combine the contributions of all ranks; positions a rank did not fill are zero
        dat_full = {}
        for k in dat.keys():
            dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)

        # store the completed stats
        dat_full['runs'] = runs

        if already_completed['runs'] < runs:
            if comm.rank == 0:
                self.store(dat_full, **identifier_args)
                # NOTE(review): `self.faults` is a list (e.g. [False, True]) and is truthy
                # whenever non-empty -- this looks like it was meant to be `if faults:`;
                # confirm before changing.
                if self.faults:
                    self.get_recovered(strategy=strategy)

        return None
310 def get_error(self, u, t, controller, strategy, Tend):
311 """
312 Compute the error.
314 Args:
315 u (dtype_u): The solution at the end of the run
316 t (float): Time at which `u` was recorded
317 controller (pySDC.controller.controller): The controller
318 strategy (Strategy): The resilience strategy
319 Tend (float): Time you want to run to
321 Returns:
322 float: Error
323 """
324 lvl = controller.MS[0].levels[0]
326 # decide if we reached Tend
327 Tend_reached = t + lvl.params.dt_initial >= Tend
329 if Tend_reached:
330 u_star = lvl.prob.u_exact(t=t)
331 return abs(u - u_star) / abs(u_star)
332 else:
333 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}')
334 return np.inf
336 def single_run(
337 self,
338 strategy,
339 run=0,
340 faults=False,
341 force_params=None,
342 hook_class=None,
343 space_comm=None,
344 Tend=None,
345 time_comm=None,
346 ):
347 '''
348 Run the problem once with the specified parameters
350 Args:
351 strategy (Strategy): The resilience strategy you plan on using
352 run (int): Index for fault generation
353 faults (bool): Whether or not to put faults in
354 force_params (dict): Change parameters in the description of the problem
355 space_comm (MPI.Communicator): A communicator for space parallelisation
356 Tend (float): Time you want to run to
357 time_comm (MPI.Intracomm.Communicator): Communicator in time
359 Returns:
360 dict: Stats object containing statistics for each step, each level and each iteration
361 pySDC.Controller: The controller of the run
362 float: The time the problem should have run to
363 '''
364 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class)
365 force_params = {} if force_params is None else force_params
367 # build the custom description
368 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs)
369 for k in force_params.keys():
370 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]}
372 custom_controller_params = {
373 'logger_level': LOGGER_LEVEL,
374 **force_params.get('controller_params', {}),
375 }
377 if faults:
378 fault_stuff = {
379 'rng': None,
380 'args': strategy.get_fault_args(self.prob, self.num_procs),
381 'rnd_params': strategy.get_random_params(self.prob, self.num_procs),
382 }
384 # make parameters for faults:
385 if self.mode == 'random':
386 fault_stuff['rng'] = np.random.RandomState(run)
387 elif self.mode == 'combination':
388 fault_stuff['rng'] = run
389 elif self.mode == 'regular':
390 fault_stuff['rng'] = np.random.RandomState(run)
391 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter']
392 fault_stuff['rnd_params'] = {
393 'bit': 12,
394 'min_node': 1,
395 }
396 else:
397 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}')
399 else:
400 fault_stuff = None
402 return self.prob(
403 custom_description=custom_description,
404 num_procs=self.num_procs,
405 hook_class=hook_class,
406 fault_stuff=fault_stuff,
407 Tend=self.get_Tend() if Tend is None else Tend,
408 custom_controller_params=custom_controller_params,
409 space_comm=space_comm,
410 comm=time_comm,
411 use_MPI=time_comm is not None,
412 )
    def compare_strategies(self, run=0, faults=False, ax=None):  # pragma: no cover
        '''
        Take a closer look at how the strategies compare for a specific run

        Args:
            run (int): The number of the run to get the appropriate random generator
            faults (bool): Whether or not to include faults
            ax (Matplotlib.axes): Somewhere to plot

        Returns:
            None
        '''
        # only save the figure if we created it ourselves
        if ax is None:
            fig, ax = plt.subplots(1, 1)
            store = True
        else:
            store = False

        # twin axis for the cumulative iteration counts
        k_ax = ax.twinx()
        # Hot Rod gets a distinct line style to tell it apart
        ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies]
        [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))]

        # make a legend
        [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies]
        k_ax.legend(frameon=True)

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True)
    def scrutinize_visual(
        self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
    ):  # pragma: no cover
        '''
        Take a closer look at a specific run with a plot

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            run (int): The number of the run to get the appropriate random generator
            faults (bool): Whether or not to include faults
            ax (Matplotlib.axes): Somewhere to plot the error
            k_ax (Matplotlib.axes): Somewhere to plot the iterations
            ls (str): Line style for the error plot
            plot_restarts (bool): Make vertical lines wherever restarts happened

        Returns:
            dict: Stats from the run

        NOTE(review): the docstring promises the stats dict, but nothing is returned here;
        confirm whether a `return stats` is missing or the docstring is stale.
        '''
        # only save the figure if we created it ourselves
        if ax is None:
            fig, ax = plt.subplots(1, 1)
            store = True
        else:
            store = False

        force_params = {}

        stats, controller, Tend = self.single_run(
            strategy=strategy,
            run=run,
            faults=faults,
            force_params=force_params,
            hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
        )

        # plot the local error
        e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
        ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)

        # plot the iterations
        k_ax = ax.twinx() if k_ax is None else k_ax
        k = get_sorted(stats, type='k')
        k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')

        # plot the faults
        faults = get_sorted(stats, type='bitflip')
        for fault_time in [me[0] for me in faults]:
            ax.axvline(fault_time, color='grey', ls=':')

        # plot restarts
        if plot_restarts:
            restarts = get_sorted(stats, type='restart')
            [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]

        # decorate
        ax.set_yscale('log')
        ax.set_ylabel(r'$\epsilon$')
        k_ax.set_ylabel('cumulative iterations (dashed)')
        ax.set_xlabel(r'$t$')

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)
    def scrutinize(self, strategy, logger_level=15, **kwargs):
        '''
        Take a closer look at a specific run

        Args:
            strategy (Strategy): The resilience strategy you plan on using
            logger_level (int): Logger level for the controller of the run

        Returns:
            dict: The stats object of the run
        '''
        # local import to avoid a circular dependency at module load time
        from pySDC.projects.Resilience.fault_injection import FaultInjector

        force_params = {}
        force_params['controller_params'] = {'logger_level': logger_level}
        # always compute the residual so the reported iteration data is meaningful
        force_params['sweeper_params'] = {'skip_residual_computation': ()}

        stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)

        # pick the solution that was recorded at the latest time
        u_all = get_sorted(stats, type='u', recomputed=False)
        t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
        k = [me[1] for me in get_sorted(stats, type='k')]

        fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]

        # faults that were scheduled but never injected (e.g. because the run ended first)
        unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []

        print(f'\nOverview for {strategy.name} strategy')
        # print faults
        faults = get_sorted(stats, type='bitflip')
        print('\nfaults:')
        print(' t | level | iter | node | bit | trgt | rank | pos')
        print('--------+-------+------+------+-----+------+---------------------')
        for f in faults:
            print(
                f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
                f[1][3],
            )
        print('--------+-------+------+------+-----+------+---------------------')
        # un-injected faults are marked with a leading '!'
        for f in unhappened_faults:
            print(
                f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
                f.problem_pos,
            )

        # see if we can determine if the faults where recovered
        no_faults = self.load(strategy=strategy, faults=False)
        e_star = np.mean(no_faults.get('error', [0]))
        error = self.get_error(u, t, controller, strategy, Tend)
        recovery_thresh = self.get_thresh(strategy)

        print(
            f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
        )
        print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')

        _newton_iter = get_sorted(stats, type='work_newton')
        if len(_newton_iter) > 0:
            newton_iter = [me[1] for me in _newton_iter]
            print(
                f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
            )

        # checkout the step size
        dt = [me[1] for me in get_sorted(stats, type='dt')]
        print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')

        # restarts
        restarts = [me[1] for me in get_sorted(stats, type='restart')]
        print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')

        return stats
578 def convert_faults(self, faults):
579 '''
580 Make arrays of useable data from an entry in the stats object returned by pySDC
582 Args:
583 faults (list): The entry for faults returned by the pySDC run
585 Returns:
586 list: The times when the faults happened
587 list: The levels in which the faults happened
588 list: The iterations in which the faults happened
589 list: The nodes in which the faults happened
590 list: The problem positions in which the faults happened
591 list: The bits in which the faults happened
592 '''
593 time = [faults[i][0] for i in range(len(faults))]
594 level = [faults[i][1][0] for i in range(len(faults))]
595 iteration = [faults[i][1][1] for i in range(len(faults))]
596 node = [faults[i][1][2] for i in range(len(faults))]
597 problem_pos = [faults[i][1][3] for i in range(len(faults))]
598 bit = [faults[i][1][4] for i in range(len(faults))]
599 return time, level, iteration, node, problem_pos, bit
601 def get_path(self, **kwargs):
602 '''
603 Get the path to where the stats are stored
605 Args:
606 strategy (Strategy): The resilience strategy
607 faults (bool): Whether or not faults have been activated
609 Returns:
610 str: The path to what you are looking for
611 '''
612 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle'
614 def get_name(self, strategy=None, faults=True, mode=None):
615 '''
616 Function to get a unique name for a kind of statistics based on the problem and strategy that was used
618 Args:
619 strategy (Strategy): Resilience strategy
620 faults (bool): Whether or not faults where inserted
622 Returns:
623 str: The unique identifier
624 '''
625 if self.prob == run_advection:
626 prob_name = 'advection'
627 elif self.prob == run_vdp:
628 prob_name = 'vdp'
629 elif self.prob == run_piline:
630 prob_name = 'piline'
631 elif self.prob == run_Lorenz:
632 prob_name = 'Lorenz'
633 elif self.prob == run_Schroedinger:
634 prob_name = 'Schroedinger'
635 elif self.prob == run_quench:
636 prob_name = 'Quench'
637 elif self.prob.__name__ == 'run_AC':
638 prob_name = 'Allen-Cahn'
639 elif self.prob.__name__ == 'run_RBC':
640 prob_name = 'Rayleigh-Benard'
641 elif self.prob.__name__ == 'run_GS':
642 prob_name = 'Gray-Scott'
643 else:
644 raise NotImplementedError(f'Name not implemented for problem {self.prob}')
646 if faults:
647 fault_name = '-faults'
648 else:
649 fault_name = ''
651 if strategy is not None:
652 strategy_name = f'-{strategy.name}'
653 else:
654 strategy_name = ''
656 mode = self.mode if mode is None else mode
657 if mode == 'regular':
658 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}'
659 else:
660 mode_thing = ''
662 mpi_thing = '-MPI' if self.use_MPI else ''
663 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}'
665 def store(self, dat, **kwargs):
666 '''
667 Stores the data for a run at a predefined path
669 Args:
670 dat (dict): The data of the recorded statistics
672 Returns:
673 None
674 '''
675 with open(self.get_path(**kwargs), 'wb') as f:
676 pickle.dump(dat, f)
677 return None
679 def load(self, **kwargs):
680 '''
681 Loads the stats belonging to a specific strategy and whether or not faults where inserted.
682 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs,
683 which is 0 of course.
685 Returns:
686 dict: Data from previous run or if it is not available a placeholder dictionary
687 '''
688 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)])
690 try:
691 with open(self.get_path(**kwargs), 'rb') as f:
692 dat = pickle.load(f)
693 except FileNotFoundError:
694 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!')
695 return {'runs': 0}
696 return dat
698 def get_thresh(self, strategy=None):
699 """
700 Get recovery threshold based on relative and absolute tolerances
702 Args:
703 strategy (Strategy): The resilience strategy
704 """
705 fault_free = self.load(strategy=strategy, faults=False)
706 assert (
707 fault_free['error'].std() / fault_free['error'].mean() < 1e-5
708 ), f'Got too large variation of errors in {strategy} strategy!'
709 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()])
711 def get_recovered(self, **kwargs):
712 '''
713 Determine the recovery rate for a specific strategy and store it to disk.
715 Returns:
716 None
717 '''
718 if 'strategy' not in kwargs.keys():
719 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies]
720 else:
721 try:
722 with_faults = self.load(faults=True, **kwargs)
723 with_faults['recovered'] = with_faults['error'] <= self.get_thresh(kwargs['strategy'])
724 self.store(faults=True, dat=with_faults, **kwargs)
725 except KeyError as error:
726 print(
727 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}'
728 )
730 return None
732 def crash_rate(self, dat, no_faults, thingA, mask):
733 '''
734 Determine the rate of runs that crashed
736 Args:
737 dat (dict): The data of the recorded statistics with faults
738 no_faults (dict): The data of the corresponding fault-free stats
739 thingA (str): Some key stored in the stats that will go on the y-axis
740 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate
742 Returns:
743 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask
744 '''
745 if len(dat[thingA][mask]) > 0:
746 crash = dat['error'] == np.inf
747 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask])
748 else:
749 return None
751 def rec_rate(self, dat, no_faults, thingA, mask):
752 '''
753 Operation for plotting which returns the recovery rate for a given mask.
754 Which thingA you apply this to actually does not matter here since we compute a rate.
756 Args:
757 dat (dict): The recorded statistics
758 no_faults (dict): The corresponding fault-free stats
759 thingA (str): Some key stored in the stats
760 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
762 Returns:
763 float: Recovery rate
764 '''
765 if len(dat[thingA][mask]) > 0:
766 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask])
767 else:
768 return None
770 def mean(self, dat, no_faults, thingA, mask):
771 '''
772 Operation for plotting which returns the mean of thingA after applying the mask
774 Args:
775 dat (dict): The recorded statistics
776 no_faults (dict): The corresponding fault-free stats
777 thingA (str): Some key stored in the stats
778 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
780 Returns:
781 float: Mean of thingA after applying mask
782 '''
783 return np.mean(dat[thingA][mask])
785 def extra_mean(self, dat, no_faults, thingA, mask):
786 '''
787 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after
788 applying the mask
790 Args:
791 dat (dict): The recorded statistics
792 no_faults (dict): The corresponding fault-free stats
793 thingA (str): Some key stored in the stats
794 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
796 Returns:
797 float: Difference in mean of thingA between runs with and without faults after applying mask
798 '''
799 if True in mask or int in [type(me) for me in mask]:
800 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA])
801 else:
802 return None
    def plot_thingA_per_thingB(
        self,
        strategy,
        thingA,
        thingB,
        ax=None,
        mask=None,
        recovered=False,
        op=None,
        **kwargs,
    ):  # pragma: no cover
        '''
        Plot thingA vs. thingB for a single strategy

        Args:
            strategy (Strategy): The resilience strategy you want to plot
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            ax (Matplotlib.axes): Somewhere to plot
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            recovered (bool): Show the plot for both all runs and only the recovered ones
            op (function): Operation that is applied to thingA before plotting default is recovery rate

        Returns:
            None
        '''
        op = self.rec_rate if op is None else op
        dat = self.load(strategy=strategy, faults=True)
        no_faults = self.load(strategy=strategy, faults=False)

        if mask is None:
            mask = np.ones_like(dat[thingB], dtype=bool)

        # aggregate thingA via `op` for every distinct (masked) value of thingB
        admissable_thingB = np.unique(dat[thingB][mask])
        me = np.zeros(len(admissable_thingB))
        me_recovered = np.zeros_like(me)

        for i in range(len(me)):
            _mask = np.logical_and((dat[thingB] == admissable_thingB[i]), mask)
            if _mask.any():
                me[i] = op(dat, no_faults, thingA, _mask)
                me_recovered[i] = op(dat, no_faults, thingA, np.logical_and(_mask, dat['recovered']))

        if recovered:
            ax.plot(
                admissable_thingB,
                me_recovered,
                label=f'{strategy.label} (only recovered)',
                color=strategy.color,
                marker=strategy.marker,
                ls='--',
                linewidth=3,
                **kwargs,
            )

        ax.plot(
            admissable_thingB,
            me,
            label=f'{strategy.label}',
            color=strategy.color,
            marker=strategy.marker,
            linewidth=2,
            **kwargs,
        )

        ax.legend(frameon=False)
        ax.set_xlabel(thingB)
        ax.set_ylabel(thingA)
        return None
    def plot_things_per_things(
        self,
        thingA='bit',
        thingB='bit',
        recovered=False,
        mask=None,
        op=None,
        args=None,
        strategies=None,
        name=None,
        store=False,
        ax=None,
        fig=None,
        plotting_args=None,
    ):  # pragma: no cover
        '''
        Plot thingA vs thingB for multiple strategies

        Args:
            thingA (str): Some key stored in the stats that will go on the y-axis
            thingB (str): Some key stored in the stats that will go on the x-axis
            recovered (bool): Show the plot for both all runs and only the recovered ones
            mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
            op (function): Operation that is applied to thingA before plotting default is recovery rate
            args (dict): Parameters for how the plot should look
            strategies (list): List of the strategies you want to plot, if None, all will be plotted
            name (str): Optional name for the plot
            store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
            ax (Matplotlib.axes): Somewhere to plot
            fig (Matplotlib.figure): Figure of the ax
            plotting_args (dict): Keyword arguments forwarded to the plotting calls

        Returns
            None
        '''
        strategies = self.strategies if strategies is None else strategies
        args = {} if args is None else args

        # make sure we have something to plot in
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        elif fig is None:
            # without the figure handle we cannot store the plot
            store = False

        # execute the plots for all strategies
        for s in strategies:
            self.plot_thingA_per_thingB(
                s,
                thingA=thingA,
                thingB=thingB,
                recovered=recovered,
                ax=ax,
                mask=mask,
                op=op,
                **(plotting_args if plotting_args else {}),
            )

        # set the parameters
        [plt.setp(ax, k, v) for k, v in args.items()]

        if store:
            fig.tight_layout()
            plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
            plt.close(fig)

        return None
940 def plot_recovery_thresholds(
941 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs
942 ): # pragma: no cover
943 '''
944 Plot the recovery rate for a range of thresholds
946 Args:
947 strategies (list): List of the strategies you want to plot, if None, all will be plotted
948 thresh_range (list): thresholds for deciding whether to accept as recovered
949 ax (Matplotlib.axes): Somewhere to plot
951 Returns:
952 None
953 '''
954 # fill default values if nothing is specified
955 strategies = self.strategies if strategies is None else strategies
957 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range
958 if ax is None:
959 fig, ax = plt.subplots(1, 1)
961 rec_rates = [[None] * len(thresh_range)] * len(strategies)
962 for strategy_idx in range(len(strategies)):
963 strategy = strategies[strategy_idx]
964 # load the stats
965 fault_free = self.load(strategy=strategy, faults=False)
966 with_faults = self.load(strategy=strategy, faults=True)
968 if recoverable_only:
969 recoverable_mask = self.get_fixable_faults_only(strategy)
970 else:
971 recoverable_mask = self.get_mask(strategy=strategy)
973 for thresh_idx in range(len(thresh_range)):
974 rec_mask = self.get_mask(
975 strategy=strategy,
976 key='error',
977 val=(thresh_range[thresh_idx] * fault_free['error'].mean()),
978 op='gt',
979 old_mask=recoverable_mask,
980 )
981 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len(
982 with_faults['error'][recoverable_mask]
983 )
985 ax.plot(
986 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs}
987 )
988 ax.legend(frameon=False)
989 ax.set_ylabel('recovery rate')
990 ax.set_xlabel('threshold as ratio to fault-free error')
992 return None
    def analyse_adaptivity(self, mask):  # pragma: no cover
        '''
        Analyse a set of runs with adaptivity

        Prints a table comparing embedded and global errors for every run selected by
        the mask. Columns marked with ``^*`` come from index 1 of the per-run lists,
        which `analyse_adaptivity_single` fills from the fault-free repetition.

        Args:
            mask (Numpy.ndarray of shape (n)): The mask you want to know about

        Returns:
            None
        '''
        index = self.get_index(mask)
        dat = self.load()

        # make a header
        print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ')
        print('-------+-----+------+------+----------+----------+----------+----------')
        for i in index:
            # rerun each selected experiment with and without the fault
            e_em, e_glob = self.analyse_adaptivity_single(int(i))
            print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\
 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}')

        # remind the reader of the restart criterion adaptivity uses
        e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][
            Adaptivity
        ]['e_tol']
        print(f'We only restart when e_em > e_tol = {e_tol:.2e}!')
        return None
1021 def analyse_adaptivity_single(self, run): # pragma: no cover
1022 '''
1023 Compute what the difference in embedded and global error are for a specific run with adaptivity
1025 Args:
1026 run (int): The run you want to know about
1028 Returns:
1029 list: Embedded error with fault and without for the last iteration in the step with a fault
1030 list: Global error with and without fault at the end of the run
1031 '''
1032 # perform one run with and one without faults
1033 stats = []
1034 controllers = []
1035 for faults in [True, False]:
1036 s, c, _ = self.single_run(
1037 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1038 )
1039 stats += [s]
1040 controllers += [c]
1042 # figure out when the fault happened
1043 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1045 # get embedded error
1046 e_em = [
1047 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1048 for stat in stats
1049 ]
1051 # compute the global error
1052 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1053 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1055 return [e_em[i][-1] for i in [0, 1]], e_glob
    def analyse_HotRod(self, mask):  # pragma: no cover
        '''
        Analyse a set of runs with Hot Rod

        Prints a table comparing extrapolated, embedded and global errors for every
        run selected by the mask. Columns marked with ``^*`` come from index 1 of the
        per-run lists, which `analyse_HotRod_single` fills from the fault-free
        repetition.

        Args:
            mask (Numpy.ndarray of shape (n)): The mask you want to know about

        Returns:
            None
        '''
        index = self.get_index(mask)
        dat = self.load()

        # make a header
        print(' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \
| e_glob ')
        print('-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\
+----------')
        for i in index:
            # rerun each selected experiment with and without the fault
            e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i))
            print(f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\
 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \
{e_glob[1]:.2e} | {e_glob[0]:.2e}')

        # remind the reader of the restart criterion Hot Rod uses
        tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][
            'HotRod_tol'
        ]
        print(f'We only restart when diff > tol = {tol:.2e}!')
        return None
1087 def analyse_HotRod_single(self, run): # pragma: no cover
1088 '''
1089 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod
1091 Args:
1092 run (int): The run you want to know about
1094 Returns:
1095 list: Embedded error with fault and without for the last iteration in the step with a fault
1096 list: Extrapolation error with fault and without for the last iteration in the step with a fault
1097 list: Global error with and without fault at the end of the run
1098 '''
1099 # perform one run with and one without faults
1100 stats = []
1101 controllers = []
1102 for faults in [True, False]:
1103 s, c, _ = self.single_run(
1104 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1105 )
1106 stats += [s]
1107 controllers += [c]
1109 # figure out when the fault happened
1110 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1112 # get embedded error
1113 e_em = [
1114 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1115 for stat in stats
1116 ]
1117 # get extrapolated error
1118 e_ex = [
1119 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')]
1120 for stat in stats
1121 ]
1123 # compute the global error
1124 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1125 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1127 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob
1129 def print_faults(self, mask=None, strategy=None): # pragma: no cover
1130 '''
1131 Print all faults that happened within a certain mask
1133 Args:
1134 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1135 strategy (Strategy): The resilience strategy you want to see the error for
1137 Returns:
1138 None
1139 '''
1140 strategy = BaseStrategy() if strategy is None else strategy
1142 index = self.get_index(mask)
1143 dat = self.load(strategy=strategy)
1145 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0]))
1147 # make a header
1148 print(' run | bit | node | iter | rank | rel err | space pos')
1149 print('-------+-----+------+------+------+---------+-----------')
1150 for i in index:
1151 print(
1152 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}'
1153 )
1155 return None
1157 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False):
1158 '''
1159 Make a mask to apply to stored data to filter out certain things
1161 Args:
1162 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all
1163 strategies so None is fine
1164 key (str): The key in the stored statistics that you want to filter for some value
1165 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key
1166 op (str): Operation that is applied for filtering
1167 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter
1168 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs
1170 Returns:
1171 Numpy.ndarray with boolean entries that can be used as a mask
1172 '''
1173 strategy = self.strategies[0] if strategy is None else strategy
1174 dat = self.load(strategy=strategy, faults=True)
1176 if compare_faults:
1177 if val is not None:
1178 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!')
1179 else:
1180 vals = self.load(strategy=strategy, faults=False)[key]
1181 val = sum(vals) / len(vals)
1183 if None in [key, val] and op not in ['isfinite']:
1184 mask = dat['bit'] == dat['bit']
1185 else:
1186 if op == 'uneq':
1187 mask = dat[key] != val
1188 elif op == 'eq':
1189 mask = dat[key] == val
1190 elif op == 'leq':
1191 mask = dat[key] <= val
1192 elif op == 'geq':
1193 mask = dat[key] >= val
1194 elif op == 'lt':
1195 mask = dat[key] < val
1196 elif op == 'gt':
1197 mask = dat[key] > val
1198 elif op == 'isfinite':
1199 mask = np.isfinite(dat[key])
1200 else:
1201 raise NotImplementedError(f'Please implement op={op}!')
1203 if old_mask is not None:
1204 return mask & old_mask
1205 else:
1206 return mask
1208 def get_fixable_faults_only(self, strategy):
1209 """
1210 Return a mask of only faults that can be fixed with a given strategy.
1212 Args:
1213 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1215 Returns:
1216 Numpy.ndarray with boolean entries that can be used as a mask
1217 """
1218 fixable = strategy.get_fixable_params(
1219 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter']
1220 )
1221 mask = self.get_mask(strategy=strategy)
1223 for kwargs in fixable:
1224 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask)
1226 return mask
1228 def get_index(self, mask=None):
1229 '''
1230 Get the indeces of all runs in mask
1232 Args:
1233 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1235 Returns:
1236 Numpy.ndarray: Array of indeces
1237 '''
1238 if mask is None:
1239 dat = self.load()
1240 return np.arange(len(dat['iteration']))
1241 else:
1242 return np.arange(len(mask))[mask]
1244 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover
1245 '''
1246 Get information about how many data points for faults we have given a particular mask
1248 Args:
1249 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1250 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1251 strategies, so you don't need to supply this argument
1252 print_all (bool): Whether to add information that is normally not useful to the table
1253 ax (Matplotlib.axes): Somewhere to plot the combinations histogram
1255 Returns:
1256 None
1257 '''
1259 # load some data from which to infer the number occurrences of some event
1260 strategy = self.strategies[0] if strategy is None else strategy
1261 dat = self.load(stratagy=strategy, faults=True)
1263 # make a dummy mask in case none is supplied
1264 if mask is None:
1265 mask = np.ones_like(dat['error'], dtype=bool)
1267 # print a header
1268 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries')
1269 print('-------------------------------------------------------------')
1271 # make a list of all keys that you want to look at
1272 keys = ['iteration', 'bit', 'node']
1273 if print_all:
1274 keys += ['problem_pos', 'level', 'target']
1276 # print the table
1277 for key in keys:
1278 counts, dev, unique = self.count_occurrences(dat[key][mask])
1279 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}')
1281 return None
1283 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover
1284 '''
1285 Make a histogram ouf of the occurrences of combinations
1287 Args:
1288 dat (dict): The data of the recorded statistics
1289 keys (list): The keys in dat that you want to know the combinations of
1290 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1292 Returns:
1293 Matplotlib.axes: The plot
1294 '''
1295 if ax is None:
1296 fig, ax = plt.subplots(1, 1)
1298 occurrences, bins = self.get_combination_histogram(dat, keys, mask)
1300 ax.bar(bins[:-1], occurrences)
1302 ax.set_xlabel('Occurrence of combinations')
1304 return ax
1306 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover
1307 '''
1308 Check how many combinations of values we expect and how many we find to see if we need to do more experiments.
1309 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied
1311 Args:
1312 dat (dict): The data of the recorded statistics
1313 keys (list): The keys in dat that you want to know the combinations of
1314 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1316 Returns:
1317 Numpy.ndarray: Number of occurrences of combinations
1318 Numpy.ndarray: Bins
1319 '''
1321 # load default values
1322 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat
1323 keys = ['iteration', 'bit', 'node'] if keys is None else keys
1324 if mask is None:
1325 mask = np.ones_like(dat['error'], dtype=bool)
1327 # get unique values and compute how many combinations you expect
1328 num_unique = [len(np.unique(dat[key][mask])) for key in keys]
1329 expected_number_of_combinations = np.prod(num_unique)
1331 # test what you actually get
1332 combination_counts = self.get_combination_counts(dat, keys, mask)
1334 # make a histogram with the result
1335 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1))
1336 occurrences[0] = expected_number_of_combinations - len(combination_counts)
1338 return occurrences, bins
1340 def get_max_combinations(self, dat=None, strategy=None):
1341 '''
1342 Count how many combinations of parameters for faults are possible
1344 Args:
1345 dat (dict): The recorded statistics
1346 keys (list): The keys in dat that you want to know the combinations of
1347 strategy (Strategy): The resilience strategy
1349 Returns:
1350 int: Number of possible combinations
1351 '''
1352 strategy = self.strategies[0] if strategy is None else strategy
1353 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True)
1354 faultHook = get_fault_injector_hook(controller)
1355 ranges = [
1356 (0, faultHook.rnd_params['level_number']),
1357 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1),
1358 (1, faultHook.rnd_params['iteration'] + 1),
1359 (0, faultHook.rnd_params['bit']),
1360 (0, faultHook.rnd_params['rank']),
1361 ]
1362 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']]
1363 return np.prod([me[1] - me[0] for me in ranges], dtype=int)
1365 def get_combination_counts(self, dat, keys, mask):
1366 '''
1367 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary
1368 numbers of keys
1370 Args:
1371 dat (dict): The data of the recorded statistics
1372 keys (list): The keys in dat that you want to know the combinations of
1373 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1375 Returns:
1376 list: Occurrences of all combinations
1377 '''
1378 key = keys[0]
1379 unique_vals = np.unique(dat[key][mask])
1380 res = []
1382 for i in range(len(unique_vals)):
1383 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask)
1384 if len(keys) > 1:
1385 res += self.get_combination_counts(dat, keys[1:], inner_mask)
1386 else:
1387 res += [self.count_occurrences(dat[key][inner_mask])[0]]
1388 return res
1390 def count_occurrences(self, vals):
1391 '''
1392 Count the occurrences of unique values in vals and compute average deviation from mean
1394 Args:
1395 vals (list): Values you want to check
1397 Returns:
1398 float: Mean of number of occurrences of unique values in vals
1399 float: Average deviation from mean number of occurrences
1400 int: Number of unique entries
1401 '''
1402 unique_vals, counts = np.unique(vals, return_counts=True)
1404 if len(counts) > 0:
1405 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts)
1406 else:
1407 return None, None, 0
1409 def bar_plot_thing(
1410 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None
1411 ): # pragma: no cover
1412 '''
1413 Make a bar plot about something!
1415 Args:
1416 x (Numpy.ndarray of dimension 1): x values for bar plot
1417 thing (str): Some key stored in the stats that will go on the y-axis
1418 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting
1419 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
1420 faults (bool): Whether to load stats with faults or without
1421 name (str): Optional name for the plot
1422 op (function): Operation that is applied to thing before plotting default is recovery rate
1423 args (dict): Parameters for how the plot should look
1425 Returns:
1426 None
1427 '''
1428 if ax is None:
1429 fig, ax = plt.subplots(1, 1)
1430 store = True
1431 op = self.mean if op is None else op
1433 # get the values for the bars
1434 height = np.zeros(len(self.strategies))
1435 for strategy_idx in range(len(self.strategies)):
1436 strategy = self.strategies[strategy_idx]
1438 # load the values
1439 dat = self.load(strategy=strategy, faults=faults)
1440 no_faults = self.load(strategy=strategy, faults=False)
1442 # check if we have a mask
1443 if mask is None:
1444 mask = np.ones_like(dat[thing], dtype=bool)
1446 height[strategy_idx] = op(dat, no_faults, thing, mask)
1448 # get some x values
1449 x = np.arange(len(self.strategies)) if x is None else x
1451 # prepare labels
1452 ticks = [strategy.bar_plot_x_label for strategy in self.strategies]
1454 ax.bar(x, height, tick_label=ticks)
1456 # set the parameters
1457 ax.set_ylabel(thing)
1458 args = {} if args is None else args
1459 [plt.setp(ax, k, v) for k, v in args.items()]
1461 if store:
1462 fig.tight_layout()
1463 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True)
1464 plt.close(fig)
1466 return None
1468 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover
1469 func_args = locals()
1470 func_args.pop('self', None)
1471 if strategy is None:
1472 for strat in self.strategies:
1473 args = {**func_args, 'strategy': strat}
1474 self.fault_frequency_plot(**args)
1475 return None
1477 # load data
1478 all_data = {}
1479 for me in kwargs_range['fault_frequency_iter']:
1480 self.kwargs['fault_frequency_iter'] = me
1481 self.get_recovered()
1482 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular')
1484 # get_recovery_rate
1485 results = {}
1486 results['frequencies'] = list(all_data.keys())
1487 results['recovery_rate'] = [
1488 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered'])
1489 for key in all_data.keys()
1490 ]
1491 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()]
1492 results['iterations'] = [
1493 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys()
1494 ]
1496 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style)
1497 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'})
1499 ax.set_xscale('log')
1500 ax.set_xlabel('iterations between fault')
1501 ax.set_ylabel('recovery rate')
1502 iter_ax.set_ylabel('average total iterations if not crashed (dashed)')
1503 ax.legend(frameon=False)
1505 return None
1507 def get_HR_tol(self, verbose=False):
1508 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
1510 HR_strategy = HotRodStrategy(useMPI=self.use_MPI)
1512 description = HR_strategy.get_custom_description_for_faults(self.prob, self.num_procs)
1513 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2
1515 stats, _, _ = self.single_run(HR_strategy, force_params=description)
1517 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate')
1518 diff = []
1519 for i in range(len(e_extrapolation)):
1520 if e_extrapolation[i][1] is not None:
1521 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0])
1522 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])]
1524 max_diff = max(diff)
1525 proposed_tol = 2 * max_diff
1526 if verbose:
1527 print(
1528 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.'
1529 )
1530 else:
1531 print(f'{proposed_tol:.6e}')
def check_local_error():  # pragma: no cover
    """
    Make a plot of the resolution over time for the selected problems.

    NOTE: the original code built a list of four problems and immediately
    overwrote it with only `run_quench`; the dead assignment was removed.
    """
    problems = [run_quench]
    strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]

    for problem in problems:
        stats_analyser = FaultStats(
            prob=problem,
            strategies=strategies,
            faults=[False],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problem, 0),
            num_procs=1,
            mode='random',
        )
        stats_analyser.compare_strategies()
    plt.show()
def parse_args():
    '''
    Parse command line arguments into keyword arguments for `FaultStats`.

    Returns:
        dict: Parsed keyword arguments
    '''
    import sys

    kwargs = {}

    for idx, arg in enumerate(sys.argv):
        # keys are matched by substring; the value is the following argument
        if 'prob' in arg:
            prob_name = sys.argv[idx + 1]
            if prob_name == 'run_Lorenz':
                kwargs['prob'] = run_Lorenz
            elif prob_name == 'run_vdp':
                kwargs['prob'] = run_vdp
            elif prob_name == 'run_Schroedinger':
                kwargs['prob'] = run_Schroedinger
            elif prob_name == 'run_quench':
                kwargs['prob'] = run_quench
            elif prob_name == 'run_AC':
                kwargs['prob'] = run_AC
            elif prob_name == 'run_RBC':
                kwargs['prob'] = run_RBC
            elif prob_name == 'run_GS':
                kwargs['prob'] = run_GS
            else:
                raise NotImplementedError
        elif 'num_procs' in arg:
            kwargs['num_procs'] = int(sys.argv[idx + 1])
        elif 'runs' in arg:
            kwargs['runs'] = int(sys.argv[idx + 1])
        elif 'mode' in arg:
            kwargs['mode'] = str(sys.argv[idx + 1])
        elif 'reload' in arg:
            # anything other than the literal string 'False' counts as True
            kwargs['reload'] = sys.argv[idx + 1] != 'False'

    return kwargs
def compare_adaptivity_modes():
    """
    Compare the recovery rate of adaptivity with different restart modes.

    For each problem, plots the recovery rate per bit for a single-process run and
    for 4-process runs with `AdaptivityStrategy` and `AdaptivityRestartFirstStep`,
    restricted to faults that are fixable in principle.
    """
    from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep

    fig, axs = plt.subplots(2, 2, sharey=True)
    problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
    titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']

    for i in range(len(problems)):
        ax = axs.flatten()[i]

        ax.set_title(titles[i])

        # single-process reference with plain adaptivity
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.1,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=1,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            # only show faults that are fixable in principle
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate', 'label': 'bla'},
                name='fixable_recovery',
                ax=ax,
            )
        # restyle the line that was just added so it stands out as the reference
        single_proc = ax.lines[-1]
        single_proc.set_label('single process')
        single_proc.set_color('black')

        # 4-process comparison of the two restart modes (note the larger threshold)
        stats_analyser = FaultStats(
            prob=problems[i],
            strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
            faults=[False, True],
            reload=True,
            recovery_thresh=1.5,
            recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
            num_procs=4,
            mode='default',
            stats_path='data/stats-jusuf',
        )
        stats_analyser.get_recovered()

        for strategy in stats_analyser.strategies:
            # only show faults that are fixable in principle
            fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
            stats_analyser.plot_things_per_things(
                'recovered',
                'bit',
                False,
                strategies=[strategy],
                op=stats_analyser.rec_rate,
                mask=fixable,
                args={'ylabel': 'recovery rate'},
                name='fixable_recovery',
                ax=ax,
            )
    fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
    fig.tight_layout()
    plt.show()
def main():
    """
    Generate fault statistics for the problem selected on the command line and
    produce a standard set of analysis plots (recovery rates, iteration counts,
    errors, and recovery thresholds).
    """
    # default configuration; command line arguments take precedence
    kwargs = {
        'prob': run_vdp,
        'num_procs': 1,
        'mode': 'default',
        'runs': 4000,
        'reload': True,
        **parse_args(),
    }

    strategy_args = {
        'stop_at_nan': True,
    }

    from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy

    stats_analyser = FaultStats(
        strategies=[
            BaseStrategy(**strategy_args),
            AdaptivityStrategy(**strategy_args),
            kAdaptivityStrategy(**strategy_args),
            HotRodStrategy(**strategy_args),
            AdaptivityPolynomialError(**strategy_args),
        ],
        faults=[False, True],
        recovery_thresh=1.1,
        recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
        stats_path='data/stats-jusuf',
        **kwargs,
    )
    stats_analyser.get_recovered()

    # generate (or extend) the fault statistics; may run in parallel via MPI
    stats_analyser.run_stats_generation(runs=kwargs['runs'], step=8)

    if MPI.COMM_WORLD.rank > 0:  # make sure only one rank accesses the data
        return None

    stats_analyser.get_recovered()
    mask = None

    # recovery rate resolved by node, iteration and bit
    # stats_analyser.compare_strategies()
    stats_analyser.plot_things_per_things(
        'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )
    stats_analyser.plot_things_per_things(
        'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    stats_analyser.plot_things_per_things(
        'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
    )

    # make a plot for only the faults that can be recovered
    fig, ax = plt.subplots(1, 1)
    for strategy in stats_analyser.strategies:
        fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
        stats_analyser.plot_things_per_things(
            'recovered',
            'bit',
            False,
            strategies=[strategy],
            op=stats_analyser.rec_rate,
            mask=fixable,
            args={'ylabel': 'recovery rate'},
            name='fixable_recovery',
            ax=ax,
        )
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)

    # recovery rate as a function of the acceptance threshold
    fig, ax = plt.subplots(1, 1, figsize=(13, 4))
    stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
    # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
    ax.set_xscale('log')
    ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)

    # iteration counts and errors per bit
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.mean,
        mask=mask,
        args={'yscale': 'log', 'ylabel': 'total iterations'},
    )
    stats_analyser.plot_things_per_things(
        'total_iteration',
        'bit',
        True,
        op=stats_analyser.extra_mean,
        mask=mask,
        args={'yscale': 'linear', 'ylabel': 'extra iterations'},
        name='extra_iter',
    )
    stats_analyser.plot_things_per_things(
        'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
    )

    stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
    plt.show()
if __name__ == "__main__":
    # entry point: generate and analyse fault statistics for the selected problem
    main()
    # compare_adaptivity_modes()
    # check_local_error()