Coverage for pySDC/projects/Resilience/fault_stats.py: 56%
451 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 07:07 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 07:07 +0000
1import numpy as np
2import matplotlib.pyplot as plt
3from mpi4py import MPI
4import pickle
6import pySDC.helpers.plot_helper as plot_helper
7from pySDC.helpers.stats_helper import get_sorted
9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData
10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook
11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep
14from pySDC.implementations.hooks.log_work import LogWork
16# these problems are available for testing
17from pySDC.projects.Resilience.advection import run_advection
18from pySDC.projects.Resilience.vdp import run_vdp
19from pySDC.projects.Resilience.piline import run_piline
20from pySDC.projects.Resilience.Lorenz import run_Lorenz
21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger
22from pySDC.projects.Resilience.quench import run_quench
23from pySDC.projects.Resilience.AC import run_AC
24from pySDC.projects.Resilience.RBC import run_RBC
25from pySDC.projects.Resilience.GS import run_GS
27from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy
28import logging
30plot_helper.setup_mpl(reset=True)
32LOGGER_LEVEL = 40
34RECOVERY_THRESH_ABS = {
35 # run_quench: 5e-3,
36 # run_Schroedinger: 1.7e-6,
37}
40class FaultStats:
41 '''
42 Class to generate and analyse fault statistics
43 '''
45 def __init__(
46 self,
47 prob=None,
48 strategies=None,
49 faults=None,
50 reload=True,
51 recovery_thresh=1 + 1e-3,
52 recovery_thresh_abs=0.0,
53 num_procs=1,
54 mode='default',
55 stats_path='data/stats',
56 use_MPI=False,
57 **kwargs,
58 ):
59 '''
60 Initialization routine
62 Args:
63 prob: A function that runs a pySDC problem, see imports for available problems
64 strategies (list): List of resilience strategies
65 faults (list): List of booleans that describe whether to use faults or not
66 reload (bool): Load previously computed statistics and continue from there or start from scratch
67 recovery_thresh (float): Relative threshold for recovery
68 num_procs (int): Number of processes
69 mode (str): Mode for fault generation: Either 'random' or 'combination'
70 '''
71 self.prob = prob
72 self.strategies = [None] if strategies is None else strategies
73 self.faults = [False, True] if faults is None else faults
74 self.reload = reload
75 self.recovery_thresh = recovery_thresh
76 self.recovery_thresh_abs = recovery_thresh_abs
77 self.num_procs = num_procs
78 self.use_MPI = use_MPI
79 self.stats_path = stats_path
80 self.kwargs = {
81 'fault_frequency_iter': 500,
82 **kwargs,
83 }
85 # decide mode
86 if mode == 'default':
87 if prob.__name__ in ['run_vdp', 'run_Lorenz']:
88 self.mode = 'combination'
89 else:
90 self.mode = 'random'
91 else:
92 self.mode = mode
94 self.logger = logging.getLogger('FaultStats')
95 self.logger.level = LOGGER_LEVEL
97 msg = 'Starting FaultStats with attributes'
98 for key, val in self.__dict__.items():
99 if key in ['logger']:
100 continue
101 item = [str(me) for me in val] if type(val) == list else str(val)
102 msg += '\n\t' f'{key}: {item}'
103 self.logger.log(25, msg)
105 def get_Tend(self):
106 '''
107 Get the final time of runs for fault stats based on the problem
109 Returns:
110 float: Tend to put into the run
111 '''
112 return self.strategies[0].get_Tend(self.prob, self.num_procs, resilience_experiment=True)
114 def run_stats_generation(
115 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
116 ):
117 '''
118 Run the generation of stats for all strategies in the `self.strategies` variable
120 Args:
121 runs (int): Number of runs you want to do
122 step (int): Number of runs you want to do between saving
123 comm (MPI.Communicator): Communicator for distributing runs
124 faults (bool): Whether to do stats with faults or without
125 kw_args_range (dict): Range for the parameters
126 _reload, _runs_partial: Variables only used for recursion. Do not change!
128 Returns:
129 None
130 '''
131 if faults is None:
132 for f in self.faults:
133 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
134 return None
136 for key, val in kwargs_range.items() if kwargs_range is not None else {}:
137 if type(val) == int:
138 self.kwargs[key] = val
139 else:
140 for me in val:
141 kwargs_range_me = {**kwargs_range, key: me}
142 self.run_stats_generation(
143 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
144 )
145 return None
147 comm = MPI.COMM_WORLD if comm is None else comm
148 step = (runs if comm.size == 1 else comm.size) if step is None else step
149 _runs_partial = step if _runs_partial == 0 else _runs_partial
150 reload = self.reload or _reload
152 # see if we limit the number of runs we want to do
153 max_runs = (
154 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
155 )
157 if reload:
158 # sort the strategies to do some load balancing
159 sorting_index = None
160 if comm.rank == 0:
161 already_completed = np.array(
162 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
163 )
164 sorting_index_ = np.argsort(already_completed)
165 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
166 _runs_partial = max(min(already_completed), _runs_partial)
168 # tell all ranks what strategies to use
169 sorting_index = comm.bcast(sorting_index, root=0)
170 _runs_partial = comm.bcast(_runs_partial, root=0)
171 strategies = [self.strategies[i] for i in sorting_index]
172 if len(strategies) == 0: # check if we are already done
173 return None
174 else:
175 strategies = self.strategies
177 strategy_comm = comm.Split(comm.rank % len(strategies))
179 for j in range(0, len(strategies), comm.size):
180 self.generate_stats(
181 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
182 runs=min(_runs_partial, max_runs),
183 faults=faults,
184 reload=reload,
185 comm=strategy_comm,
186 )
187 strategy_comm.Free()
188 self.run_stats_generation(
189 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
190 )
192 return None
194 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
195 '''
196 Generate statistics for recovery from bit flips
197 -----------------------------------------------
199 Every run is given a different random seed such that we have different faults and the results are then stored
201 Args:
202 strategy (Strategy): Resilience strategy
203 runs (int): Number of runs you want to do
204 reload (bool): Load previously computed statistics and continue from there or start from scratch
205 faults (bool): Whether to do stats with faults or without
206 comm (MPI.Communicator): Communicator for distributing runs
208 Returns:
209 None
210 '''
211 comm = MPI.COMM_WORLD if comm is None else comm
213 # initialize dictionary to store the stats in
214 dat = {
215 'level': np.zeros(runs),
216 'iteration': np.zeros(runs),
217 'node': np.zeros(runs),
218 'problem_pos': [],
219 'bit': np.zeros(runs),
220 'error': np.zeros(runs),
221 'total_iteration': np.zeros(runs),
222 'total_newton_iteration': np.zeros(runs),
223 'restarts': np.zeros(runs),
224 'target': np.zeros(runs),
225 'rank': np.zeros(runs),
226 }
228 # store arguments for storing and loading
229 identifier_args = {
230 'faults': faults,
231 'strategy': strategy,
232 }
234 # reload previously recorded stats and write them to dat
235 if reload:
236 already_completed_ = None
237 if comm.rank == 0:
238 already_completed_ = self.load(**identifier_args)
239 already_completed = comm.bcast(already_completed_, root=0)
240 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
241 avail_len = min([already_completed['runs'], runs])
242 for k in dat.keys():
243 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
244 else:
245 already_completed = {'runs': 0}
247 # prepare a message
248 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
249 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
250 if comm.rank == 0 and already_completed['runs'] < runs:
251 print(msg, flush=True)
253 space_comm = MPI.COMM_SELF.Split(True)
255 # perform the remaining experiments
256 for i in range(already_completed['runs'], runs):
257 if i % comm.size != comm.rank:
258 continue
260 # perform a single experiment with the correct random seed
261 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)
263 # get the data from the stats
264 faults_run = get_sorted(stats, type='bitflip')
266 if faults:
267 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
268 dat['level'][i] = faults_run[0][1][0]
269 dat['iteration'][i] = faults_run[0][1][1]
270 dat['node'][i] = faults_run[0][1][2]
271 dat['problem_pos'] += [faults_run[0][1][3]]
272 dat['bit'][i] = faults_run[0][1][4]
273 dat['target'][i] = faults_run[0][1][5]
274 dat['rank'][i] = faults_run[0][1][6]
276 if crash:
277 print('Code crashed!')
278 dat['error'][i] = np.inf
279 continue
281 # record the rest of the data
282 t, u = get_sorted(stats, type='u', recomputed=False)[-1]
283 error = self.get_error(u, t, controller, strategy, self.get_Tend())
284 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
285 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])
287 dat['error'][i] = error
288 dat['total_iteration'][i] = total_iteration
289 dat['total_newton_iteration'][i] = total_newton_iteration
290 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])
292 # free the space communicator
293 space_comm.Free()
295 dat_full = {}
296 for k in dat.keys():
297 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)
299 # store the completed stats
300 dat_full['runs'] = runs
302 if already_completed['runs'] < runs:
303 if comm.rank == 0:
304 self.store(dat_full, **identifier_args)
305 if self.faults:
306 self.get_recovered(strategy=strategy)
308 return None
310 def get_error(self, u, t, controller, strategy, Tend):
311 """
312 Compute the error.
314 Args:
315 u (dtype_u): The solution at the end of the run
316 t (float): Time at which `u` was recorded
317 controller (pySDC.controller.controller): The controller
318 strategy (Strategy): The resilience strategy
319 Tend (float): Time you want to run to
321 Returns:
322 float: Error
323 """
324 lvl = controller.MS[0].levels[0]
326 # decide if we reached Tend
327 Tend_reached = t + lvl.params.dt_initial >= Tend
329 if Tend_reached:
330 u_star = lvl.prob.u_exact(t=t)
331 return abs(u - u_star) / abs(u_star)
332 else:
333 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}')
334 return np.inf
336 def single_run(
337 self,
338 strategy,
339 run=0,
340 faults=False,
341 force_params=None,
342 hook_class=None,
343 space_comm=None,
344 Tend=None,
345 time_comm=None,
346 ):
347 '''
348 Run the problem once with the specified parameters
350 Args:
351 strategy (Strategy): The resilience strategy you plan on using
352 run (int): Index for fault generation
353 faults (bool): Whether or not to put faults in
354 force_params (dict): Change parameters in the description of the problem
355 space_comm (MPI.Communicator): A communicator for space parallelisation
356 Tend (float): Time you want to run to
357 time_comm (MPI.Intracomm.Communicator): Communicator in time
359 Returns:
360 dict: Stats object containing statistics for each step, each level and each iteration
361 pySDC.Controller: The controller of the run
362 float: The time the problem should have run to
363 '''
364 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class)
365 force_params = {} if force_params is None else force_params
367 # build the custom description
368 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs)
369 for k in force_params.keys():
370 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]}
372 custom_controller_params = {
373 'logger_level': LOGGER_LEVEL,
374 **force_params.get('controller_params', {}),
375 }
377 if faults:
378 fault_stuff = {
379 'rng': None,
380 'args': strategy.get_fault_args(self.prob, self.num_procs),
381 'rnd_params': strategy.get_random_params(self.prob, self.num_procs),
382 }
384 # make parameters for faults:
385 if self.mode == 'random':
386 fault_stuff['rng'] = np.random.RandomState(run)
387 elif self.mode == 'combination':
388 fault_stuff['rng'] = run
389 elif self.mode == 'regular':
390 fault_stuff['rng'] = np.random.RandomState(run)
391 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter']
392 fault_stuff['rnd_params'] = {
393 'bit': 12,
394 'min_node': 1,
395 }
396 else:
397 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}')
399 else:
400 fault_stuff = None
402 return self.prob(
403 custom_description=custom_description,
404 num_procs=self.num_procs,
405 hook_class=hook_class,
406 fault_stuff=fault_stuff,
407 Tend=self.get_Tend() if Tend is None else Tend,
408 custom_controller_params=custom_controller_params,
409 space_comm=space_comm,
410 comm=time_comm,
411 use_MPI=time_comm is not None,
412 )
414 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover
415 '''
416 Take a closer look at how the strategies compare for a specific run
418 Args:
419 run (int): The number of the run to get the appropriate random generator
420 faults (bool): Whether or not to include faults
421 ax (Matplotlib.axes): Somewhere to plot
423 Returns:
424 None
425 '''
426 if ax is None:
427 fig, ax = plt.subplots(1, 1)
428 store = True
429 else:
430 store = False
432 k_ax = ax.twinx()
433 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies]
434 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))]
436 # make a legend
437 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies]
438 k_ax.legend(frameon=True)
440 if store:
441 fig.tight_layout()
442 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True)
444 def scrutinize_visual(
445 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
446 ): # pragma: no cover
447 '''
448 Take a closer look at a specific run with a plot
450 Args:
451 strategy (Strategy): The resilience strategy you plan on using
452 run (int): The number of the run to get the appropriate random generator
453 faults (bool): Whether or not to include faults
454 ax (Matplotlib.axes): Somewhere to plot the error
455 k_ax (Matplotlib.axes): Somewhere to plot the iterations
456 plot_restarts (bool): Make vertical lines wherever restarts happened
458 Returns:
459 dict: Stats from the run
460 '''
461 if ax is None:
462 fig, ax = plt.subplots(1, 1)
463 store = True
464 else:
465 store = False
467 force_params = {}
469 stats, controller, Tend = self.single_run(
470 strategy=strategy,
471 run=run,
472 faults=faults,
473 force_params=force_params,
474 hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
475 )
477 # plot the local error
478 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
479 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)
481 # plot the iterations
482 k_ax = ax.twinx() if k_ax is None else k_ax
483 k = get_sorted(stats, type='k')
484 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')
486 # plot the faults
487 faults = get_sorted(stats, type='bitflip')
488 for fault_time in [me[0] for me in faults]:
489 ax.axvline(fault_time, color='grey', ls=':')
491 # plot restarts
492 if plot_restarts:
493 restarts = get_sorted(stats, type='restart')
494 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]
496 # decorate
497 ax.set_yscale('log')
498 ax.set_ylabel(r'$\epsilon$')
499 k_ax.set_ylabel('cumulative iterations (dashed)')
500 ax.set_xlabel(r'$t$')
502 if store:
503 fig.tight_layout()
504 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)
506 def scrutinize(self, strategy, logger_level=15, **kwargs):
507 '''
508 Take a closer look at a specific run
510 Args:
511 strategy (Strategy): The resilience strategy you plan on using
513 Returns:
514 None
515 '''
516 from pySDC.projects.Resilience.fault_injection import FaultInjector
518 force_params = {}
519 force_params['controller_params'] = {'logger_level': logger_level}
520 force_params['sweeper_params'] = {'skip_residual_computation': ()}
522 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)
524 u_all = get_sorted(stats, type='u', recomputed=False)
525 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
526 k = [me[1] for me in get_sorted(stats, type='k')]
528 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]
530 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []
532 print(f'\nOverview for {strategy.name} strategy')
533 # print faults
534 faults = get_sorted(stats, type='bitflip')
535 print('\nfaults:')
536 print(' t | level | iter | node | bit | trgt | rank | pos')
537 print('--------+-------+------+------+-----+------+---------------------')
538 for f in faults:
539 print(
540 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
541 f[1][3],
542 )
543 print('--------+-------+------+------+-----+------+---------------------')
544 for f in unhappened_faults:
545 print(
546 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
547 f.problem_pos,
548 )
550 # see if we can determine if the faults where recovered
551 no_faults = self.load(strategy=strategy, faults=False)
552 e_star = np.mean(no_faults.get('error', [0]))
553 error = self.get_error(u, t, controller, strategy, Tend)
554 recovery_thresh = self.get_thresh(strategy)
556 print(
557 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
558 )
559 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')
561 _newton_iter = get_sorted(stats, type='work_newton')
562 if len(_newton_iter) > 0:
563 newton_iter = [me[1] for me in _newton_iter]
564 print(
565 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
566 )
568 # checkout the step size
569 dt = [me[1] for me in get_sorted(stats, type='dt')]
570 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')
572 # restarts
573 restarts = [me[1] for me in get_sorted(stats, type='restart')]
574 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')
576 return stats
578 def convert_faults(self, faults):
579 '''
580 Make arrays of useable data from an entry in the stats object returned by pySDC
582 Args:
583 faults (list): The entry for faults returned by the pySDC run
585 Returns:
586 list: The times when the faults happened
587 list: The levels in which the faults happened
588 list: The iterations in which the faults happened
589 list: The nodes in which the faults happened
590 list: The problem positions in which the faults happened
591 list: The bits in which the faults happened
592 '''
593 time = [faults[i][0] for i in range(len(faults))]
594 level = [faults[i][1][0] for i in range(len(faults))]
595 iteration = [faults[i][1][1] for i in range(len(faults))]
596 node = [faults[i][1][2] for i in range(len(faults))]
597 problem_pos = [faults[i][1][3] for i in range(len(faults))]
598 bit = [faults[i][1][4] for i in range(len(faults))]
599 return time, level, iteration, node, problem_pos, bit
601 def get_path(self, **kwargs):
602 '''
603 Get the path to where the stats are stored
605 Args:
606 strategy (Strategy): The resilience strategy
607 faults (bool): Whether or not faults have been activated
609 Returns:
610 str: The path to what you are looking for
611 '''
612 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle'
614 def get_name(self, strategy=None, faults=True, mode=None):
615 '''
616 Function to get a unique name for a kind of statistics based on the problem and strategy that was used
618 Args:
619 strategy (Strategy): Resilience strategy
620 faults (bool): Whether or not faults where inserted
622 Returns:
623 str: The unique identifier
624 '''
625 if self.prob == run_advection:
626 prob_name = 'advection'
627 elif self.prob == run_vdp:
628 prob_name = 'vdp'
629 elif self.prob == run_piline:
630 prob_name = 'piline'
631 elif self.prob == run_Lorenz:
632 prob_name = 'Lorenz'
633 elif self.prob == run_Schroedinger:
634 prob_name = 'Schroedinger'
635 elif self.prob == run_quench:
636 prob_name = 'Quench'
637 elif self.prob.__name__ == 'run_AC':
638 prob_name = 'Allen-Cahn'
639 elif self.prob.__name__ == 'run_RBC':
640 prob_name = 'Rayleigh-Benard'
641 elif self.prob.__name__ == 'run_GS':
642 prob_name = 'Gray-Scott'
643 else:
644 raise NotImplementedError(f'Name not implemented for problem {self.prob}')
646 if faults:
647 fault_name = '-faults'
648 else:
649 fault_name = ''
651 if strategy is not None:
652 strategy_name = f'-{strategy.name}'
653 else:
654 strategy_name = ''
656 mode = self.mode if mode is None else mode
657 if mode == 'regular':
658 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}'
659 else:
660 mode_thing = ''
662 mpi_thing = '-MPI' if self.use_MPI else ''
663 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}'
665 def store(self, dat, **kwargs):
666 '''
667 Stores the data for a run at a predefined path
669 Args:
670 dat (dict): The data of the recorded statistics
672 Returns:
673 None
674 '''
675 with open(self.get_path(**kwargs), 'wb') as f:
676 pickle.dump(dat, f)
677 return None
679 def load(self, **kwargs):
680 '''
681 Loads the stats belonging to a specific strategy and whether or not faults where inserted.
682 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs,
683 which is 0 of course.
685 Returns:
686 dict: Data from previous run or if it is not available a placeholder dictionary
687 '''
688 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)])
690 try:
691 with open(self.get_path(**kwargs), 'rb') as f:
692 dat = pickle.load(f)
693 except FileNotFoundError:
694 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!')
695 return {'runs': 0}
696 return dat
698 def get_thresh(self, strategy=None):
699 """
700 Get recovery threshold based on relative and absolute tolerances
702 Args:
703 strategy (Strategy): The resilience strategy
704 """
705 fault_free = self.load(strategy=strategy, faults=False)
706 assert (
707 fault_free['error'].std() / fault_free['error'].mean() < 1e-5
708 ), f'Got too large variation of errors in {strategy} strategy!'
709 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()])
711 def get_recovered(self, **kwargs):
712 '''
713 Determine the recovery rate for a specific strategy and store it to disk.
715 Returns:
716 None
717 '''
718 if 'strategy' not in kwargs.keys():
719 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies]
720 else:
721 try:
722 with_faults = self.load(faults=True, **kwargs)
723 with_faults['recovered'] = with_faults['error'] <= self.get_thresh(kwargs['strategy'])
724 self.store(faults=True, dat=with_faults, **kwargs)
725 except KeyError as error:
726 print(
727 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}'
728 )
730 return None
732 def crash_rate(self, dat, no_faults, thingA, mask):
733 '''
734 Determine the rate of runs that crashed
736 Args:
737 dat (dict): The data of the recorded statistics with faults
738 no_faults (dict): The data of the corresponding fault-free stats
739 thingA (str): Some key stored in the stats that will go on the y-axis
740 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate
742 Returns:
743 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask
744 '''
745 if len(dat[thingA][mask]) > 0:
746 crash = dat['error'] == np.inf
747 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask])
748 else:
749 return None
751 def rec_rate(self, dat, no_faults, thingA, mask):
752 '''
753 Operation for plotting which returns the recovery rate for a given mask.
754 Which thingA you apply this to actually does not matter here since we compute a rate.
756 Args:
757 dat (dict): The recorded statistics
758 no_faults (dict): The corresponding fault-free stats
759 thingA (str): Some key stored in the stats
760 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
762 Returns:
763 float: Recovery rate
764 '''
765 if len(dat[thingA][mask]) > 0:
766 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask])
767 else:
768 return None
770 def mean(self, dat, no_faults, thingA, mask):
771 '''
772 Operation for plotting which returns the mean of thingA after applying the mask
774 Args:
775 dat (dict): The recorded statistics
776 no_faults (dict): The corresponding fault-free stats
777 thingA (str): Some key stored in the stats
778 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
780 Returns:
781 float: Mean of thingA after applying mask
782 '''
783 return np.mean(dat[thingA][mask])
785 def extra_mean(self, dat, no_faults, thingA, mask):
786 '''
787 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after
788 applying the mask
790 Args:
791 dat (dict): The recorded statistics
792 no_faults (dict): The corresponding fault-free stats
793 thingA (str): Some key stored in the stats
794 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
796 Returns:
797 float: Difference in mean of thingA between runs with and without faults after applying mask
798 '''
799 if True in mask or int in [type(me) for me in mask]:
800 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA])
801 else:
802 return None
804 def plot_thingA_per_thingB(
805 self,
806 strategy,
807 thingA,
808 thingB,
809 ax=None,
810 mask=None,
811 recovered=False,
812 op=None,
813 **kwargs,
814 ): # pragma: no cover
815 '''
816 Plot thingA vs. thingB for a single strategy
818 Args:
819 strategy (Strategy): The resilience strategy you want to plot
820 thingA (str): Some key stored in the stats that will go on the y-axis
821 thingB (str): Some key stored in the stats that will go on the x-axis
822 ax (Matplotlib.axes): Somewhere to plot
823 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
824 recovered (bool): Show the plot for both all runs and only the recovered ones
825 op (function): Operation that is applied to thingA before plotting default is recovery rate
827 Returns:
828 None
829 '''
830 op = self.rec_rate if op is None else op
831 dat = self.load(strategy=strategy, faults=True)
832 no_faults = self.load(strategy=strategy, faults=False)
834 if mask is None:
835 mask = np.ones_like(dat[thingB], dtype=bool)
837 admissable_thingB = np.unique(dat[thingB][mask])
838 me = np.zeros(len(admissable_thingB))
839 me_recovered = np.zeros_like(me)
841 for i in range(len(me)):
842 _mask = np.logical_and((dat[thingB] == admissable_thingB[i]), mask)
843 if _mask.any():
844 me[i] = op(dat, no_faults, thingA, _mask)
845 me_recovered[i] = op(dat, no_faults, thingA, np.logical_and(_mask, dat['recovered']))
847 if recovered:
848 ax.plot(
849 admissable_thingB,
850 me_recovered,
851 label=f'{strategy.label} (only recovered)',
852 color=strategy.color,
853 marker=strategy.marker,
854 ls='--',
855 linewidth=3,
856 **kwargs,
857 )
859 ax.plot(
860 admissable_thingB,
861 me,
862 label=f'{strategy.label}',
863 color=strategy.color,
864 marker=strategy.marker,
865 linewidth=2,
866 **kwargs,
867 )
869 ax.legend(frameon=False)
870 ax.set_xlabel(thingB)
871 ax.set_ylabel(thingA)
872 return None
874 def plot_things_per_things(
875 self,
876 thingA='bit',
877 thingB='bit',
878 recovered=False,
879 mask=None,
880 op=None,
881 args=None,
882 strategies=None,
883 name=None,
884 store=False,
885 ax=None,
886 fig=None,
887 plotting_args=None,
888 ): # pragma: no cover
889 '''
890 Plot thingA vs thingB for multiple strategies
892 Args:
893 thingA (str): Some key stored in the stats that will go on the y-axis
894 thingB (str): Some key stored in the stats that will go on the x-axis
895 recovered (bool): Show the plot for both all runs and only the recovered ones
896 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
897 op (function): Operation that is applied to thingA before plotting default is recovery rate
898 args (dict): Parameters for how the plot should look
899 strategies (list): List of the strategies you want to plot, if None, all will be plotted
900 name (str): Optional name for the plot
901 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
902 ax (Matplotlib.axes): Somewhere to plot
903 fig (Matplotlib.figure): Figure of the ax
905 Returns
906 None
907 '''
908 strategies = self.strategies if strategies is None else strategies
909 args = {} if args is None else args
911 # make sure we have something to plot in
912 if ax is None:
913 fig, ax = plt.subplots(1, 1)
914 elif fig is None:
915 store = False
917 # execute the plots for all strategies
918 for s in strategies:
919 self.plot_thingA_per_thingB(
920 s,
921 thingA=thingA,
922 thingB=thingB,
923 recovered=recovered,
924 ax=ax,
925 mask=mask,
926 op=op,
927 **(plotting_args if plotting_args else {}),
928 )
930 # set the parameters
931 [plt.setp(ax, k, v) for k, v in args.items()]
933 if store:
934 fig.tight_layout()
935 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
936 plt.close(fig)
938 return None
940 def plot_recovery_thresholds(
941 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs
942 ): # pragma: no cover
943 '''
944 Plot the recovery rate for a range of thresholds
946 Args:
947 strategies (list): List of the strategies you want to plot, if None, all will be plotted
948 thresh_range (list): thresholds for deciding whether to accept as recovered
949 ax (Matplotlib.axes): Somewhere to plot
951 Returns:
952 None
953 '''
954 # fill default values if nothing is specified
955 strategies = self.strategies if strategies is None else strategies
957 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range
958 if ax is None:
959 fig, ax = plt.subplots(1, 1)
961 rec_rates = [[None] * len(thresh_range)] * len(strategies)
962 for strategy_idx in range(len(strategies)):
963 strategy = strategies[strategy_idx]
964 # load the stats
965 fault_free = self.load(strategy=strategy, faults=False)
966 with_faults = self.load(strategy=strategy, faults=True)
968 if recoverable_only:
969 recoverable_mask = self.get_fixable_faults_only(strategy)
970 else:
971 recoverable_mask = self.get_mask(strategy=strategy)
973 for thresh_idx in range(len(thresh_range)):
974 rec_mask = self.get_mask(
975 strategy=strategy,
976 key='error',
977 val=(thresh_range[thresh_idx] * fault_free['error'].mean()),
978 op='gt',
979 old_mask=recoverable_mask,
980 )
981 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len(
982 with_faults['error'][recoverable_mask]
983 )
985 ax.plot(
986 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs}
987 )
988 ax.legend(frameon=False)
989 ax.set_ylabel('recovery rate')
990 ax.set_xlabel('threshold as ratio to fault-free error')
992 return None
994 def analyse_adaptivity(self, mask): # pragma: no cover
995 '''
996 Analyse a set of runs with adaptivity
998 Args:
999 mask (Numpy.ndarray of shape (n)): The mask you want to know about
1001 Returns:
1002 None
1003 '''
1004 index = self.get_index(mask)
1005 dat = self.load()
1007 # make a header
1008 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ')
1009 print('-------+-----+------+------+----------+----------+----------+----------')
1010 for i in index:
1011 e_em, e_glob = self.analyse_adaptivity_single(int(i))
1012 print(
1013 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\
1014 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}'
1015 )
1017 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][
1018 Adaptivity
1019 ]['e_tol']
1020 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!')
1021 return None
1023 def analyse_adaptivity_single(self, run): # pragma: no cover
1024 '''
1025 Compute what the difference in embedded and global error are for a specific run with adaptivity
1027 Args:
1028 run (int): The run you want to know about
1030 Returns:
1031 list: Embedded error with fault and without for the last iteration in the step with a fault
1032 list: Global error with and without fault at the end of the run
1033 '''
1034 # perform one run with and one without faults
1035 stats = []
1036 controllers = []
1037 for faults in [True, False]:
1038 s, c, _ = self.single_run(
1039 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1040 )
1041 stats += [s]
1042 controllers += [c]
1044 # figure out when the fault happened
1045 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1047 # get embedded error
1048 e_em = [
1049 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1050 for stat in stats
1051 ]
1053 # compute the global error
1054 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1055 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1057 return [e_em[i][-1] for i in [0, 1]], e_glob
1059 def analyse_HotRod(self, mask): # pragma: no cover
1060 '''
1061 Analyse a set of runs with Hot Rod
1063 Args:
1064 mask (Numpy.ndarray of shape (n)): The mask you want to know about
1066 Returns:
1067 None
1068 '''
1069 index = self.get_index(mask)
1070 dat = self.load()
1072 # make a header
1073 print(
1074 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \
1075| e_glob '
1076 )
1077 print(
1078 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\
1079+----------'
1080 )
1081 for i in index:
1082 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i))
1083 print(
1084 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\
1085 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \
1086{e_glob[1]:.2e} | {e_glob[0]:.2e}'
1087 )
1089 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][
1090 'HotRod_tol'
1091 ]
1092 print(f'We only restart when diff > tol = {tol:.2e}!')
1093 return None
1095 def analyse_HotRod_single(self, run): # pragma: no cover
1096 '''
1097 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod
1099 Args:
1100 run (int): The run you want to know about
1102 Returns:
1103 list: Embedded error with fault and without for the last iteration in the step with a fault
1104 list: Extrapolation error with fault and without for the last iteration in the step with a fault
1105 list: Global error with and without fault at the end of the run
1106 '''
1107 # perform one run with and one without faults
1108 stats = []
1109 controllers = []
1110 for faults in [True, False]:
1111 s, c, _ = self.single_run(
1112 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1113 )
1114 stats += [s]
1115 controllers += [c]
1117 # figure out when the fault happened
1118 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1120 # get embedded error
1121 e_em = [
1122 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1123 for stat in stats
1124 ]
1125 # get extrapolated error
1126 e_ex = [
1127 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')]
1128 for stat in stats
1129 ]
1131 # compute the global error
1132 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1133 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1135 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob
1137 def print_faults(self, mask=None, strategy=None): # pragma: no cover
1138 '''
1139 Print all faults that happened within a certain mask
1141 Args:
1142 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1143 strategy (Strategy): The resilience strategy you want to see the error for
1145 Returns:
1146 None
1147 '''
1148 strategy = BaseStrategy() if strategy is None else strategy
1150 index = self.get_index(mask)
1151 dat = self.load(strategy=strategy)
1153 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0]))
1155 # make a header
1156 print(' run | bit | node | iter | rank | rel err | space pos')
1157 print('-------+-----+------+------+------+---------+-----------')
1158 for i in index:
1159 print(
1160 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}'
1161 )
1163 return None
1165 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False):
1166 '''
1167 Make a mask to apply to stored data to filter out certain things
1169 Args:
1170 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all
1171 strategies so None is fine
1172 key (str): The key in the stored statistics that you want to filter for some value
1173 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key
1174 op (str): Operation that is applied for filtering
1175 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter
1176 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs
1178 Returns:
1179 Numpy.ndarray with boolean entries that can be used as a mask
1180 '''
1181 strategy = self.strategies[0] if strategy is None else strategy
1182 dat = self.load(strategy=strategy, faults=True)
1184 if compare_faults:
1185 if val is not None:
1186 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!')
1187 else:
1188 vals = self.load(strategy=strategy, faults=False)[key]
1189 val = sum(vals) / len(vals)
1191 if None in [key, val] and op not in ['isfinite']:
1192 mask = dat['bit'] == dat['bit']
1193 else:
1194 if op == 'uneq':
1195 mask = dat[key] != val
1196 elif op == 'eq':
1197 mask = dat[key] == val
1198 elif op == 'leq':
1199 mask = dat[key] <= val
1200 elif op == 'geq':
1201 mask = dat[key] >= val
1202 elif op == 'lt':
1203 mask = dat[key] < val
1204 elif op == 'gt':
1205 mask = dat[key] > val
1206 elif op == 'isfinite':
1207 mask = np.isfinite(dat[key])
1208 else:
1209 raise NotImplementedError(f'Please implement op={op}!')
1211 if old_mask is not None:
1212 return mask & old_mask
1213 else:
1214 return mask
1216 def get_fixable_faults_only(self, strategy):
1217 """
1218 Return a mask of only faults that can be fixed with a given strategy.
1220 Args:
1221 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1223 Returns:
1224 Numpy.ndarray with boolean entries that can be used as a mask
1225 """
1226 fixable = strategy.get_fixable_params(
1227 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter']
1228 )
1229 mask = self.get_mask(strategy=strategy)
1231 for kwargs in fixable:
1232 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask)
1234 return mask
1236 def get_index(self, mask=None):
1237 '''
1238 Get the indeces of all runs in mask
1240 Args:
1241 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1243 Returns:
1244 Numpy.ndarray: Array of indeces
1245 '''
1246 if mask is None:
1247 dat = self.load()
1248 return np.arange(len(dat['iteration']))
1249 else:
1250 return np.arange(len(mask))[mask]
1252 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover
1253 '''
1254 Get information about how many data points for faults we have given a particular mask
1256 Args:
1257 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1258 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1259 strategies, so you don't need to supply this argument
1260 print_all (bool): Whether to add information that is normally not useful to the table
1261 ax (Matplotlib.axes): Somewhere to plot the combinations histogram
1263 Returns:
1264 None
1265 '''
1267 # load some data from which to infer the number occurrences of some event
1268 strategy = self.strategies[0] if strategy is None else strategy
1269 dat = self.load(stratagy=strategy, faults=True)
1271 # make a dummy mask in case none is supplied
1272 if mask is None:
1273 mask = np.ones_like(dat['error'], dtype=bool)
1275 # print a header
1276 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries')
1277 print('-------------------------------------------------------------')
1279 # make a list of all keys that you want to look at
1280 keys = ['iteration', 'bit', 'node']
1281 if print_all:
1282 keys += ['problem_pos', 'level', 'target']
1284 # print the table
1285 for key in keys:
1286 counts, dev, unique = self.count_occurrences(dat[key][mask])
1287 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}')
1289 return None
1291 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover
1292 '''
1293 Make a histogram ouf of the occurrences of combinations
1295 Args:
1296 dat (dict): The data of the recorded statistics
1297 keys (list): The keys in dat that you want to know the combinations of
1298 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1300 Returns:
1301 Matplotlib.axes: The plot
1302 '''
1303 if ax is None:
1304 fig, ax = plt.subplots(1, 1)
1306 occurrences, bins = self.get_combination_histogram(dat, keys, mask)
1308 ax.bar(bins[:-1], occurrences)
1310 ax.set_xlabel('Occurrence of combinations')
1312 return ax
1314 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover
1315 '''
1316 Check how many combinations of values we expect and how many we find to see if we need to do more experiments.
1317 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied
1319 Args:
1320 dat (dict): The data of the recorded statistics
1321 keys (list): The keys in dat that you want to know the combinations of
1322 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1324 Returns:
1325 Numpy.ndarray: Number of occurrences of combinations
1326 Numpy.ndarray: Bins
1327 '''
1329 # load default values
1330 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat
1331 keys = ['iteration', 'bit', 'node'] if keys is None else keys
1332 if mask is None:
1333 mask = np.ones_like(dat['error'], dtype=bool)
1335 # get unique values and compute how many combinations you expect
1336 num_unique = [len(np.unique(dat[key][mask])) for key in keys]
1337 expected_number_of_combinations = np.prod(num_unique)
1339 # test what you actually get
1340 combination_counts = self.get_combination_counts(dat, keys, mask)
1342 # make a histogram with the result
1343 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1))
1344 occurrences[0] = expected_number_of_combinations - len(combination_counts)
1346 return occurrences, bins
1348 def get_max_combinations(self, dat=None, strategy=None):
1349 '''
1350 Count how many combinations of parameters for faults are possible
1352 Args:
1353 dat (dict): The recorded statistics
1354 keys (list): The keys in dat that you want to know the combinations of
1355 strategy (Strategy): The resilience strategy
1357 Returns:
1358 int: Number of possible combinations
1359 '''
1360 strategy = self.strategies[0] if strategy is None else strategy
1361 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True)
1362 faultHook = get_fault_injector_hook(controller)
1363 ranges = [
1364 (0, faultHook.rnd_params['level_number']),
1365 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1),
1366 (1, faultHook.rnd_params['iteration'] + 1),
1367 (0, faultHook.rnd_params['bit']),
1368 (0, faultHook.rnd_params['rank']),
1369 ]
1370 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']]
1371 return np.prod([me[1] - me[0] for me in ranges], dtype=int)
1373 def get_combination_counts(self, dat, keys, mask):
1374 '''
1375 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary
1376 numbers of keys
1378 Args:
1379 dat (dict): The data of the recorded statistics
1380 keys (list): The keys in dat that you want to know the combinations of
1381 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1383 Returns:
1384 list: Occurrences of all combinations
1385 '''
1386 key = keys[0]
1387 unique_vals = np.unique(dat[key][mask])
1388 res = []
1390 for i in range(len(unique_vals)):
1391 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask)
1392 if len(keys) > 1:
1393 res += self.get_combination_counts(dat, keys[1:], inner_mask)
1394 else:
1395 res += [self.count_occurrences(dat[key][inner_mask])[0]]
1396 return res
1398 def count_occurrences(self, vals):
1399 '''
1400 Count the occurrences of unique values in vals and compute average deviation from mean
1402 Args:
1403 vals (list): Values you want to check
1405 Returns:
1406 float: Mean of number of occurrences of unique values in vals
1407 float: Average deviation from mean number of occurrences
1408 int: Number of unique entries
1409 '''
1410 unique_vals, counts = np.unique(vals, return_counts=True)
1412 if len(counts) > 0:
1413 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts)
1414 else:
1415 return None, None, 0
1417 def bar_plot_thing(
1418 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None
1419 ): # pragma: no cover
1420 '''
1421 Make a bar plot about something!
1423 Args:
1424 x (Numpy.ndarray of dimension 1): x values for bar plot
1425 thing (str): Some key stored in the stats that will go on the y-axis
1426 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting
1427 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
1428 faults (bool): Whether to load stats with faults or without
1429 name (str): Optional name for the plot
1430 op (function): Operation that is applied to thing before plotting default is recovery rate
1431 args (dict): Parameters for how the plot should look
1433 Returns:
1434 None
1435 '''
1436 if ax is None:
1437 fig, ax = plt.subplots(1, 1)
1438 store = True
1439 op = self.mean if op is None else op
1441 # get the values for the bars
1442 height = np.zeros(len(self.strategies))
1443 for strategy_idx in range(len(self.strategies)):
1444 strategy = self.strategies[strategy_idx]
1446 # load the values
1447 dat = self.load(strategy=strategy, faults=faults)
1448 no_faults = self.load(strategy=strategy, faults=False)
1450 # check if we have a mask
1451 if mask is None:
1452 mask = np.ones_like(dat[thing], dtype=bool)
1454 height[strategy_idx] = op(dat, no_faults, thing, mask)
1456 # get some x values
1457 x = np.arange(len(self.strategies)) if x is None else x
1459 # prepare labels
1460 ticks = [strategy.bar_plot_x_label for strategy in self.strategies]
1462 ax.bar(x, height, tick_label=ticks)
1464 # set the parameters
1465 ax.set_ylabel(thing)
1466 args = {} if args is None else args
1467 [plt.setp(ax, k, v) for k, v in args.items()]
1469 if store:
1470 fig.tight_layout()
1471 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True)
1472 plt.close(fig)
1474 return None
1476 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover
1477 func_args = locals()
1478 func_args.pop('self', None)
1479 if strategy is None:
1480 for strat in self.strategies:
1481 args = {**func_args, 'strategy': strat}
1482 self.fault_frequency_plot(**args)
1483 return None
1485 # load data
1486 all_data = {}
1487 for me in kwargs_range['fault_frequency_iter']:
1488 self.kwargs['fault_frequency_iter'] = me
1489 self.get_recovered()
1490 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular')
1492 # get_recovery_rate
1493 results = {}
1494 results['frequencies'] = list(all_data.keys())
1495 results['recovery_rate'] = [
1496 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered'])
1497 for key in all_data.keys()
1498 ]
1499 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()]
1500 results['iterations'] = [
1501 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys()
1502 ]
1504 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style)
1505 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'})
1507 ax.set_xscale('log')
1508 ax.set_xlabel('iterations between fault')
1509 ax.set_ylabel('recovery rate')
1510 iter_ax.set_ylabel('average total iterations if not crashed (dashed)')
1511 ax.legend(frameon=False)
1513 return None
1515 def get_HR_tol(self, verbose=False):
1516 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
1518 HR_strategy = HotRodStrategy(useMPI=self.use_MPI)
1520 description = HR_strategy.get_custom_description_for_faults(self.prob, self.num_procs)
1521 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2
1523 stats, _, _ = self.single_run(HR_strategy, force_params=description)
1525 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate')
1526 diff = []
1527 for i in range(len(e_extrapolation)):
1528 if e_extrapolation[i][1] is not None:
1529 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0])
1530 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])]
1532 max_diff = max(diff)
1533 proposed_tol = 2 * max_diff
1534 if verbose:
1535 print(
1536 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.'
1537 )
1538 else:
1539 print(f'{proposed_tol:.6e}')
1542def check_local_error(): # pragma: no cover
1543 """
1544 Make a plot of the resolution over time for all problems
1545 """
1546 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1547 problems = [run_quench]
1548 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]
1550 for i in range(len(problems)):
1551 stats_analyser = FaultStats(
1552 prob=problems[i],
1553 strategies=strategies,
1554 faults=[False],
1555 reload=True,
1556 recovery_thresh=1.1,
1557 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1558 num_procs=1,
1559 mode='random',
1560 )
1561 stats_analyser.compare_strategies()
1562 plt.show()
1565def parse_args():
1566 import sys
1568 kwargs = {}
1570 for i in range(len(sys.argv)):
1571 if 'prob' in sys.argv[i]:
1572 if sys.argv[i + 1] == 'run_Lorenz':
1573 kwargs['prob'] = run_Lorenz
1574 elif sys.argv[i + 1] == 'run_vdp':
1575 kwargs['prob'] = run_vdp
1576 elif sys.argv[i + 1] == 'run_Schroedinger':
1577 kwargs['prob'] = run_Schroedinger
1578 elif sys.argv[i + 1] == 'run_quench':
1579 kwargs['prob'] = run_quench
1580 elif sys.argv[i + 1] == 'run_AC':
1581 kwargs['prob'] = run_AC
1582 elif sys.argv[i + 1] == 'run_RBC':
1583 kwargs['prob'] = run_RBC
1584 elif sys.argv[i + 1] == 'run_GS':
1585 kwargs['prob'] = run_GS
1586 else:
1587 raise NotImplementedError
1588 elif 'num_procs' in sys.argv[i]:
1589 kwargs['num_procs'] = int(sys.argv[i + 1])
1590 elif 'runs' in sys.argv[i]:
1591 kwargs['runs'] = int(sys.argv[i + 1])
1592 elif 'mode' in sys.argv[i]:
1593 kwargs['mode'] = str(sys.argv[i + 1])
1594 elif 'reload' in sys.argv[i]:
1595 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True
1597 return kwargs
1600def compare_adaptivity_modes():
1601 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep
1603 fig, axs = plt.subplots(2, 2, sharey=True)
1604 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1605 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']
1607 for i in range(len(problems)):
1608 ax = axs.flatten()[i]
1610 ax.set_title(titles[i])
1612 stats_analyser = FaultStats(
1613 prob=problems[i],
1614 strategies=[AdaptivityStrategy()],
1615 faults=[False, True],
1616 reload=True,
1617 recovery_thresh=1.1,
1618 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1619 num_procs=1,
1620 mode='default',
1621 stats_path='data/stats-jusuf',
1622 )
1623 stats_analyser.get_recovered()
1625 for strategy in stats_analyser.strategies:
1626 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1627 stats_analyser.plot_things_per_things(
1628 'recovered',
1629 'bit',
1630 False,
1631 strategies=[strategy],
1632 op=stats_analyser.rec_rate,
1633 mask=fixable,
1634 args={'ylabel': 'recovery rate', 'label': 'bla'},
1635 name='fixable_recovery',
1636 ax=ax,
1637 )
1638 single_proc = ax.lines[-1]
1639 single_proc.set_label('single process')
1640 single_proc.set_color('black')
1642 stats_analyser = FaultStats(
1643 prob=problems[i],
1644 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
1645 faults=[False, True],
1646 reload=True,
1647 recovery_thresh=1.5,
1648 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1649 num_procs=4,
1650 mode='default',
1651 stats_path='data/stats-jusuf',
1652 )
1653 stats_analyser.get_recovered()
1655 for strategy in stats_analyser.strategies:
1656 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1657 stats_analyser.plot_things_per_things(
1658 'recovered',
1659 'bit',
1660 False,
1661 strategies=[strategy],
1662 op=stats_analyser.rec_rate,
1663 mask=fixable,
1664 args={'ylabel': 'recovery rate'},
1665 name='fixable_recovery',
1666 ax=ax,
1667 )
1668 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
1669 fig.tight_layout()
1670 plt.show()
1673def main():
1674 kwargs = {
1675 'prob': run_vdp,
1676 'num_procs': 1,
1677 'mode': 'default',
1678 'runs': 4000,
1679 'reload': True,
1680 **parse_args(),
1681 }
1683 strategy_args = {
1684 'stop_at_nan': True,
1685 }
1687 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy
1689 stats_analyser = FaultStats(
1690 strategies=[
1691 BaseStrategy(**strategy_args),
1692 AdaptivityStrategy(**strategy_args),
1693 kAdaptivityStrategy(**strategy_args),
1694 HotRodStrategy(**strategy_args),
1695 AdaptivityPolynomialError(**strategy_args),
1696 ],
1697 faults=[False, True],
1698 recovery_thresh=1.1,
1699 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
1700 stats_path='data/stats-jusuf',
1701 **kwargs,
1702 )
1703 stats_analyser.get_recovered()
1705 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=8)
1707 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data
1708 return None
1710 stats_analyser.get_recovered()
1711 mask = None
1713 # stats_analyser.compare_strategies()
1714 stats_analyser.plot_things_per_things(
1715 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1716 )
1717 stats_analyser.plot_things_per_things(
1718 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1719 )
1721 stats_analyser.plot_things_per_things(
1722 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1723 )
1725 # make a plot for only the faults that can be recovered
1726 fig, ax = plt.subplots(1, 1)
1727 for strategy in stats_analyser.strategies:
1728 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1729 stats_analyser.plot_things_per_things(
1730 'recovered',
1731 'bit',
1732 False,
1733 strategies=[strategy],
1734 op=stats_analyser.rec_rate,
1735 mask=fixable,
1736 args={'ylabel': 'recovery rate'},
1737 name='fixable_recovery',
1738 ax=ax,
1739 )
1740 fig.tight_layout()
1741 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)
1743 fig, ax = plt.subplots(1, 1, figsize=(13, 4))
1744 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
1745 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
1746 ax.set_xscale('log')
1747 ax.legend(frameon=False)
1748 fig.tight_layout()
1749 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)
1751 stats_analyser.plot_things_per_things(
1752 'total_iteration',
1753 'bit',
1754 True,
1755 op=stats_analyser.mean,
1756 mask=mask,
1757 args={'yscale': 'log', 'ylabel': 'total iterations'},
1758 )
1759 stats_analyser.plot_things_per_things(
1760 'total_iteration',
1761 'bit',
1762 True,
1763 op=stats_analyser.extra_mean,
1764 mask=mask,
1765 args={'yscale': 'linear', 'ylabel': 'extra iterations'},
1766 name='extra_iter',
1767 )
1768 stats_analyser.plot_things_per_things(
1769 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
1770 )
1772 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
1773 plt.show()
1776if __name__ == "__main__":
1777 main()
1778 # compare_adaptivity_modes()
1779 # check_local_error()