Coverage for pySDC/projects/Resilience/fault_stats.py: 59%
437 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-09 14:59 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-09 14:59 +0000
1import numpy as np
2import matplotlib.pyplot as plt
3from mpi4py import MPI
4import pickle
6import pySDC.helpers.plot_helper as plot_helper
7from pySDC.helpers.stats_helper import get_sorted
9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData
10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook
11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep
14from pySDC.implementations.hooks.log_work import LogWork
16# these problems are available for testing
17from pySDC.projects.Resilience.advection import run_advection
18from pySDC.projects.Resilience.vdp import run_vdp
19from pySDC.projects.Resilience.piline import run_piline
20from pySDC.projects.Resilience.Lorenz import run_Lorenz
21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger
22from pySDC.projects.Resilience.quench import run_quench
23from pySDC.projects.Resilience.AC import run_AC
25from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy
26import logging
28plot_helper.setup_mpl(reset=True)
30LOGGER_LEVEL = 40
32RECOVERY_THRESH_ABS = {
33 # run_quench: 5e-3,
34 # run_Schroedinger: 1.7e-6,
35}
38class FaultStats:
39 '''
40 Class to generate and analyse fault statistics
41 '''
43 def __init__(
44 self,
45 prob=None,
46 strategies=None,
47 faults=None,
48 reload=True,
49 recovery_thresh=1 + 1e-3,
50 recovery_thresh_abs=0.0,
51 num_procs=1,
52 mode='default',
53 stats_path='data/stats',
54 use_MPI=False,
55 **kwargs,
56 ):
57 '''
58 Initialization routine
60 Args:
61 prob: A function that runs a pySDC problem, see imports for available problems
62 strategies (list): List of resilience strategies
63 faults (list): List of booleans that describe whether to use faults or not
64 reload (bool): Load previously computed statistics and continue from there or start from scratch
65 recovery_thresh (float): Relative threshold for recovery
66 num_procs (int): Number of processes
67 mode (str): Mode for fault generation: Either 'random' or 'combination'
68 '''
69 self.prob = prob
70 self.strategies = [None] if strategies is None else strategies
71 self.faults = [False, True] if faults is None else faults
72 self.reload = reload
73 self.recovery_thresh = recovery_thresh
74 self.recovery_thresh_abs = recovery_thresh_abs
75 self.num_procs = num_procs
76 self.use_MPI = use_MPI
77 self.stats_path = stats_path
78 self.kwargs = {
79 'fault_frequency_iter': 500,
80 **kwargs,
81 }
83 # decide mode
84 if mode == 'default':
85 if prob.__name__ in ['run_vdp', 'run_Lorenz']:
86 self.mode = 'combination'
87 else:
88 self.mode = 'random'
89 else:
90 self.mode = mode
92 self.logger = logging.getLogger('FaultStats')
93 self.logger.level = LOGGER_LEVEL
95 msg = 'Starting FaultStats with attributes'
96 for key, val in self.__dict__.items():
97 if key in ['logger']:
98 continue
99 item = [str(me) for me in val] if type(val) == list else str(val)
100 msg += '\n\t' f'{key}: {item}'
101 self.logger.log(25, msg)
103 def get_Tend(self):
104 '''
105 Get the final time of runs for fault stats based on the problem
107 Returns:
108 float: Tend to put into the run
109 '''
110 return self.strategies[0].get_Tend(self.prob, self.num_procs)
112 def run_stats_generation(
113 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
114 ):
115 '''
116 Run the generation of stats for all strategies in the `self.strategies` variable
118 Args:
119 runs (int): Number of runs you want to do
120 step (int): Number of runs you want to do between saving
121 comm (MPI.Communicator): Communicator for distributing runs
122 faults (bool): Whether to do stats with faults or without
123 kw_args_range (dict): Range for the parameters
124 _reload, _runs_partial: Variables only used for recursion. Do not change!
126 Returns:
127 None
128 '''
129 if faults is None:
130 for f in self.faults:
131 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
132 return None
134 for key, val in kwargs_range.items() if kwargs_range is not None else {}:
135 if type(val) == int:
136 self.kwargs[key] = val
137 else:
138 for me in val:
139 kwargs_range_me = {**kwargs_range, key: me}
140 self.run_stats_generation(
141 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
142 )
143 return None
145 comm = MPI.COMM_WORLD if comm is None else comm
146 step = (runs if comm.size == 1 else comm.size) if step is None else step
147 _runs_partial = step if _runs_partial == 0 else _runs_partial
148 reload = self.reload or _reload
150 # see if we limit the number of runs we want to do
151 max_runs = (
152 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
153 )
155 if reload:
156 # sort the strategies to do some load balancing
157 sorting_index = None
158 if comm.rank == 0:
159 already_completed = np.array(
160 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
161 )
162 sorting_index_ = np.argsort(already_completed)
163 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
164 _runs_partial = max(min(already_completed), _runs_partial)
166 # tell all ranks what strategies to use
167 sorting_index = comm.bcast(sorting_index, root=0)
168 _runs_partial = comm.bcast(_runs_partial, root=0)
169 strategies = [self.strategies[i] for i in sorting_index]
170 if len(strategies) == 0: # check if we are already done
171 return None
172 else:
173 strategies = self.strategies
175 strategy_comm = comm.Split(comm.rank % len(strategies))
177 for j in range(0, len(strategies), comm.size):
178 self.generate_stats(
179 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
180 runs=min(_runs_partial, max_runs),
181 faults=faults,
182 reload=reload,
183 comm=strategy_comm,
184 )
185 strategy_comm.Free()
186 self.run_stats_generation(
187 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
188 )
190 return None
192 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
193 '''
194 Generate statistics for recovery from bit flips
195 -----------------------------------------------
197 Every run is given a different random seed such that we have different faults and the results are then stored
199 Args:
200 strategy (Strategy): Resilience strategy
201 runs (int): Number of runs you want to do
202 reload (bool): Load previously computed statistics and continue from there or start from scratch
203 faults (bool): Whether to do stats with faults or without
204 comm (MPI.Communicator): Communicator for distributing runs
206 Returns:
207 None
208 '''
209 comm = MPI.COMM_WORLD if comm is None else comm
211 # initialize dictionary to store the stats in
212 dat = {
213 'level': np.zeros(runs),
214 'iteration': np.zeros(runs),
215 'node': np.zeros(runs),
216 'problem_pos': [],
217 'bit': np.zeros(runs),
218 'error': np.zeros(runs),
219 'total_iteration': np.zeros(runs),
220 'total_newton_iteration': np.zeros(runs),
221 'restarts': np.zeros(runs),
222 'target': np.zeros(runs),
223 'rank': np.zeros(runs),
224 }
226 # store arguments for storing and loading
227 identifier_args = {
228 'faults': faults,
229 'strategy': strategy,
230 }
232 # reload previously recorded stats and write them to dat
233 if reload:
234 already_completed_ = None
235 if comm.rank == 0:
236 already_completed_ = self.load(**identifier_args)
237 already_completed = comm.bcast(already_completed_, root=0)
238 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
239 avail_len = min([already_completed['runs'], runs])
240 for k in dat.keys():
241 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
242 else:
243 already_completed = {'runs': 0}
245 # prepare a message
246 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
247 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
248 if comm.rank == 0 and already_completed['runs'] < runs:
249 print(msg, flush=True)
251 space_comm = MPI.COMM_SELF.Split(True)
253 # perform the remaining experiments
254 for i in range(already_completed['runs'], runs):
255 if i % comm.size != comm.rank:
256 continue
258 # perform a single experiment with the correct random seed
259 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)
261 # get the data from the stats
262 faults_run = get_sorted(stats, type='bitflip')
264 if faults:
265 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
266 dat['level'][i] = faults_run[0][1][0]
267 dat['iteration'][i] = faults_run[0][1][1]
268 dat['node'][i] = faults_run[0][1][2]
269 dat['problem_pos'] += [faults_run[0][1][3]]
270 dat['bit'][i] = faults_run[0][1][4]
271 dat['target'][i] = faults_run[0][1][5]
272 dat['rank'][i] = faults_run[0][1][6]
273 if crash:
274 print('Code crashed!')
275 continue
277 # record the rest of the data
278 t, u = get_sorted(stats, type='u', recomputed=False)[-1]
279 error = self.get_error(u, t, controller, strategy, self.get_Tend())
280 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
281 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])
283 dat['error'][i] = error
284 dat['total_iteration'][i] = total_iteration
285 dat['total_newton_iteration'][i] = total_newton_iteration
286 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])
288 # free the space communicator
289 space_comm.Free()
291 dat_full = {}
292 for k in dat.keys():
293 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)
295 # store the completed stats
296 dat_full['runs'] = runs
298 if already_completed['runs'] < runs:
299 if comm.rank == 0:
300 self.store(dat_full, **identifier_args)
301 if self.faults:
302 self.get_recovered(strategy=strategy)
304 return None
306 def get_error(self, u, t, controller, strategy, Tend):
307 """
308 Compute the error.
310 Args:
311 u (dtype_u): The solution at the end of the run
312 t (float): Time at which `u` was recorded
313 controller (pySDC.controller.controller): The controller
314 strategy (Strategy): The resilience strategy
315 Tend (float): Time you want to run to
317 Returns:
318 float: Error
319 """
320 lvl = controller.MS[0].levels[0]
322 # decide if we reached Tend
323 Tend_reached = t + lvl.params.dt_initial >= Tend
325 if Tend_reached:
326 u_star = lvl.prob.u_exact(t=t)
327 return abs(u - u_star) / abs(u_star)
328 else:
329 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}')
330 return np.inf
332 def single_run(
333 self,
334 strategy,
335 run=0,
336 faults=False,
337 force_params=None,
338 hook_class=None,
339 space_comm=None,
340 Tend=None,
341 time_comm=None,
342 ):
343 '''
344 Run the problem once with the specified parameters
346 Args:
347 strategy (Strategy): The resilience strategy you plan on using
348 run (int): Index for fault generation
349 faults (bool): Whether or not to put faults in
350 force_params (dict): Change parameters in the description of the problem
351 space_comm (MPI.Communicator): A communicator for space parallelisation
352 Tend (float): Time you want to run to
353 time_comm (MPI.Intracomm.Communicator): Communicator in time
355 Returns:
356 dict: Stats object containing statistics for each step, each level and each iteration
357 pySDC.Controller: The controller of the run
358 float: The time the problem should have run to
359 '''
360 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class)
361 force_params = {} if force_params is None else force_params
363 # build the custom description
364 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs)
365 for k in force_params.keys():
366 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]}
368 custom_controller_params = {
369 'logger_level': LOGGER_LEVEL,
370 **force_params.get('controller_params', {}),
371 }
373 if faults:
374 fault_stuff = {
375 'rng': None,
376 'args': strategy.get_fault_args(self.prob, self.num_procs),
377 'rnd_params': strategy.get_random_params(self.prob, self.num_procs),
378 }
380 # make parameters for faults:
381 if self.mode == 'random':
382 fault_stuff['rng'] = np.random.RandomState(run)
383 elif self.mode == 'combination':
384 fault_stuff['rng'] = run
385 elif self.mode == 'regular':
386 fault_stuff['rng'] = np.random.RandomState(run)
387 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter']
388 fault_stuff['rnd_params'] = {
389 'bit': 12,
390 'min_node': 1,
391 }
392 else:
393 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}')
395 else:
396 fault_stuff = None
398 return self.prob(
399 custom_description=custom_description,
400 num_procs=self.num_procs,
401 hook_class=hook_class,
402 fault_stuff=fault_stuff,
403 Tend=self.get_Tend() if Tend is None else Tend,
404 custom_controller_params=custom_controller_params,
405 space_comm=space_comm,
406 comm=time_comm,
407 use_MPI=time_comm is not None,
408 )
410 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover
411 '''
412 Take a closer look at how the strategies compare for a specific run
414 Args:
415 run (int): The number of the run to get the appropriate random generator
416 faults (bool): Whether or not to include faults
417 ax (Matplotlib.axes): Somewhere to plot
419 Returns:
420 None
421 '''
422 if ax is None:
423 fig, ax = plt.subplots(1, 1)
424 store = True
425 else:
426 store = False
428 k_ax = ax.twinx()
429 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies]
430 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))]
432 # make a legend
433 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies]
434 k_ax.legend(frameon=True)
436 if store:
437 fig.tight_layout()
438 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True)
440 def scrutinize_visual(
441 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
442 ): # pragma: no cover
443 '''
444 Take a closer look at a specific run with a plot
446 Args:
447 strategy (Strategy): The resilience strategy you plan on using
448 run (int): The number of the run to get the appropriate random generator
449 faults (bool): Whether or not to include faults
450 ax (Matplotlib.axes): Somewhere to plot the error
451 k_ax (Matplotlib.axes): Somewhere to plot the iterations
452 plot_restarts (bool): Make vertical lines wherever restarts happened
454 Returns:
455 dict: Stats from the run
456 '''
457 if ax is None:
458 fig, ax = plt.subplots(1, 1)
459 store = True
460 else:
461 store = False
463 force_params = {}
465 stats, controller, Tend = self.single_run(
466 strategy=strategy,
467 run=run,
468 faults=faults,
469 force_params=force_params,
470 hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
471 )
473 # plot the local error
474 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
475 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)
477 # plot the iterations
478 k_ax = ax.twinx() if k_ax is None else k_ax
479 k = get_sorted(stats, type='k')
480 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')
482 # plot the faults
483 faults = get_sorted(stats, type='bitflip')
484 for fault_time in [me[0] for me in faults]:
485 ax.axvline(fault_time, color='grey', ls=':')
487 # plot restarts
488 if plot_restarts:
489 restarts = get_sorted(stats, type='restart')
490 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]
492 # decorate
493 ax.set_yscale('log')
494 ax.set_ylabel(r'$\epsilon$')
495 k_ax.set_ylabel('cumulative iterations (dashed)')
496 ax.set_xlabel(r'$t$')
498 if store:
499 fig.tight_layout()
500 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)
502 def scrutinize(self, strategy, logger_level=15, **kwargs):
503 '''
504 Take a closer look at a specific run
506 Args:
507 strategy (Strategy): The resilience strategy you plan on using
509 Returns:
510 None
511 '''
512 from pySDC.projects.Resilience.fault_injection import FaultInjector
514 force_params = {}
515 force_params['controller_params'] = {'logger_level': logger_level}
516 force_params['sweeper_params'] = {'skip_residual_computation': ()}
518 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)
520 u_all = get_sorted(stats, type='u', recomputed=False)
521 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
522 k = [me[1] for me in get_sorted(stats, type='k')]
524 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]
526 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []
528 print(f'\nOverview for {strategy.name} strategy')
529 # print faults
530 faults = get_sorted(stats, type='bitflip')
531 print('\nfaults:')
532 print(' t | level | iter | node | bit | trgt | rank | pos')
533 print('--------+-------+------+------+-----+------+---------------------')
534 for f in faults:
535 print(
536 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
537 f[1][3],
538 )
539 print('--------+-------+------+------+-----+------+---------------------')
540 for f in unhappened_faults:
541 print(
542 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
543 f.problem_pos,
544 )
546 # see if we can determine if the faults where recovered
547 no_faults = self.load(strategy=strategy, faults=False)
548 e_star = np.mean(no_faults.get('error', [0]))
549 error = self.get_error(u, t, controller, strategy, Tend)
550 recovery_thresh = self.get_thresh(strategy)
552 print(
553 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
554 )
555 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')
557 _newton_iter = get_sorted(stats, type='work_newton')
558 if len(_newton_iter) > 0:
559 newton_iter = [me[1] for me in _newton_iter]
560 print(
561 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
562 )
564 # checkout the step size
565 dt = [me[1] for me in get_sorted(stats, type='dt')]
566 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')
568 # restarts
569 restarts = [me[1] for me in get_sorted(stats, type='restart')]
570 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')
572 return stats
574 def convert_faults(self, faults):
575 '''
576 Make arrays of useable data from an entry in the stats object returned by pySDC
578 Args:
579 faults (list): The entry for faults returned by the pySDC run
581 Returns:
582 list: The times when the faults happened
583 list: The levels in which the faults happened
584 list: The iterations in which the faults happened
585 list: The nodes in which the faults happened
586 list: The problem positions in which the faults happened
587 list: The bits in which the faults happened
588 '''
589 time = [faults[i][0] for i in range(len(faults))]
590 level = [faults[i][1][0] for i in range(len(faults))]
591 iteration = [faults[i][1][1] for i in range(len(faults))]
592 node = [faults[i][1][2] for i in range(len(faults))]
593 problem_pos = [faults[i][1][3] for i in range(len(faults))]
594 bit = [faults[i][1][4] for i in range(len(faults))]
595 return time, level, iteration, node, problem_pos, bit
597 def get_path(self, **kwargs):
598 '''
599 Get the path to where the stats are stored
601 Args:
602 strategy (Strategy): The resilience strategy
603 faults (bool): Whether or not faults have been activated
605 Returns:
606 str: The path to what you are looking for
607 '''
608 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle'
610 def get_name(self, strategy=None, faults=True, mode=None):
611 '''
612 Function to get a unique name for a kind of statistics based on the problem and strategy that was used
614 Args:
615 strategy (Strategy): Resilience strategy
616 faults (bool): Whether or not faults where inserted
618 Returns:
619 str: The unique identifier
620 '''
621 if self.prob == run_advection:
622 prob_name = 'advection'
623 elif self.prob == run_vdp:
624 prob_name = 'vdp'
625 elif self.prob == run_piline:
626 prob_name = 'piline'
627 elif self.prob == run_Lorenz:
628 prob_name = 'Lorenz'
629 elif self.prob == run_Schroedinger:
630 prob_name = 'Schroedinger'
631 elif self.prob == run_quench:
632 prob_name = 'Quench'
633 elif self.prob.__name__ == 'run_AC':
634 prob_name = 'Allen-Cahn'
635 else:
636 raise NotImplementedError(f'Name not implemented for problem {self.prob}')
638 if faults:
639 fault_name = '-faults'
640 else:
641 fault_name = ''
643 if strategy is not None:
644 strategy_name = f'-{strategy.name}'
645 else:
646 strategy_name = ''
648 mode = self.mode if mode is None else mode
649 if mode == 'regular':
650 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}'
651 else:
652 mode_thing = ''
654 mpi_thing = '-MPI' if self.use_MPI else ''
655 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}'
657 def store(self, dat, **kwargs):
658 '''
659 Stores the data for a run at a predefined path
661 Args:
662 dat (dict): The data of the recorded statistics
664 Returns:
665 None
666 '''
667 with open(self.get_path(**kwargs), 'wb') as f:
668 pickle.dump(dat, f)
669 return None
671 def load(self, **kwargs):
672 '''
673 Loads the stats belonging to a specific strategy and whether or not faults where inserted.
674 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs,
675 which is 0 of course.
677 Returns:
678 dict: Data from previous run or if it is not available a placeholder dictionary
679 '''
680 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)])
682 try:
683 with open(self.get_path(**kwargs), 'rb') as f:
684 dat = pickle.load(f)
685 except FileNotFoundError:
686 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!')
687 return {'runs': 0}
688 return dat
690 def get_thresh(self, strategy=None):
691 """
692 Get recovery threshold based on relative and absolute tolerances
694 Args:
695 strategy (Strategy): The resilience strategy
696 """
697 fault_free = self.load(strategy=strategy, faults=False)
698 assert (
699 fault_free['error'].std() / fault_free['error'].mean() < 1e-5
700 ), f'Got too large variation of errors in {strategy} strategy!'
701 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()])
703 def get_recovered(self, **kwargs):
704 '''
705 Determine the recovery rate for a specific strategy and store it to disk.
707 Returns:
708 None
709 '''
710 if 'strategy' not in kwargs.keys():
711 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies]
712 else:
713 try:
714 with_faults = self.load(faults=True, **kwargs)
715 with_faults['recovered'] = with_faults['error'] < self.get_thresh(kwargs['strategy'])
716 self.store(faults=True, dat=with_faults, **kwargs)
717 except KeyError as error:
718 print(
719 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}'
720 )
722 return None
724 def crash_rate(self, dat, no_faults, thingA, mask):
725 '''
726 Determine the rate of runs that crashed
728 Args:
729 dat (dict): The data of the recorded statistics with faults
730 no_faults (dict): The data of the corresponding fault-free stats
731 thingA (str): Some key stored in the stats that will go on the y-axis
732 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate
734 Returns:
735 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask
736 '''
737 if len(dat[thingA][mask]) > 0:
738 crash = dat['error'] == np.inf
739 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask])
740 else:
741 return None
743 def rec_rate(self, dat, no_faults, thingA, mask):
744 '''
745 Operation for plotting which returns the recovery rate for a given mask.
746 Which thingA you apply this to actually does not matter here since we compute a rate.
748 Args:
749 dat (dict): The recorded statistics
750 no_faults (dict): The corresponding fault-free stats
751 thingA (str): Some key stored in the stats
752 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
754 Returns:
755 float: Recovery rate
756 '''
757 if len(dat[thingA][mask]) > 0:
758 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask])
759 else:
760 return None
762 def mean(self, dat, no_faults, thingA, mask):
763 '''
764 Operation for plotting which returns the mean of thingA after applying the mask
766 Args:
767 dat (dict): The recorded statistics
768 no_faults (dict): The corresponding fault-free stats
769 thingA (str): Some key stored in the stats
770 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
772 Returns:
773 float: Mean of thingA after applying mask
774 '''
775 return np.mean(dat[thingA][mask])
777 def extra_mean(self, dat, no_faults, thingA, mask):
778 '''
779 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after
780 applying the mask
782 Args:
783 dat (dict): The recorded statistics
784 no_faults (dict): The corresponding fault-free stats
785 thingA (str): Some key stored in the stats
786 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
788 Returns:
789 float: Difference in mean of thingA between runs with and without faults after applying mask
790 '''
791 if True in mask or int in [type(me) for me in mask]:
792 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA])
793 else:
794 return None
796 def plot_thingA_per_thingB(
797 self,
798 strategy,
799 thingA,
800 thingB,
801 ax=None,
802 mask=None,
803 recovered=False,
804 op=None,
805 **kwargs,
806 ): # pragma: no cover
807 '''
808 Plot thingA vs. thingB for a single strategy
810 Args:
811 strategy (Strategy): The resilience strategy you want to plot
812 thingA (str): Some key stored in the stats that will go on the y-axis
813 thingB (str): Some key stored in the stats that will go on the x-axis
814 ax (Matplotlib.axes): Somewhere to plot
815 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
816 recovered (bool): Show the plot for both all runs and only the recovered ones
817 op (function): Operation that is applied to thingA before plotting default is recovery rate
819 Returns:
820 None
821 '''
822 op = self.rec_rate if op is None else op
823 dat = self.load(strategy=strategy, faults=True)
824 no_faults = self.load(strategy=strategy, faults=False)
826 if mask is None:
827 mask = np.ones_like(dat[thingB], dtype=bool)
829 admissable_thingB = np.unique(dat[thingB][mask])
830 me = np.zeros(len(admissable_thingB))
831 me_recovered = np.zeros_like(me)
833 for i in range(len(me)):
834 _mask = (dat[thingB] == admissable_thingB[i]) & mask
835 if _mask.any():
836 me[i] = op(dat, no_faults, thingA, _mask)
837 me_recovered[i] = op(dat, no_faults, thingA, _mask & dat['recovered'])
839 if recovered:
840 ax.plot(
841 admissable_thingB,
842 me_recovered,
843 label=f'{strategy.label} (only recovered)',
844 color=strategy.color,
845 marker=strategy.marker,
846 ls='--',
847 linewidth=3,
848 **kwargs,
849 )
851 ax.plot(
852 admissable_thingB,
853 me,
854 label=f'{strategy.label}',
855 color=strategy.color,
856 marker=strategy.marker,
857 linewidth=2,
858 **kwargs,
859 )
861 ax.legend(frameon=False)
862 ax.set_xlabel(thingB)
863 ax.set_ylabel(thingA)
864 return None
866 def plot_things_per_things(
867 self,
868 thingA='bit',
869 thingB='bit',
870 recovered=False,
871 mask=None,
872 op=None,
873 args=None,
874 strategies=None,
875 name=None,
876 store=True,
877 ax=None,
878 fig=None,
879 plotting_args=None,
880 ): # pragma: no cover
881 '''
882 Plot thingA vs thingB for multiple strategies
884 Args:
885 thingA (str): Some key stored in the stats that will go on the y-axis
886 thingB (str): Some key stored in the stats that will go on the x-axis
887 recovered (bool): Show the plot for both all runs and only the recovered ones
888 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
889 op (function): Operation that is applied to thingA before plotting default is recovery rate
890 args (dict): Parameters for how the plot should look
891 strategies (list): List of the strategies you want to plot, if None, all will be plotted
892 name (str): Optional name for the plot
893 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
894 ax (Matplotlib.axes): Somewhere to plot
895 fig (Matplotlib.figure): Figure of the ax
897 Returns
898 None
899 '''
900 strategies = self.strategies if strategies is None else strategies
901 args = {} if args is None else args
903 # make sure we have something to plot in
904 if ax is None:
905 fig, ax = plt.subplots(1, 1)
906 elif fig is None:
907 store = False
909 # execute the plots for all strategies
910 for s in strategies:
911 self.plot_thingA_per_thingB(
912 s,
913 thingA=thingA,
914 thingB=thingB,
915 recovered=recovered,
916 ax=ax,
917 mask=mask,
918 op=op,
919 **(plotting_args if plotting_args else {}),
920 )
922 # set the parameters
923 [plt.setp(ax, k, v) for k, v in args.items()]
925 if store:
926 fig.tight_layout()
927 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
928 plt.close(fig)
930 return None
932 def plot_recovery_thresholds(
933 self, strategies=None, thresh_range=None, ax=None, mask=None, **kwargs
934 ): # pragma: no cover
935 '''
936 Plot the recovery rate for a range of thresholds
938 Args:
939 strategies (list): List of the strategies you want to plot, if None, all will be plotted
940 thresh_range (list): thresholds for deciding whether to accept as recovered
941 ax (Matplotlib.axes): Somewhere to plot
942 mask (Numpy.ndarray of shape (n)): The mask you want to know about
944 Returns:
945 None
946 '''
947 # fill default values if nothing is specified
948 strategies = self.strategies if strategies is None else strategies
950 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range
951 if ax is None:
952 fig, ax = plt.subplots(1, 1)
954 rec_rates = [[None] * len(thresh_range)] * len(strategies)
955 for strategy_idx in range(len(strategies)):
956 strategy = strategies[strategy_idx]
957 # load the stats
958 fault_free = self.load(strategy=strategy, faults=False)
959 with_faults = self.load(strategy=strategy, faults=True)
961 for thresh_idx in range(len(thresh_range)):
962 rec_mask = self.get_mask(
963 strategy=strategy,
964 key='error',
965 val=(thresh_range[thresh_idx] * fault_free['error'].mean()),
966 op='gt',
967 old_mask=mask,
968 )
969 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len(
970 with_faults['error']
971 )
973 ax.plot(
974 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs}
975 )
976 ax.legend(frameon=False)
977 ax.set_ylabel('recovery rate')
978 ax.set_xlabel('threshold as ratio to fault-free error')
980 return None
982 def analyse_adaptivity(self, mask): # pragma: no cover
983 '''
984 Analyse a set of runs with adaptivity
986 Args:
987 mask (Numpy.ndarray of shape (n)): The mask you want to know about
989 Returns:
990 None
991 '''
992 index = self.get_index(mask)
993 dat = self.load()
995 # make a header
996 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ')
997 print('-------+-----+------+------+----------+----------+----------+----------')
998 for i in index:
999 e_em, e_glob = self.analyse_adaptivity_single(int(i))
1000 print(
1001 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\
1002 \
1003 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}'
1004 )
1006 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][
1007 Adaptivity
1008 ]['e_tol']
1009 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!')
1010 return None
1012 def analyse_adaptivity_single(self, run): # pragma: no cover
1013 '''
1014 Compute what the difference in embedded and global error are for a specific run with adaptivity
1016 Args:
1017 run (int): The run you want to know about
1019 Returns:
1020 list: Embedded error with fault and without for the last iteration in the step with a fault
1021 list: Global error with and without fault at the end of the run
1022 '''
1023 # perform one run with and one without faults
1024 stats = []
1025 controllers = []
1026 for faults in [True, False]:
1027 s, c, _ = self.single_run(
1028 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1029 )
1030 stats += [s]
1031 controllers += [c]
1033 # figure out when the fault happened
1034 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1036 # get embedded error
1037 e_em = [
1038 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1039 for stat in stats
1040 ]
1042 # compute the global error
1043 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1044 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1046 return [e_em[i][-1] for i in [0, 1]], e_glob
1048 def analyse_HotRod(self, mask): # pragma: no cover
1049 '''
1050 Analyse a set of runs with Hot Rod
1052 Args:
1053 mask (Numpy.ndarray of shape (n)): The mask you want to know about
1055 Returns:
1056 None
1057 '''
1058 index = self.get_index(mask)
1059 dat = self.load()
1061 # make a header
1062 print(
1063 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \
1064| e_glob '
1065 )
1066 print(
1067 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\
1068+----------'
1069 )
1070 for i in index:
1071 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i))
1072 print(
1073 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\
1074 \
1075 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} \
1076 | \
1077{e_glob[1]:.2e} | {e_glob[0]:.2e}'
1078 )
1080 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][
1081 'HotRod_tol'
1082 ]
1083 print(f'We only restart when diff > tol = {tol:.2e}!')
1084 return None
1086 def analyse_HotRod_single(self, run): # pragma: no cover
1087 '''
1088 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod
1090 Args:
1091 run (int): The run you want to know about
1093 Returns:
1094 list: Embedded error with fault and without for the last iteration in the step with a fault
1095 list: Extrapolation error with fault and without for the last iteration in the step with a fault
1096 list: Global error with and without fault at the end of the run
1097 '''
1098 # perform one run with and one without faults
1099 stats = []
1100 controllers = []
1101 for faults in [True, False]:
1102 s, c, _ = self.single_run(
1103 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1104 )
1105 stats += [s]
1106 controllers += [c]
1108 # figure out when the fault happened
1109 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1111 # get embedded error
1112 e_em = [
1113 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1114 for stat in stats
1115 ]
1116 # get extrapolated error
1117 e_ex = [
1118 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')]
1119 for stat in stats
1120 ]
1122 # compute the global error
1123 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1124 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1126 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob
1128 def print_faults(self, mask=None, strategy=None): # pragma: no cover
1129 '''
1130 Print all faults that happened within a certain mask
1132 Args:
1133 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1134 strategy (Strategy): The resilience strategy you want to see the error for
1136 Returns:
1137 None
1138 '''
1139 strategy = BaseStrategy() if strategy is None else strategy
1141 index = self.get_index(mask)
1142 dat = self.load(strategy=strategy)
1144 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0]))
1146 # make a header
1147 print(' run | bit | node | iter | rank | rel err | space pos')
1148 print('-------+-----+------+------+------+---------+-----------')
1149 for i in index:
1150 print(
1151 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}'
1152 )
1154 return None
1156 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False):
1157 '''
1158 Make a mask to apply to stored data to filter out certain things
1160 Args:
1161 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all
1162 strategies so None is fine
1163 key (str): The key in the stored statistics that you want to filter for some value
1164 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key
1165 op (str): Operation that is applied for filtering
1166 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter
1167 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs
1169 Returns:
1170 Numpy.ndarray with boolean entries that can be used as a mask
1171 '''
1172 strategy = self.strategies[0] if strategy is None else strategy
1173 dat = self.load(strategy=strategy, faults=True)
1175 if compare_faults:
1176 if val is not None:
1177 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!')
1178 else:
1179 vals = self.load(strategy=strategy, faults=False)[key]
1180 val = sum(vals) / len(vals)
1182 if None in [key, val] and op not in ['isfinite']:
1183 mask = dat['bit'] == dat['bit']
1184 else:
1185 if op == 'uneq':
1186 mask = dat[key] != val
1187 elif op == 'eq':
1188 mask = dat[key] == val
1189 elif op == 'leq':
1190 mask = dat[key] <= val
1191 elif op == 'geq':
1192 mask = dat[key] >= val
1193 elif op == 'lt':
1194 mask = dat[key] < val
1195 elif op == 'gt':
1196 mask = dat[key] > val
1197 elif op == 'isfinite':
1198 mask = np.isfinite(dat[key])
1199 else:
1200 raise NotImplementedError(f'Please implement op={op}!')
1202 if old_mask is not None:
1203 return mask & old_mask
1204 else:
1205 return mask
1207 def get_fixable_faults_only(self, strategy):
1208 """
1209 Return a mask of only faults that can be fixed with a given strategy.
1211 Args:
1212 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1214 Returns:
1215 Numpy.ndarray with boolean entries that can be used as a mask
1216 """
1217 fixable = strategy.get_fixable_params(
1218 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter']
1219 )
1220 mask = self.get_mask(strategy=strategy)
1222 for kwargs in fixable:
1223 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask)
1225 return mask
1227 def get_index(self, mask=None):
1228 '''
1229 Get the indeces of all runs in mask
1231 Args:
1232 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1234 Returns:
1235 Numpy.ndarray: Array of indeces
1236 '''
1237 if mask is None:
1238 dat = self.load()
1239 return np.arange(len(dat['iteration']))
1240 else:
1241 return np.arange(len(mask))[mask]
1243 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover
1244 '''
1245 Get information about how many data points for faults we have given a particular mask
1247 Args:
1248 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1249 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1250 strategies, so you don't need to supply this argument
1251 print_all (bool): Whether to add information that is normally not useful to the table
1252 ax (Matplotlib.axes): Somewhere to plot the combinations histogram
1254 Returns:
1255 None
1256 '''
1258 # load some data from which to infer the number occurrences of some event
1259 strategy = self.strategies[0] if strategy is None else strategy
1260 dat = self.load(stratagy=strategy, faults=True)
1262 # make a dummy mask in case none is supplied
1263 if mask is None:
1264 mask = np.ones_like(dat['error'], dtype=bool)
1266 # print a header
1267 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries')
1268 print('-------------------------------------------------------------')
1270 # make a list of all keys that you want to look at
1271 keys = ['iteration', 'bit', 'node']
1272 if print_all:
1273 keys += ['problem_pos', 'level', 'target']
1275 # print the table
1276 for key in keys:
1277 counts, dev, unique = self.count_occurrences(dat[key][mask])
1278 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}')
1280 return None
1282 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover
1283 '''
1284 Make a histogram ouf of the occurrences of combinations
1286 Args:
1287 dat (dict): The data of the recorded statistics
1288 keys (list): The keys in dat that you want to know the combinations of
1289 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1291 Returns:
1292 Matplotlib.axes: The plot
1293 '''
1294 if ax is None:
1295 fig, ax = plt.subplots(1, 1)
1297 occurrences, bins = self.get_combination_histogram(dat, keys, mask)
1299 ax.bar(bins[:-1], occurrences)
1301 ax.set_xlabel('Occurrence of combinations')
1303 return ax
1305 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover
1306 '''
1307 Check how many combinations of values we expect and how many we find to see if we need to do more experiments.
1308 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied
1310 Args:
1311 dat (dict): The data of the recorded statistics
1312 keys (list): The keys in dat that you want to know the combinations of
1313 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1315 Returns:
1316 Numpy.ndarray: Number of occurrences of combinations
1317 Numpy.ndarray: Bins
1318 '''
1320 # load default values
1321 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat
1322 keys = ['iteration', 'bit', 'node'] if keys is None else keys
1323 if mask is None:
1324 mask = np.ones_like(dat['error'], dtype=bool)
1326 # get unique values and compute how many combinations you expect
1327 num_unique = [len(np.unique(dat[key][mask])) for key in keys]
1328 expected_number_of_combinations = np.prod(num_unique)
1330 # test what you actually get
1331 combination_counts = self.get_combination_counts(dat, keys, mask)
1333 # make a histogram with the result
1334 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1))
1335 occurrences[0] = expected_number_of_combinations - len(combination_counts)
1337 return occurrences, bins
1339 def get_max_combinations(self, dat=None, strategy=None):
1340 '''
1341 Count how many combinations of parameters for faults are possible
1343 Args:
1344 dat (dict): The recorded statistics
1345 keys (list): The keys in dat that you want to know the combinations of
1346 strategy (Strategy): The resilience strategy
1348 Returns:
1349 int: Number of possible combinations
1350 '''
1351 strategy = self.strategies[0] if strategy is None else strategy
1352 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True)
1353 faultHook = get_fault_injector_hook(controller)
1354 ranges = [
1355 (0, faultHook.rnd_params['level_number']),
1356 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1),
1357 (1, faultHook.rnd_params['iteration'] + 1),
1358 (0, faultHook.rnd_params['bit']),
1359 (0, faultHook.rnd_params['rank']),
1360 ]
1361 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']]
1362 return np.prod([me[1] - me[0] for me in ranges], dtype=int)
1364 def get_combination_counts(self, dat, keys, mask):
1365 '''
1366 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary
1367 numbers of keys
1369 Args:
1370 dat (dict): The data of the recorded statistics
1371 keys (list): The keys in dat that you want to know the combinations of
1372 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1374 Returns:
1375 list: Occurrences of all combinations
1376 '''
1377 key = keys[0]
1378 unique_vals = np.unique(dat[key][mask])
1379 res = []
1381 for i in range(len(unique_vals)):
1382 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask)
1383 if len(keys) > 1:
1384 res += self.get_combination_counts(dat, keys[1:], inner_mask)
1385 else:
1386 res += [self.count_occurrences(dat[key][inner_mask])[0]]
1387 return res
1389 def count_occurrences(self, vals):
1390 '''
1391 Count the occurrences of unique values in vals and compute average deviation from mean
1393 Args:
1394 vals (list): Values you want to check
1396 Returns:
1397 float: Mean of number of occurrences of unique values in vals
1398 float: Average deviation from mean number of occurrences
1399 int: Number of unique entries
1400 '''
1401 unique_vals, counts = np.unique(vals, return_counts=True)
1403 if len(counts) > 0:
1404 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts)
1405 else:
1406 return None, None, 0
1408 def bar_plot_thing(
1409 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None
1410 ): # pragma: no cover
1411 '''
1412 Make a bar plot about something!
1414 Args:
1415 x (Numpy.ndarray of dimension 1): x values for bar plot
1416 thing (str): Some key stored in the stats that will go on the y-axis
1417 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting
1418 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
1419 faults (bool): Whether to load stats with faults or without
1420 name (str): Optional name for the plot
1421 op (function): Operation that is applied to thing before plotting default is recovery rate
1422 args (dict): Parameters for how the plot should look
1424 Returns:
1425 None
1426 '''
1427 if ax is None:
1428 fig, ax = plt.subplots(1, 1)
1429 store = True
1430 op = self.mean if op is None else op
1432 # get the values for the bars
1433 height = np.zeros(len(self.strategies))
1434 for strategy_idx in range(len(self.strategies)):
1435 strategy = self.strategies[strategy_idx]
1437 # load the values
1438 dat = self.load(strategy=strategy, faults=faults)
1439 no_faults = self.load(strategy=strategy, faults=False)
1441 # check if we have a mask
1442 if mask is None:
1443 mask = np.ones_like(dat[thing], dtype=bool)
1445 height[strategy_idx] = op(dat, no_faults, thing, mask)
1447 # get some x values
1448 x = np.arange(len(self.strategies)) if x is None else x
1450 # prepare labels
1451 ticks = [strategy.bar_plot_x_label for strategy in self.strategies]
1453 ax.bar(x, height, tick_label=ticks)
1455 # set the parameters
1456 ax.set_ylabel(thing)
1457 args = {} if args is None else args
1458 [plt.setp(ax, k, v) for k, v in args.items()]
1460 if store:
1461 fig.tight_layout()
1462 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True)
1463 plt.close(fig)
1465 return None
1467 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover
1468 func_args = locals()
1469 func_args.pop('self', None)
1470 if strategy is None:
1471 for strat in self.strategies:
1472 args = {**func_args, 'strategy': strat}
1473 self.fault_frequency_plot(**args)
1474 return None
1476 # load data
1477 all_data = {}
1478 for me in kwargs_range['fault_frequency_iter']:
1479 self.kwargs['fault_frequency_iter'] = me
1480 self.get_recovered()
1481 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular')
1483 # get_recovery_rate
1484 results = {}
1485 results['frequencies'] = list(all_data.keys())
1486 results['recovery_rate'] = [
1487 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered'])
1488 for key in all_data.keys()
1489 ]
1490 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()]
1491 results['iterations'] = [
1492 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys()
1493 ]
1495 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style)
1496 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'})
1498 ax.set_xscale('log')
1499 ax.set_xlabel('iterations between fault')
1500 ax.set_ylabel('recovery rate')
1501 iter_ax.set_ylabel('average total iterations if not crashed (dashed)')
1502 ax.legend(frameon=False)
1504 return None
1506 def get_HR_tol(self, verbose=False):
1507 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
1509 HR_strategy = HotRodStrategy(useMPI=self.use_MPI)
1511 description = HR_strategy.get_custom_description(self.prob, self.num_procs)
1512 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2
1514 stats, _, _ = self.single_run(HR_strategy, force_params=description)
1516 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate')
1517 diff = []
1518 for i in range(len(e_extrapolation)):
1519 if e_extrapolation[i][1] is not None:
1520 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0])
1521 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])]
1523 max_diff = max(diff)
1524 proposed_tol = 2 * max_diff
1525 if verbose:
1526 print(
1527 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.'
1528 )
1529 else:
1530 print(f'{proposed_tol:.6e}')
1533def check_local_error(): # pragma: no cover
1534 """
1535 Make a plot of the resolution over time for all problems
1536 """
1537 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1538 problems = [run_quench]
1539 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]
1541 for i in range(len(problems)):
1542 stats_analyser = FaultStats(
1543 prob=problems[i],
1544 strategies=strategies,
1545 faults=[False],
1546 reload=True,
1547 recovery_thresh=1.1,
1548 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1549 num_procs=1,
1550 mode='random',
1551 )
1552 stats_analyser.compare_strategies()
1553 plt.show()
1556def parse_args():
1557 import sys
1559 kwargs = {}
1561 for i in range(len(sys.argv)):
1562 if 'prob' in sys.argv[i]:
1563 if sys.argv[i + 1] == 'run_Lorenz':
1564 kwargs['prob'] = run_Lorenz
1565 elif sys.argv[i + 1] == 'run_vdp':
1566 kwargs['prob'] = run_vdp
1567 elif sys.argv[i + 1] == 'run_Schroedinger':
1568 kwargs['prob'] = run_Schroedinger
1569 elif sys.argv[i + 1] == 'run_quench':
1570 kwargs['prob'] = run_quench
1571 else:
1572 raise NotImplementedError
1573 elif 'num_procs' in sys.argv[i]:
1574 kwargs['num_procs'] = int(sys.argv[i + 1])
1575 elif 'runs' in sys.argv[i]:
1576 kwargs['runs'] = int(sys.argv[i + 1])
1577 elif 'mode' in sys.argv[i]:
1578 kwargs['mode'] = str(sys.argv[i + 1])
1579 elif 'reload' in sys.argv[i]:
1580 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True
1582 return kwargs
1585def compare_adaptivity_modes():
1586 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep
1588 fig, axs = plt.subplots(2, 2, sharey=True)
1589 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1590 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']
1592 for i in range(len(problems)):
1593 ax = axs.flatten()[i]
1595 ax.set_title(titles[i])
1597 stats_analyser = FaultStats(
1598 prob=problems[i],
1599 strategies=[AdaptivityStrategy()],
1600 faults=[False, True],
1601 reload=True,
1602 recovery_thresh=1.1,
1603 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1604 num_procs=1,
1605 mode='default',
1606 stats_path='data/stats-jusuf',
1607 )
1608 stats_analyser.get_recovered()
1610 for strategy in stats_analyser.strategies:
1611 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1612 stats_analyser.plot_things_per_things(
1613 'recovered',
1614 'bit',
1615 False,
1616 strategies=[strategy],
1617 op=stats_analyser.rec_rate,
1618 mask=fixable,
1619 args={'ylabel': 'recovery rate', 'label': 'bla'},
1620 name='fixable_recovery',
1621 ax=ax,
1622 )
1623 single_proc = ax.lines[-1]
1624 single_proc.set_label('single process')
1625 single_proc.set_color('black')
1627 stats_analyser = FaultStats(
1628 prob=problems[i],
1629 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
1630 faults=[False, True],
1631 reload=True,
1632 recovery_thresh=1.5,
1633 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1634 num_procs=4,
1635 mode='default',
1636 stats_path='data/stats-jusuf',
1637 )
1638 stats_analyser.get_recovered()
1640 for strategy in stats_analyser.strategies:
1641 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1642 stats_analyser.plot_things_per_things(
1643 'recovered',
1644 'bit',
1645 False,
1646 strategies=[strategy],
1647 op=stats_analyser.rec_rate,
1648 mask=fixable,
1649 args={'ylabel': 'recovery rate'},
1650 name='fixable_recovery',
1651 ax=ax,
1652 )
1653 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
1654 fig.tight_layout()
1655 plt.show()
1658def main():
1659 kwargs = {
1660 'prob': run_AC,
1661 'num_procs': 1,
1662 'mode': 'default',
1663 'runs': 2000,
1664 'reload': True,
1665 **parse_args(),
1666 }
1668 strategy_args = {
1669 'stop_at_nan': True,
1670 }
1672 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy
1674 stats_analyser = FaultStats(
1675 strategies=[
1676 BaseStrategy(**strategy_args),
1677 AdaptivityStrategy(**strategy_args),
1678 kAdaptivityStrategy(**strategy_args),
1679 HotRodStrategy(**strategy_args),
1680 AdaptivityPolynomialError(**strategy_args),
1681 ],
1682 faults=[False, True],
1683 recovery_thresh=1.15,
1684 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
1685 stats_path='data/stats-jusuf',
1686 **kwargs,
1687 )
1688 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=12)
1690 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data
1691 return None
1693 stats_analyser.get_recovered()
1694 mask = None
1696 # stats_analyser.compare_strategies()
1697 stats_analyser.plot_things_per_things(
1698 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1699 )
1700 stats_analyser.plot_things_per_things(
1701 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1702 )
1704 stats_analyser.plot_things_per_things(
1705 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1706 )
1708 # make a plot for only the faults that can be recovered
1709 fig, ax = plt.subplots(1, 1)
1710 for strategy in stats_analyser.strategies:
1711 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1712 stats_analyser.plot_things_per_things(
1713 'recovered',
1714 'bit',
1715 False,
1716 strategies=[strategy],
1717 op=stats_analyser.rec_rate,
1718 mask=fixable,
1719 args={'ylabel': 'recovery rate'},
1720 name='fixable_recovery',
1721 ax=ax,
1722 )
1723 fig.tight_layout()
1724 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)
1726 fig, ax = plt.subplots(1, 1, figsize=(13, 4))
1727 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
1728 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
1729 ax.set_xscale('log')
1730 ax.legend(frameon=False)
1731 fig.tight_layout()
1732 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)
1734 stats_analyser.plot_things_per_things(
1735 'total_iteration',
1736 'bit',
1737 True,
1738 op=stats_analyser.mean,
1739 mask=mask,
1740 args={'yscale': 'log', 'ylabel': 'total iterations'},
1741 )
1742 stats_analyser.plot_things_per_things(
1743 'total_iteration',
1744 'bit',
1745 True,
1746 op=stats_analyser.extra_mean,
1747 mask=mask,
1748 args={'yscale': 'linear', 'ylabel': 'extra iterations'},
1749 name='extra_iter',
1750 )
1751 stats_analyser.plot_things_per_things(
1752 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
1753 )
1755 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
1756 plt.show()
1759if __name__ == "__main__":
1760 main()
1761 # compare_adaptivity_modes()
1762 # check_local_error()