Coverage for pySDC/projects/Resilience/fault_stats.py: 57%
445 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-20 14:51 +0000
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-20 14:51 +0000
1import numpy as np
2import matplotlib.pyplot as plt
3from mpi4py import MPI
4import pickle
6import pySDC.helpers.plot_helper as plot_helper
7from pySDC.helpers.stats_helper import get_sorted
9from pySDC.projects.Resilience.hook import hook_collection, LogUAllIter, LogData
10from pySDC.projects.Resilience.fault_injection import get_fault_injector_hook
11from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
12from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
13from pySDC.implementations.hooks.log_errors import LogLocalErrorPostStep
14from pySDC.implementations.hooks.log_work import LogWork
16# these problems are available for testing
17from pySDC.projects.Resilience.advection import run_advection
18from pySDC.projects.Resilience.vdp import run_vdp
19from pySDC.projects.Resilience.piline import run_piline
20from pySDC.projects.Resilience.Lorenz import run_Lorenz
21from pySDC.projects.Resilience.Schroedinger import run_Schroedinger
22from pySDC.projects.Resilience.quench import run_quench
23from pySDC.projects.Resilience.AC import run_AC
24from pySDC.projects.Resilience.RBC import run_RBC
26from pySDC.projects.Resilience.strategies import BaseStrategy, AdaptivityStrategy, IterateStrategy, HotRodStrategy
27import logging
29plot_helper.setup_mpl(reset=True)
31LOGGER_LEVEL = 40
33RECOVERY_THRESH_ABS = {
34 # run_quench: 5e-3,
35 # run_Schroedinger: 1.7e-6,
36}
39class FaultStats:
40 '''
41 Class to generate and analyse fault statistics
42 '''
44 def __init__(
45 self,
46 prob=None,
47 strategies=None,
48 faults=None,
49 reload=True,
50 recovery_thresh=1 + 1e-3,
51 recovery_thresh_abs=0.0,
52 num_procs=1,
53 mode='default',
54 stats_path='data/stats',
55 use_MPI=False,
56 **kwargs,
57 ):
58 '''
59 Initialization routine
61 Args:
62 prob: A function that runs a pySDC problem, see imports for available problems
63 strategies (list): List of resilience strategies
64 faults (list): List of booleans that describe whether to use faults or not
65 reload (bool): Load previously computed statistics and continue from there or start from scratch
66 recovery_thresh (float): Relative threshold for recovery
67 num_procs (int): Number of processes
68 mode (str): Mode for fault generation: Either 'random' or 'combination'
69 '''
70 self.prob = prob
71 self.strategies = [None] if strategies is None else strategies
72 self.faults = [False, True] if faults is None else faults
73 self.reload = reload
74 self.recovery_thresh = recovery_thresh
75 self.recovery_thresh_abs = recovery_thresh_abs
76 self.num_procs = num_procs
77 self.use_MPI = use_MPI
78 self.stats_path = stats_path
79 self.kwargs = {
80 'fault_frequency_iter': 500,
81 **kwargs,
82 }
84 # decide mode
85 if mode == 'default':
86 if prob.__name__ in ['run_vdp', 'run_Lorenz']:
87 self.mode = 'combination'
88 else:
89 self.mode = 'random'
90 else:
91 self.mode = mode
93 self.logger = logging.getLogger('FaultStats')
94 self.logger.level = LOGGER_LEVEL
96 msg = 'Starting FaultStats with attributes'
97 for key, val in self.__dict__.items():
98 if key in ['logger']:
99 continue
100 item = [str(me) for me in val] if type(val) == list else str(val)
101 msg += '\n\t' f'{key}: {item}'
102 self.logger.log(25, msg)
104 def get_Tend(self):
105 '''
106 Get the final time of runs for fault stats based on the problem
108 Returns:
109 float: Tend to put into the run
110 '''
111 return self.strategies[0].get_Tend(self.prob, self.num_procs)
113 def run_stats_generation(
114 self, runs=1000, step=None, comm=None, kwargs_range=None, faults=None, _reload=False, _runs_partial=0
115 ):
116 '''
117 Run the generation of stats for all strategies in the `self.strategies` variable
119 Args:
120 runs (int): Number of runs you want to do
121 step (int): Number of runs you want to do between saving
122 comm (MPI.Communicator): Communicator for distributing runs
123 faults (bool): Whether to do stats with faults or without
124 kw_args_range (dict): Range for the parameters
125 _reload, _runs_partial: Variables only used for recursion. Do not change!
127 Returns:
128 None
129 '''
130 if faults is None:
131 for f in self.faults:
132 self.run_stats_generation(runs=runs, step=step, comm=comm, kwargs_range=kwargs_range, faults=f)
133 return None
135 for key, val in kwargs_range.items() if kwargs_range is not None else {}:
136 if type(val) == int:
137 self.kwargs[key] = val
138 else:
139 for me in val:
140 kwargs_range_me = {**kwargs_range, key: me}
141 self.run_stats_generation(
142 runs=runs, step=step, comm=comm, kwargs_range=kwargs_range_me, faults=faults
143 )
144 return None
146 comm = MPI.COMM_WORLD if comm is None else comm
147 step = (runs if comm.size == 1 else comm.size) if step is None else step
148 _runs_partial = step if _runs_partial == 0 else _runs_partial
149 reload = self.reload or _reload
151 # see if we limit the number of runs we want to do
152 max_runs = (
153 (min(self.get_max_combinations(), runs) if self.mode == 'combination' else runs) if faults else min(runs, 5)
154 )
156 if reload:
157 # sort the strategies to do some load balancing
158 sorting_index = None
159 if comm.rank == 0:
160 already_completed = np.array(
161 [self.load(strategy=strategy, faults=faults).get('runs', 0) for strategy in self.strategies]
162 )
163 sorting_index_ = np.argsort(already_completed)
164 sorting_index = sorting_index_[already_completed[sorting_index_] < max_runs]
165 _runs_partial = max(min(already_completed), _runs_partial)
167 # tell all ranks what strategies to use
168 sorting_index = comm.bcast(sorting_index, root=0)
169 _runs_partial = comm.bcast(_runs_partial, root=0)
170 strategies = [self.strategies[i] for i in sorting_index]
171 if len(strategies) == 0: # check if we are already done
172 return None
173 else:
174 strategies = self.strategies
176 strategy_comm = comm.Split(comm.rank % len(strategies))
178 for j in range(0, len(strategies), comm.size):
179 self.generate_stats(
180 strategy=strategies[j + (comm.rank % len(strategies) % (len(strategies) - j))],
181 runs=min(_runs_partial, max_runs),
182 faults=faults,
183 reload=reload,
184 comm=strategy_comm,
185 )
186 strategy_comm.Free()
187 self.run_stats_generation(
188 runs=runs, step=step, comm=comm, faults=faults, _reload=True, _runs_partial=_runs_partial + step
189 )
191 return None
193 def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, comm=None):
194 '''
195 Generate statistics for recovery from bit flips
196 -----------------------------------------------
198 Every run is given a different random seed such that we have different faults and the results are then stored
200 Args:
201 strategy (Strategy): Resilience strategy
202 runs (int): Number of runs you want to do
203 reload (bool): Load previously computed statistics and continue from there or start from scratch
204 faults (bool): Whether to do stats with faults or without
205 comm (MPI.Communicator): Communicator for distributing runs
207 Returns:
208 None
209 '''
210 comm = MPI.COMM_WORLD if comm is None else comm
212 # initialize dictionary to store the stats in
213 dat = {
214 'level': np.zeros(runs),
215 'iteration': np.zeros(runs),
216 'node': np.zeros(runs),
217 'problem_pos': [],
218 'bit': np.zeros(runs),
219 'error': np.zeros(runs),
220 'total_iteration': np.zeros(runs),
221 'total_newton_iteration': np.zeros(runs),
222 'restarts': np.zeros(runs),
223 'target': np.zeros(runs),
224 'rank': np.zeros(runs),
225 }
227 # store arguments for storing and loading
228 identifier_args = {
229 'faults': faults,
230 'strategy': strategy,
231 }
233 # reload previously recorded stats and write them to dat
234 if reload:
235 already_completed_ = None
236 if comm.rank == 0:
237 already_completed_ = self.load(**identifier_args)
238 already_completed = comm.bcast(already_completed_, root=0)
239 if already_completed['runs'] > 0 and already_completed['runs'] <= runs and comm.rank == 0:
240 avail_len = min([already_completed['runs'], runs])
241 for k in dat.keys():
242 dat[k][:avail_len] = already_completed.get(k, [None] * avail_len)
243 else:
244 already_completed = {'runs': 0}
246 # prepare a message
247 involved_ranks = comm.gather(MPI.COMM_WORLD.rank, root=0)
248 msg = f'{comm.size} rank(s) ({involved_ranks}) doing {strategy.name}{" with faults" if faults else ""} with {self.num_procs} ranks for {self.prob.__name__} from {already_completed["runs"]} to {runs}'
249 if comm.rank == 0 and already_completed['runs'] < runs:
250 print(msg, flush=True)
252 space_comm = MPI.COMM_SELF.Split(True)
254 # perform the remaining experiments
255 for i in range(already_completed['runs'], runs):
256 if i % comm.size != comm.rank:
257 continue
259 # perform a single experiment with the correct random seed
260 stats, controller, crash = self.single_run(strategy=strategy, run=i, faults=faults, space_comm=space_comm)
262 # get the data from the stats
263 faults_run = get_sorted(stats, type='bitflip')
265 if faults:
266 assert len(faults_run) > 0, f"Did not record a fault in run {i} of {strategy.name}!"
267 dat['level'][i] = faults_run[0][1][0]
268 dat['iteration'][i] = faults_run[0][1][1]
269 dat['node'][i] = faults_run[0][1][2]
270 dat['problem_pos'] += [faults_run[0][1][3]]
271 dat['bit'][i] = faults_run[0][1][4]
272 dat['target'][i] = faults_run[0][1][5]
273 dat['rank'][i] = faults_run[0][1][6]
275 if crash:
276 print('Code crashed!')
277 dat['error'][i] = np.inf
278 continue
280 # record the rest of the data
281 t, u = get_sorted(stats, type='u', recomputed=False)[-1]
282 error = self.get_error(u, t, controller, strategy, self.get_Tend())
283 total_iteration = sum([k[1] for k in get_sorted(stats, type='k')])
284 total_newton_iteration = sum([k[1] for k in get_sorted(stats, type='work_newton')])
286 dat['error'][i] = error
287 dat['total_iteration'][i] = total_iteration
288 dat['total_newton_iteration'][i] = total_newton_iteration
289 dat['restarts'][i] = sum([me[1] for me in get_sorted(stats, type='restart')])
291 # free the space communicator
292 space_comm.Free()
294 dat_full = {}
295 for k in dat.keys():
296 dat_full[k] = comm.reduce(dat[k], op=MPI.SUM)
298 # store the completed stats
299 dat_full['runs'] = runs
301 if already_completed['runs'] < runs:
302 if comm.rank == 0:
303 self.store(dat_full, **identifier_args)
304 if self.faults:
305 self.get_recovered(strategy=strategy)
307 return None
309 def get_error(self, u, t, controller, strategy, Tend):
310 """
311 Compute the error.
313 Args:
314 u (dtype_u): The solution at the end of the run
315 t (float): Time at which `u` was recorded
316 controller (pySDC.controller.controller): The controller
317 strategy (Strategy): The resilience strategy
318 Tend (float): Time you want to run to
320 Returns:
321 float: Error
322 """
323 lvl = controller.MS[0].levels[0]
325 # decide if we reached Tend
326 Tend_reached = t + lvl.params.dt_initial >= Tend
328 if Tend_reached:
329 u_star = lvl.prob.u_exact(t=t)
330 return abs(u - u_star) / abs(u_star)
331 else:
332 print(f'Final time was not reached! Code crashed at t={t:.2f} instead of reaching Tend={Tend:.2f}')
333 return np.inf
335 def single_run(
336 self,
337 strategy,
338 run=0,
339 faults=False,
340 force_params=None,
341 hook_class=None,
342 space_comm=None,
343 Tend=None,
344 time_comm=None,
345 ):
346 '''
347 Run the problem once with the specified parameters
349 Args:
350 strategy (Strategy): The resilience strategy you plan on using
351 run (int): Index for fault generation
352 faults (bool): Whether or not to put faults in
353 force_params (dict): Change parameters in the description of the problem
354 space_comm (MPI.Communicator): A communicator for space parallelisation
355 Tend (float): Time you want to run to
356 time_comm (MPI.Intracomm.Communicator): Communicator in time
358 Returns:
359 dict: Stats object containing statistics for each step, each level and each iteration
360 pySDC.Controller: The controller of the run
361 float: The time the problem should have run to
362 '''
363 hook_class = hook_collection + [LogWork] + ([LogData] if hook_class is None else hook_class)
364 force_params = {} if force_params is None else force_params
366 # build the custom description
367 custom_description = strategy.get_custom_description_for_faults(self.prob, self.num_procs)
368 for k in force_params.keys():
369 custom_description[k] = {**custom_description.get(k, {}), **force_params[k]}
371 custom_controller_params = {
372 'logger_level': LOGGER_LEVEL,
373 **force_params.get('controller_params', {}),
374 }
376 if faults:
377 fault_stuff = {
378 'rng': None,
379 'args': strategy.get_fault_args(self.prob, self.num_procs),
380 'rnd_params': strategy.get_random_params(self.prob, self.num_procs),
381 }
383 # make parameters for faults:
384 if self.mode == 'random':
385 fault_stuff['rng'] = np.random.RandomState(run)
386 elif self.mode == 'combination':
387 fault_stuff['rng'] = run
388 elif self.mode == 'regular':
389 fault_stuff['rng'] = np.random.RandomState(run)
390 fault_stuff['fault_frequency_iter'] = self.kwargs['fault_frequency_iter']
391 fault_stuff['rnd_params'] = {
392 'bit': 12,
393 'min_node': 1,
394 }
395 else:
396 raise NotImplementedError(f'Don\'t know how to add faults in mode {self.mode}')
398 else:
399 fault_stuff = None
401 return self.prob(
402 custom_description=custom_description,
403 num_procs=self.num_procs,
404 hook_class=hook_class,
405 fault_stuff=fault_stuff,
406 Tend=self.get_Tend() if Tend is None else Tend,
407 custom_controller_params=custom_controller_params,
408 space_comm=space_comm,
409 comm=time_comm,
410 use_MPI=time_comm is not None,
411 )
413 def compare_strategies(self, run=0, faults=False, ax=None): # pragma: no cover
414 '''
415 Take a closer look at how the strategies compare for a specific run
417 Args:
418 run (int): The number of the run to get the appropriate random generator
419 faults (bool): Whether or not to include faults
420 ax (Matplotlib.axes): Somewhere to plot
422 Returns:
423 None
424 '''
425 if ax is None:
426 fig, ax = plt.subplots(1, 1)
427 store = True
428 else:
429 store = False
431 k_ax = ax.twinx()
432 ls = ['-.' if type(strategy) == HotRodStrategy else '-' for strategy in self.strategies]
433 [self.scrutinize_visual(self.strategies[i], run, faults, ax, k_ax, ls[i]) for i in range(len(self.strategies))]
435 # make a legend
436 [k_ax.plot([None], [None], label=strategy.label, color=strategy.color) for strategy in self.strategies]
437 k_ax.legend(frameon=True)
439 if store:
440 fig.tight_layout()
441 plt.savefig(f'data/{self.get_name()}-comparison.pdf', transparent=True)
443 def scrutinize_visual(
444 self, strategy, run, faults, ax=None, k_ax=None, ls='-', plot_restarts=False
445 ): # pragma: no cover
446 '''
447 Take a closer look at a specific run with a plot
449 Args:
450 strategy (Strategy): The resilience strategy you plan on using
451 run (int): The number of the run to get the appropriate random generator
452 faults (bool): Whether or not to include faults
453 ax (Matplotlib.axes): Somewhere to plot the error
454 k_ax (Matplotlib.axes): Somewhere to plot the iterations
455 plot_restarts (bool): Make vertical lines wherever restarts happened
457 Returns:
458 dict: Stats from the run
459 '''
460 if ax is None:
461 fig, ax = plt.subplots(1, 1)
462 store = True
463 else:
464 store = False
466 force_params = {}
468 stats, controller, Tend = self.single_run(
469 strategy=strategy,
470 run=run,
471 faults=faults,
472 force_params=force_params,
473 hook_class=hook_collection + [LogLocalErrorPostStep, LogData],
474 )
476 # plot the local error
477 e_loc = get_sorted(stats, type='e_local_post_step', recomputed=False)
478 ax.plot([me[0] for me in e_loc], [me[1] for me in e_loc], color=strategy.color, ls=ls)
480 # plot the iterations
481 k_ax = ax.twinx() if k_ax is None else k_ax
482 k = get_sorted(stats, type='k')
483 k_ax.plot([me[0] for me in k], np.cumsum([me[1] for me in k]), color=strategy.color, ls='--')
485 # plot the faults
486 faults = get_sorted(stats, type='bitflip')
487 for fault_time in [me[0] for me in faults]:
488 ax.axvline(fault_time, color='grey', ls=':')
490 # plot restarts
491 if plot_restarts:
492 restarts = get_sorted(stats, type='restart')
493 [ax.axvline(me[0], color='black', ls='-.') if me[1] else '' for me in restarts]
495 # decorate
496 ax.set_yscale('log')
497 ax.set_ylabel(r'$\epsilon$')
498 k_ax.set_ylabel('cumulative iterations (dashed)')
499 ax.set_xlabel(r'$t$')
501 if store:
502 fig.tight_layout()
503 plt.savefig(f'data/{self.get_name()}-{strategy.name}-details.pdf', transparent=True)
505 def scrutinize(self, strategy, logger_level=15, **kwargs):
506 '''
507 Take a closer look at a specific run
509 Args:
510 strategy (Strategy): The resilience strategy you plan on using
512 Returns:
513 None
514 '''
515 from pySDC.projects.Resilience.fault_injection import FaultInjector
517 force_params = {}
518 force_params['controller_params'] = {'logger_level': logger_level}
519 force_params['sweeper_params'] = {'skip_residual_computation': ()}
521 stats, controller, Tend = self.single_run(strategy=strategy, force_params=force_params, **kwargs)
523 u_all = get_sorted(stats, type='u', recomputed=False)
524 t, u = get_sorted(stats, type='u', recomputed=False)[np.argmax([me[0] for me in u_all])]
525 k = [me[1] for me in get_sorted(stats, type='k')]
527 fault_hook = [me for me in controller.hooks if type(me) == FaultInjector]
529 unhappened_faults = fault_hook[0].faults if len(fault_hook) > 0 else []
531 print(f'\nOverview for {strategy.name} strategy')
532 # print faults
533 faults = get_sorted(stats, type='bitflip')
534 print('\nfaults:')
535 print(' t | level | iter | node | bit | trgt | rank | pos')
536 print('--------+-------+------+------+-----+------+---------------------')
537 for f in faults:
538 print(
539 f' {f[0]:6.2f} | {f[1][0]:5d} | {f[1][1]:4d} | {f[1][2]:4d} | {f[1][4]:3d} | {f[1][5]:4d} | {f[1][6]:4d} |',
540 f[1][3],
541 )
542 print('--------+-------+------+------+-----+------+---------------------')
543 for f in unhappened_faults:
544 print(
545 f'!{f.time:6.2f} | {f.level_number:5d} | {f.iteration:4d} | {f.node:4d} | {f.bit:3d} | {f.target:4d} | {f.rank:4d} |',
546 f.problem_pos,
547 )
549 # see if we can determine if the faults where recovered
550 no_faults = self.load(strategy=strategy, faults=False)
551 e_star = np.mean(no_faults.get('error', [0]))
552 error = self.get_error(u, t, controller, strategy, Tend)
553 recovery_thresh = self.get_thresh(strategy)
555 print(
556 f'e={error:.2e}, e^*={e_star:.2e}, thresh: {recovery_thresh:.2e} -> recovered: {error < recovery_thresh}, e_rel = {error/abs(u):.2e}'
557 )
558 print(f'k: sum: {np.sum(k)}, min: {np.min(k)}, max: {np.max(k)}, mean: {np.mean(k):.2f},')
560 _newton_iter = get_sorted(stats, type='work_newton')
561 if len(_newton_iter) > 0:
562 newton_iter = [me[1] for me in _newton_iter]
563 print(
564 f'Newton: k: sum: {np.sum(newton_iter)}, min: {np.min(newton_iter)}, max: {np.max(newton_iter)}, mean: {np.mean(newton_iter):.2f},'
565 )
567 # checkout the step size
568 dt = [me[1] for me in get_sorted(stats, type='dt')]
569 print(f't: {t:.2f}, dt: min: {np.min(dt):.2e}, max: {np.max(dt):.2e}, mean: {np.mean(dt):.2e}')
571 # restarts
572 restarts = [me[1] for me in get_sorted(stats, type='restart')]
573 print(f'restarts: {sum(restarts)}, without faults: {no_faults["restarts"][0]}')
575 return stats
577 def convert_faults(self, faults):
578 '''
579 Make arrays of useable data from an entry in the stats object returned by pySDC
581 Args:
582 faults (list): The entry for faults returned by the pySDC run
584 Returns:
585 list: The times when the faults happened
586 list: The levels in which the faults happened
587 list: The iterations in which the faults happened
588 list: The nodes in which the faults happened
589 list: The problem positions in which the faults happened
590 list: The bits in which the faults happened
591 '''
592 time = [faults[i][0] for i in range(len(faults))]
593 level = [faults[i][1][0] for i in range(len(faults))]
594 iteration = [faults[i][1][1] for i in range(len(faults))]
595 node = [faults[i][1][2] for i in range(len(faults))]
596 problem_pos = [faults[i][1][3] for i in range(len(faults))]
597 bit = [faults[i][1][4] for i in range(len(faults))]
598 return time, level, iteration, node, problem_pos, bit
600 def get_path(self, **kwargs):
601 '''
602 Get the path to where the stats are stored
604 Args:
605 strategy (Strategy): The resilience strategy
606 faults (bool): Whether or not faults have been activated
608 Returns:
609 str: The path to what you are looking for
610 '''
611 return f'{self.stats_path}/{self.get_name(**kwargs)}.pickle'
613 def get_name(self, strategy=None, faults=True, mode=None):
614 '''
615 Function to get a unique name for a kind of statistics based on the problem and strategy that was used
617 Args:
618 strategy (Strategy): Resilience strategy
619 faults (bool): Whether or not faults where inserted
621 Returns:
622 str: The unique identifier
623 '''
624 if self.prob == run_advection:
625 prob_name = 'advection'
626 elif self.prob == run_vdp:
627 prob_name = 'vdp'
628 elif self.prob == run_piline:
629 prob_name = 'piline'
630 elif self.prob == run_Lorenz:
631 prob_name = 'Lorenz'
632 elif self.prob == run_Schroedinger:
633 prob_name = 'Schroedinger'
634 elif self.prob == run_quench:
635 prob_name = 'Quench'
636 elif self.prob.__name__ == 'run_AC':
637 prob_name = 'Allen-Cahn'
638 elif self.prob.__name__ == 'run_RBC':
639 prob_name = 'Rayleigh-Benard'
640 else:
641 raise NotImplementedError(f'Name not implemented for problem {self.prob}')
643 if faults:
644 fault_name = '-faults'
645 else:
646 fault_name = ''
648 if strategy is not None:
649 strategy_name = f'-{strategy.name}'
650 else:
651 strategy_name = ''
653 mode = self.mode if mode is None else mode
654 if mode == 'regular':
655 mode_thing = f'-regular{self.kwargs["fault_frequency_iter"] if faults else ""}'
656 else:
657 mode_thing = ''
659 mpi_thing = '-MPI' if self.use_MPI else ''
660 return f'{prob_name}{strategy_name}{fault_name}-{self.num_procs}procs{mode_thing}{mpi_thing}'
662 def store(self, dat, **kwargs):
663 '''
664 Stores the data for a run at a predefined path
666 Args:
667 dat (dict): The data of the recorded statistics
669 Returns:
670 None
671 '''
672 with open(self.get_path(**kwargs), 'wb') as f:
673 pickle.dump(dat, f)
674 return None
676 def load(self, **kwargs):
677 '''
678 Loads the stats belonging to a specific strategy and whether or not faults where inserted.
679 When no data has been generated yet, a dictionary is returned which only contains the number of completed runs,
680 which is 0 of course.
682 Returns:
683 dict: Data from previous run or if it is not available a placeholder dictionary
684 '''
685 kwargs['strategy'] = kwargs.get('strategy', self.strategies[MPI.COMM_WORLD.rank % len(self.strategies)])
687 try:
688 with open(self.get_path(**kwargs), 'rb') as f:
689 dat = pickle.load(f)
690 except FileNotFoundError:
691 self.logger.debug(f'File \"{self.get_path(**kwargs)}\" not found!')
692 return {'runs': 0}
693 return dat
695 def get_thresh(self, strategy=None):
696 """
697 Get recovery threshold based on relative and absolute tolerances
699 Args:
700 strategy (Strategy): The resilience strategy
701 """
702 fault_free = self.load(strategy=strategy, faults=False)
703 assert (
704 fault_free['error'].std() / fault_free['error'].mean() < 1e-5
705 ), f'Got too large variation of errors in {strategy} strategy!'
706 return max([self.recovery_thresh_abs, self.recovery_thresh * fault_free["error"].mean()])
708 def get_recovered(self, **kwargs):
709 '''
710 Determine the recovery rate for a specific strategy and store it to disk.
712 Returns:
713 None
714 '''
715 if 'strategy' not in kwargs.keys():
716 [self.get_recovered(strategy=strat, **kwargs) for strat in self.strategies]
717 else:
718 try:
719 with_faults = self.load(faults=True, **kwargs)
720 with_faults['recovered'] = with_faults['error'] < self.get_thresh(kwargs['strategy'])
721 self.store(faults=True, dat=with_faults, **kwargs)
722 except KeyError as error:
723 print(
724 f'Warning: Can\'t compute recovery rate for strategy {kwargs["strategy"].name} in {self.prob.__name__} problem right now: KeyError: {error}'
725 )
727 return None
729 def crash_rate(self, dat, no_faults, thingA, mask):
730 '''
731 Determine the rate of runs that crashed
733 Args:
734 dat (dict): The data of the recorded statistics with faults
735 no_faults (dict): The data of the corresponding fault-free stats
736 thingA (str): Some key stored in the stats that will go on the y-axis
737 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply before determining the rate
739 Returns:
740 int: Ratio of the runs which crashed and fall under the specific criteria set by the mask
741 '''
742 if len(dat[thingA][mask]) > 0:
743 crash = dat['error'] == np.inf
744 return len(dat[thingA][mask & crash]) / len(dat[thingA][mask])
745 else:
746 return None
748 def rec_rate(self, dat, no_faults, thingA, mask):
749 '''
750 Operation for plotting which returns the recovery rate for a given mask.
751 Which thingA you apply this to actually does not matter here since we compute a rate.
753 Args:
754 dat (dict): The recorded statistics
755 no_faults (dict): The corresponding fault-free stats
756 thingA (str): Some key stored in the stats
757 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
759 Returns:
760 float: Recovery rate
761 '''
762 if len(dat[thingA][mask]) > 0:
763 return len(dat[thingA][mask & dat['recovered']]) / len(dat[thingA][mask])
764 else:
765 return None
767 def mean(self, dat, no_faults, thingA, mask):
768 '''
769 Operation for plotting which returns the mean of thingA after applying the mask
771 Args:
772 dat (dict): The recorded statistics
773 no_faults (dict): The corresponding fault-free stats
774 thingA (str): Some key stored in the stats
775 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
777 Returns:
778 float: Mean of thingA after applying mask
779 '''
780 return np.mean(dat[thingA][mask])
782 def extra_mean(self, dat, no_faults, thingA, mask):
783 '''
784 Operation for plotting which returns the difference in mean of thingA between runs with and without faults after
785 applying the mask
787 Args:
788 dat (dict): The recorded statistics
789 no_faults (dict): The corresponding fault-free stats
790 thingA (str): Some key stored in the stats
791 mask (Numpy.ndarray of shape (n)): Arbitrary mask for filtering
793 Returns:
794 float: Difference in mean of thingA between runs with and without faults after applying mask
795 '''
796 if True in mask or int in [type(me) for me in mask]:
797 return np.mean(dat[thingA][mask]) - np.mean(no_faults[thingA])
798 else:
799 return None
801 def plot_thingA_per_thingB(
802 self,
803 strategy,
804 thingA,
805 thingB,
806 ax=None,
807 mask=None,
808 recovered=False,
809 op=None,
810 **kwargs,
811 ): # pragma: no cover
812 '''
813 Plot thingA vs. thingB for a single strategy
815 Args:
816 strategy (Strategy): The resilience strategy you want to plot
817 thingA (str): Some key stored in the stats that will go on the y-axis
818 thingB (str): Some key stored in the stats that will go on the x-axis
819 ax (Matplotlib.axes): Somewhere to plot
820 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
821 recovered (bool): Show the plot for both all runs and only the recovered ones
822 op (function): Operation that is applied to thingA before plotting default is recovery rate
824 Returns:
825 None
826 '''
827 op = self.rec_rate if op is None else op
828 dat = self.load(strategy=strategy, faults=True)
829 no_faults = self.load(strategy=strategy, faults=False)
831 if mask is None:
832 mask = np.ones_like(dat[thingB], dtype=bool)
834 admissable_thingB = np.unique(dat[thingB][mask])
835 me = np.zeros(len(admissable_thingB))
836 me_recovered = np.zeros_like(me)
838 for i in range(len(me)):
839 _mask = (dat[thingB] == admissable_thingB[i]) & mask
840 if _mask.any():
841 me[i] = op(dat, no_faults, thingA, _mask)
842 me_recovered[i] = op(dat, no_faults, thingA, _mask & dat['recovered'])
844 if recovered:
845 ax.plot(
846 admissable_thingB,
847 me_recovered,
848 label=f'{strategy.label} (only recovered)',
849 color=strategy.color,
850 marker=strategy.marker,
851 ls='--',
852 linewidth=3,
853 **kwargs,
854 )
856 ax.plot(
857 admissable_thingB,
858 me,
859 label=f'{strategy.label}',
860 color=strategy.color,
861 marker=strategy.marker,
862 linewidth=2,
863 **kwargs,
864 )
866 ax.legend(frameon=False)
867 ax.set_xlabel(thingB)
868 ax.set_ylabel(thingA)
869 return None
871 def plot_things_per_things(
872 self,
873 thingA='bit',
874 thingB='bit',
875 recovered=False,
876 mask=None,
877 op=None,
878 args=None,
879 strategies=None,
880 name=None,
881 store=False,
882 ax=None,
883 fig=None,
884 plotting_args=None,
885 ): # pragma: no cover
886 '''
887 Plot thingA vs thingB for multiple strategies
889 Args:
890 thingA (str): Some key stored in the stats that will go on the y-axis
891 thingB (str): Some key stored in the stats that will go on the x-axis
892 recovered (bool): Show the plot for both all runs and only the recovered ones
893 mask (Numpy.ndarray of shape (n)): Arbitrary mask to apply to both axes
894 op (function): Operation that is applied to thingA before plotting default is recovery rate
895 args (dict): Parameters for how the plot should look
896 strategies (list): List of the strategies you want to plot, if None, all will be plotted
897 name (str): Optional name for the plot
898 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
899 ax (Matplotlib.axes): Somewhere to plot
900 fig (Matplotlib.figure): Figure of the ax
902 Returns
903 None
904 '''
905 strategies = self.strategies if strategies is None else strategies
906 args = {} if args is None else args
908 # make sure we have something to plot in
909 if ax is None:
910 fig, ax = plt.subplots(1, 1)
911 elif fig is None:
912 store = False
914 # execute the plots for all strategies
915 for s in strategies:
916 self.plot_thingA_per_thingB(
917 s,
918 thingA=thingA,
919 thingB=thingB,
920 recovered=recovered,
921 ax=ax,
922 mask=mask,
923 op=op,
924 **(plotting_args if plotting_args else {}),
925 )
927 # set the parameters
928 [plt.setp(ax, k, v) for k, v in args.items()]
930 if store:
931 fig.tight_layout()
932 plt.savefig(f'data/{self.get_name()}-{thingA if name is None else name}_per_{thingB}.pdf', transparent=True)
933 plt.close(fig)
935 return None
937 def plot_recovery_thresholds(
938 self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs
939 ): # pragma: no cover
940 '''
941 Plot the recovery rate for a range of thresholds
943 Args:
944 strategies (list): List of the strategies you want to plot, if None, all will be plotted
945 thresh_range (list): thresholds for deciding whether to accept as recovered
946 ax (Matplotlib.axes): Somewhere to plot
948 Returns:
949 None
950 '''
951 # fill default values if nothing is specified
952 strategies = self.strategies if strategies is None else strategies
954 thresh_range = 1 + np.linspace(-4e-2, 4e-2, 100) if thresh_range is None else thresh_range
955 if ax is None:
956 fig, ax = plt.subplots(1, 1)
958 rec_rates = [[None] * len(thresh_range)] * len(strategies)
959 for strategy_idx in range(len(strategies)):
960 strategy = strategies[strategy_idx]
961 # load the stats
962 fault_free = self.load(strategy=strategy, faults=False)
963 with_faults = self.load(strategy=strategy, faults=True)
965 if recoverable_only:
966 recoverable_mask = self.get_fixable_faults_only(strategy)
967 else:
968 recoverable_mask = self.get_mask()
970 for thresh_idx in range(len(thresh_range)):
971 rec_mask = self.get_mask(
972 strategy=strategy,
973 key='error',
974 val=(thresh_range[thresh_idx] * fault_free['error'].mean()),
975 op='gt',
976 old_mask=recoverable_mask,
977 )
978 rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len(
979 with_faults['error'][recoverable_mask]
980 )
982 ax.plot(
983 thresh_range, rec_rates[strategy_idx], **{'color': strategy.color, 'label': strategy.label, **kwargs}
984 )
985 ax.legend(frameon=False)
986 ax.set_ylabel('recovery rate')
987 ax.set_xlabel('threshold as ratio to fault-free error')
989 return None
991 def analyse_adaptivity(self, mask): # pragma: no cover
992 '''
993 Analyse a set of runs with adaptivity
995 Args:
996 mask (Numpy.ndarray of shape (n)): The mask you want to know about
998 Returns:
999 None
1000 '''
1001 index = self.get_index(mask)
1002 dat = self.load()
1004 # make a header
1005 print(' run | bit | node | iter | e_em^* | e_em | e_glob^* | e_glob ')
1006 print('-------+-----+------+------+----------+----------+----------+----------')
1007 for i in index:
1008 e_em, e_glob = self.analyse_adaptivity_single(int(i))
1009 print(
1010 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_em[1]:.2e}\
1011 | {e_em[0]:.2e} | {e_glob[1]:.2e} | {e_glob[0]:.2e}'
1012 )
1014 e_tol = AdaptivityStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][
1015 Adaptivity
1016 ]['e_tol']
1017 print(f'We only restart when e_em > e_tol = {e_tol:.2e}!')
1018 return None
1020 def analyse_adaptivity_single(self, run): # pragma: no cover
1021 '''
1022 Compute what the difference in embedded and global error are for a specific run with adaptivity
1024 Args:
1025 run (int): The run you want to know about
1027 Returns:
1028 list: Embedded error with fault and without for the last iteration in the step with a fault
1029 list: Global error with and without fault at the end of the run
1030 '''
1031 # perform one run with and one without faults
1032 stats = []
1033 controllers = []
1034 for faults in [True, False]:
1035 s, c, _ = self.single_run(
1036 strategy=AdaptivityStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1037 )
1038 stats += [s]
1039 controllers += [c]
1041 # figure out when the fault happened
1042 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1044 # get embedded error
1045 e_em = [
1046 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1047 for stat in stats
1048 ]
1050 # compute the global error
1051 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1052 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1054 return [e_em[i][-1] for i in [0, 1]], e_glob
1056 def analyse_HotRod(self, mask): # pragma: no cover
1057 '''
1058 Analyse a set of runs with Hot Rod
1060 Args:
1061 mask (Numpy.ndarray of shape (n)): The mask you want to know about
1063 Returns:
1064 None
1065 '''
1066 index = self.get_index(mask)
1067 dat = self.load()
1069 # make a header
1070 print(
1071 ' run | bit | node | iter | e_ex^* | e_ex | e_em^* | e_em | diff* | diff | e_glob^* \
1072| e_glob '
1073 )
1074 print(
1075 '-------+-----+------+------+----------+----------+----------+----------+----------+----------+----------\
1076+----------'
1077 )
1078 for i in index:
1079 e_em, e_ex, e_glob = self.analyse_HotRod_single(int(i))
1080 print(
1081 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {e_ex[1]:.2e}\
1082 | {e_ex[0]:.2e} | {e_em[1]:.2e} | {e_em[0]:.2e} | {abs(e_em[1]-e_ex[1]):.2e} | {abs(e_em[0]-e_ex[0]):.2e} | \
1083{e_glob[1]:.2e} | {e_glob[0]:.2e}'
1084 )
1086 tol = HotRodStrategy().get_custom_description(self.prob, self.num_procs)['convergence_controllers'][HotRod][
1087 'HotRod_tol'
1088 ]
1089 print(f'We only restart when diff > tol = {tol:.2e}!')
1090 return None
1092 def analyse_HotRod_single(self, run): # pragma: no cover
1093 '''
1094 Compute what the difference in embedded, extrapolated and global error are for a specific run with Hot Rod
1096 Args:
1097 run (int): The run you want to know about
1099 Returns:
1100 list: Embedded error with fault and without for the last iteration in the step with a fault
1101 list: Extrapolation error with fault and without for the last iteration in the step with a fault
1102 list: Global error with and without fault at the end of the run
1103 '''
1104 # perform one run with and one without faults
1105 stats = []
1106 controllers = []
1107 for faults in [True, False]:
1108 s, c, _ = self.single_run(
1109 strategy=HotRodStrategy(), run=run, faults=faults, hook_class=hook_collection + [LogUAllIter]
1110 )
1111 stats += [s]
1112 controllers += [c]
1114 # figure out when the fault happened
1115 t_fault = get_sorted(stats[0], type='bitflip')[0][0]
1117 # get embedded error
1118 e_em = [
1119 [me[1] for me in get_sorted(stat, type='error_embedded_estimate', time=t_fault, sortby='iter')]
1120 for stat in stats
1121 ]
1122 # get extrapolated error
1123 e_ex = [
1124 [me[1] for me in get_sorted(stat, type='error_extrapolation_estimate', time=t_fault, sortby='iter')]
1125 for stat in stats
1126 ]
1128 # compute the global error
1129 u_end = [get_sorted(stat, type='u')[-1] for stat in stats]
1130 e_glob = [abs(u_end[i][1] - controllers[i].MS[0].levels[0].prob.u_exact(t=u_end[i][0])) for i in [0, 1]]
1132 return [e_em[i][-1] for i in [0, 1]], [e_ex[i][-1] for i in [0, 1]], e_glob
1134 def print_faults(self, mask=None, strategy=None): # pragma: no cover
1135 '''
1136 Print all faults that happened within a certain mask
1138 Args:
1139 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1140 strategy (Strategy): The resilience strategy you want to see the error for
1142 Returns:
1143 None
1144 '''
1145 strategy = BaseStrategy() if strategy is None else strategy
1147 index = self.get_index(mask)
1148 dat = self.load(strategy=strategy)
1150 e_star = np.mean(self.load(faults=False, strategy=strategy).get('error', [0]))
1152 # make a header
1153 print(' run | bit | node | iter | rank | rel err | space pos')
1154 print('-------+-----+------+------+------+---------+-----------')
1155 for i in index:
1156 print(
1157 f' {i:5d} | {dat["bit"][i]:3.0f} | {dat["node"][i]:4.0f} | {dat["iteration"][i]:4.0f} | {dat.get("rank", np.zeros_like(dat["iteration"]))[i]:4.0f} | {dat["error"][i] / e_star:.1e} | {dat["problem_pos"][i]}'
1158 )
1160 return None
1162 def get_mask(self, strategy=None, key=None, val=None, op='eq', old_mask=None, compare_faults=False):
1163 '''
1164 Make a mask to apply to stored data to filter out certain things
1166 Args:
1167 strategy (Strategy): The resilience strategy you want to apply the mask to. Most masks are the same for all
1168 strategies so None is fine
1169 key (str): The key in the stored statistics that you want to filter for some value
1170 val (str, float, int, bool): A value that you want to use for filtering. Dtype depends on key
1171 op (str): Operation that is applied for filtering
1172 old_mask (Numpy.ndarray of shape (n)): Apply this mask on top of the filter
1173 compare_faults (bool): instead of comparing to val, compare to the mean value for fault free runs
1175 Returns:
1176 Numpy.ndarray with boolean entries that can be used as a mask
1177 '''
1178 strategy = self.strategies[0] if strategy is None else strategy
1179 dat = self.load(strategy=strategy, faults=True)
1181 if compare_faults:
1182 if val is not None:
1183 raise ValueError('Can\'t use val and compare_faults in get_mask at the same time!')
1184 else:
1185 vals = self.load(strategy=strategy, faults=False)[key]
1186 val = sum(vals) / len(vals)
1188 if None in [key, val] and op not in ['isfinite']:
1189 mask = dat['bit'] == dat['bit']
1190 else:
1191 if op == 'uneq':
1192 mask = dat[key] != val
1193 elif op == 'eq':
1194 mask = dat[key] == val
1195 elif op == 'leq':
1196 mask = dat[key] <= val
1197 elif op == 'geq':
1198 mask = dat[key] >= val
1199 elif op == 'lt':
1200 mask = dat[key] < val
1201 elif op == 'gt':
1202 mask = dat[key] > val
1203 elif op == 'isfinite':
1204 mask = np.isfinite(dat[key])
1205 else:
1206 raise NotImplementedError(f'Please implement op={op}!')
1208 if old_mask is not None:
1209 return mask & old_mask
1210 else:
1211 return mask
1213 def get_fixable_faults_only(self, strategy):
1214 """
1215 Return a mask of only faults that can be fixed with a given strategy.
1217 Args:
1218 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1220 Returns:
1221 Numpy.ndarray with boolean entries that can be used as a mask
1222 """
1223 fixable = strategy.get_fixable_params(
1224 maxiter=strategy.get_custom_description(self.prob, self.num_procs)['step_params']['maxiter']
1225 )
1226 mask = self.get_mask(strategy=strategy)
1228 for kwargs in fixable:
1229 mask = self.get_mask(strategy=strategy, **kwargs, old_mask=mask)
1231 return mask
1233 def get_index(self, mask=None):
1234 '''
1235 Get the indeces of all runs in mask
1237 Args:
1238 mask (Numpy.ndarray of shape (n)): The mask you want to know the contents of
1240 Returns:
1241 Numpy.ndarray: Array of indeces
1242 '''
1243 if mask is None:
1244 dat = self.load()
1245 return np.arange(len(dat['iteration']))
1246 else:
1247 return np.arange(len(mask))[mask]
1249 def get_statistics_info(self, mask=None, strategy=None, print_all=False, ax=None): # pragma: no cover
1250 '''
1251 Get information about how many data points for faults we have given a particular mask
1253 Args:
1254 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1255 strategy (Strategy): The resilience strategy you want to look at. In normal use it's the same for all
1256 strategies, so you don't need to supply this argument
1257 print_all (bool): Whether to add information that is normally not useful to the table
1258 ax (Matplotlib.axes): Somewhere to plot the combinations histogram
1260 Returns:
1261 None
1262 '''
1264 # load some data from which to infer the number occurrences of some event
1265 strategy = self.strategies[0] if strategy is None else strategy
1266 dat = self.load(stratagy=strategy, faults=True)
1268 # make a dummy mask in case none is supplied
1269 if mask is None:
1270 mask = np.ones_like(dat['error'], dtype=bool)
1272 # print a header
1273 print(f' tot: {len(dat["error"][mask]):6} | avg. counts | mean deviation | unique entries')
1274 print('-------------------------------------------------------------')
1276 # make a list of all keys that you want to look at
1277 keys = ['iteration', 'bit', 'node']
1278 if print_all:
1279 keys += ['problem_pos', 'level', 'target']
1281 # print the table
1282 for key in keys:
1283 counts, dev, unique = self.count_occurrences(dat[key][mask])
1284 print(f' {key:11} | {counts:11.1f} | {dev:14.2f} | {unique:14}')
1286 return None
1288 def combinations_histogram(self, dat=None, keys=None, mask=None, ax=None): # pragma: no cover
1289 '''
1290 Make a histogram ouf of the occurrences of combinations
1292 Args:
1293 dat (dict): The data of the recorded statistics
1294 keys (list): The keys in dat that you want to know the combinations of
1295 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1297 Returns:
1298 Matplotlib.axes: The plot
1299 '''
1300 if ax is None:
1301 fig, ax = plt.subplots(1, 1)
1303 occurrences, bins = self.get_combination_histogram(dat, keys, mask)
1305 ax.bar(bins[:-1], occurrences)
1307 ax.set_xlabel('Occurrence of combinations')
1309 return ax
1311 def get_combination_histogram(self, dat=None, keys=None, mask=None): # pragma: no cover
1312 '''
1313 Check how many combinations of values we expect and how many we find to see if we need to do more experiments.
1314 It is assumed that each allowed value for each key appears at least once in dat after the mask was applied
1316 Args:
1317 dat (dict): The data of the recorded statistics
1318 keys (list): The keys in dat that you want to know the combinations of
1319 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1321 Returns:
1322 Numpy.ndarray: Number of occurrences of combinations
1323 Numpy.ndarray: Bins
1324 '''
1326 # load default values
1327 dat = self.load(strategy=self.strategies[0], faults=True) if dat is None else dat
1328 keys = ['iteration', 'bit', 'node'] if keys is None else keys
1329 if mask is None:
1330 mask = np.ones_like(dat['error'], dtype=bool)
1332 # get unique values and compute how many combinations you expect
1333 num_unique = [len(np.unique(dat[key][mask])) for key in keys]
1334 expected_number_of_combinations = np.prod(num_unique)
1336 # test what you actually get
1337 combination_counts = self.get_combination_counts(dat, keys, mask)
1339 # make a histogram with the result
1340 occurrences, bins = np.histogram(combination_counts, bins=np.arange(max(combination_counts) + 1))
1341 occurrences[0] = expected_number_of_combinations - len(combination_counts)
1343 return occurrences, bins
1345 def get_max_combinations(self, dat=None, strategy=None):
1346 '''
1347 Count how many combinations of parameters for faults are possible
1349 Args:
1350 dat (dict): The recorded statistics
1351 keys (list): The keys in dat that you want to know the combinations of
1352 strategy (Strategy): The resilience strategy
1354 Returns:
1355 int: Number of possible combinations
1356 '''
1357 strategy = self.strategies[0] if strategy is None else strategy
1358 stats, controller, Tend = self.single_run(strategy=strategy, run=0, faults=True)
1359 faultHook = get_fault_injector_hook(controller)
1360 ranges = [
1361 (0, faultHook.rnd_params['level_number']),
1362 (faultHook.rnd_params.get('min_node', 0), faultHook.rnd_params['node'] + 1),
1363 (1, faultHook.rnd_params['iteration'] + 1),
1364 (0, faultHook.rnd_params['bit']),
1365 (0, faultHook.rnd_params['rank']),
1366 ]
1367 ranges += [(0, i) for i in faultHook.rnd_params['problem_pos']]
1368 return np.prod([me[1] - me[0] for me in ranges], dtype=int)
1370 def get_combination_counts(self, dat, keys, mask):
1371 '''
1372 Get counts of how often all combinations of values of keys appear. This is done recursively to support arbitrary
1373 numbers of keys
1375 Args:
1376 dat (dict): The data of the recorded statistics
1377 keys (list): The keys in dat that you want to know the combinations of
1378 mask (Numpy.ndarray of shape (n)): The mask you want to apply before counting
1380 Returns:
1381 list: Occurrences of all combinations
1382 '''
1383 key = keys[0]
1384 unique_vals = np.unique(dat[key][mask])
1385 res = []
1387 for i in range(len(unique_vals)):
1388 inner_mask = self.get_mask(key=key, val=unique_vals[i], op='eq', old_mask=mask)
1389 if len(keys) > 1:
1390 res += self.get_combination_counts(dat, keys[1:], inner_mask)
1391 else:
1392 res += [self.count_occurrences(dat[key][inner_mask])[0]]
1393 return res
1395 def count_occurrences(self, vals):
1396 '''
1397 Count the occurrences of unique values in vals and compute average deviation from mean
1399 Args:
1400 vals (list): Values you want to check
1402 Returns:
1403 float: Mean of number of occurrences of unique values in vals
1404 float: Average deviation from mean number of occurrences
1405 int: Number of unique entries
1406 '''
1407 unique_vals, counts = np.unique(vals, return_counts=True)
1409 if len(counts) > 0:
1410 return counts.mean(), sum(abs(counts - counts.mean())) / len(counts), len(counts)
1411 else:
1412 return None, None, 0
1414 def bar_plot_thing(
1415 self, x=None, thing=None, ax=None, mask=None, store=False, faults=False, name=None, op=None, args=None
1416 ): # pragma: no cover
1417 '''
1418 Make a bar plot about something!
1420 Args:
1421 x (Numpy.ndarray of dimension 1): x values for bar plot
1422 thing (str): Some key stored in the stats that will go on the y-axis
1423 mask (Numpy.ndarray of shape (n)): The mask you want to apply before plotting
1424 store (bool): Store the plot at a predefined path or not (for jupyter notebooks)
1425 faults (bool): Whether to load stats with faults or without
1426 name (str): Optional name for the plot
1427 op (function): Operation that is applied to thing before plotting default is recovery rate
1428 args (dict): Parameters for how the plot should look
1430 Returns:
1431 None
1432 '''
1433 if ax is None:
1434 fig, ax = plt.subplots(1, 1)
1435 store = True
1436 op = self.mean if op is None else op
1438 # get the values for the bars
1439 height = np.zeros(len(self.strategies))
1440 for strategy_idx in range(len(self.strategies)):
1441 strategy = self.strategies[strategy_idx]
1443 # load the values
1444 dat = self.load(strategy=strategy, faults=faults)
1445 no_faults = self.load(strategy=strategy, faults=False)
1447 # check if we have a mask
1448 if mask is None:
1449 mask = np.ones_like(dat[thing], dtype=bool)
1451 height[strategy_idx] = op(dat, no_faults, thing, mask)
1453 # get some x values
1454 x = np.arange(len(self.strategies)) if x is None else x
1456 # prepare labels
1457 ticks = [strategy.bar_plot_x_label for strategy in self.strategies]
1459 ax.bar(x, height, tick_label=ticks)
1461 # set the parameters
1462 ax.set_ylabel(thing)
1463 args = {} if args is None else args
1464 [plt.setp(ax, k, v) for k, v in args.items()]
1466 if store:
1467 fig.tight_layout()
1468 plt.savefig(f'data/{self.get_name()}-{thing if name is None else name}-barplot.pdf', transparent=True)
1469 plt.close(fig)
1471 return None
1473 def fault_frequency_plot(self, ax, iter_ax, kwargs_range, strategy=None): # pragma: no cover
1474 func_args = locals()
1475 func_args.pop('self', None)
1476 if strategy is None:
1477 for strat in self.strategies:
1478 args = {**func_args, 'strategy': strat}
1479 self.fault_frequency_plot(**args)
1480 return None
1482 # load data
1483 all_data = {}
1484 for me in kwargs_range['fault_frequency_iter']:
1485 self.kwargs['fault_frequency_iter'] = me
1486 self.get_recovered()
1487 all_data[me] = self.load(strategy=strategy, faults=True, mode='regular')
1489 # get_recovery_rate
1490 results = {}
1491 results['frequencies'] = list(all_data.keys())
1492 results['recovery_rate'] = [
1493 len(all_data[key]['recovered'][all_data[key]['recovered'] is True]) / len(all_data[key]['recovered'])
1494 for key in all_data.keys()
1495 ]
1496 # results['iterations'] = [np.mean(all_data[key]['total_iteration']) for key in all_data.keys()]
1497 results['iterations'] = [
1498 np.mean(all_data[key]['total_iteration'][all_data[key]['error'] != np.inf]) for key in all_data.keys()
1499 ]
1501 ax.plot(results['frequencies'], results['recovery_rate'], **strategy.style)
1502 iter_ax.plot(results['frequencies'], results['iterations'], **{**strategy.style, 'ls': '--'})
1504 ax.set_xscale('log')
1505 ax.set_xlabel('iterations between fault')
1506 ax.set_ylabel('recovery rate')
1507 iter_ax.set_ylabel('average total iterations if not crashed (dashed)')
1508 ax.legend(frameon=False)
1510 return None
1512 def get_HR_tol(self, verbose=False):
1513 from pySDC.implementations.convergence_controller_classes.hotrod import HotRod
1515 HR_strategy = HotRodStrategy(useMPI=self.use_MPI)
1517 description = HR_strategy.get_custom_description(self.prob, self.num_procs)
1518 description['convergence_controllers'][HotRod]['HotRod_tol'] = 1e2
1520 stats, _, _ = self.single_run(HR_strategy, force_params=description)
1522 e_extrapolation = get_sorted(stats, type='error_extrapolation_estimate')
1523 diff = []
1524 for i in range(len(e_extrapolation)):
1525 if e_extrapolation[i][1] is not None:
1526 e_embedded = get_sorted(stats, type='error_embedded_estimate', time=e_extrapolation[i][0])
1527 diff += [abs(e_extrapolation[i][1] - e_embedded[-1][1])]
1529 max_diff = max(diff)
1530 proposed_tol = 2 * max_diff
1531 if verbose:
1532 print(
1533 f'Max. diff: {max_diff:.6e} -> proposed HR tolerance: {proposed_tol:.6e} for {self.prob.__name__} problem with {self.num_procs} procs.'
1534 )
1535 else:
1536 print(f'{proposed_tol:.6e}')
1539def check_local_error(): # pragma: no cover
1540 """
1541 Make a plot of the resolution over time for all problems
1542 """
1543 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1544 problems = [run_quench]
1545 strategies = [BaseStrategy(), AdaptivityStrategy(), IterateStrategy()]
1547 for i in range(len(problems)):
1548 stats_analyser = FaultStats(
1549 prob=problems[i],
1550 strategies=strategies,
1551 faults=[False],
1552 reload=True,
1553 recovery_thresh=1.1,
1554 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1555 num_procs=1,
1556 mode='random',
1557 )
1558 stats_analyser.compare_strategies()
1559 plt.show()
1562def parse_args():
1563 import sys
1565 kwargs = {}
1567 for i in range(len(sys.argv)):
1568 if 'prob' in sys.argv[i]:
1569 if sys.argv[i + 1] == 'run_Lorenz':
1570 kwargs['prob'] = run_Lorenz
1571 elif sys.argv[i + 1] == 'run_vdp':
1572 kwargs['prob'] = run_vdp
1573 elif sys.argv[i + 1] == 'run_Schroedinger':
1574 kwargs['prob'] = run_Schroedinger
1575 elif sys.argv[i + 1] == 'run_quench':
1576 kwargs['prob'] = run_quench
1577 elif sys.argv[i + 1] == 'run_AC':
1578 kwargs['prob'] = run_AC
1579 elif sys.argv[i + 1] == 'run_RBC':
1580 kwargs['prob'] = run_RBC
1581 else:
1582 raise NotImplementedError
1583 elif 'num_procs' in sys.argv[i]:
1584 kwargs['num_procs'] = int(sys.argv[i + 1])
1585 elif 'runs' in sys.argv[i]:
1586 kwargs['runs'] = int(sys.argv[i + 1])
1587 elif 'mode' in sys.argv[i]:
1588 kwargs['mode'] = str(sys.argv[i + 1])
1589 elif 'reload' in sys.argv[i]:
1590 kwargs['reload'] = False if sys.argv[i + 1] == 'False' else True
1592 return kwargs
1595def compare_adaptivity_modes():
1596 from pySDC.projects.Resilience.strategies import AdaptivityRestartFirstStep
1598 fig, axs = plt.subplots(2, 2, sharey=True)
1599 problems = [run_vdp, run_Lorenz, run_Schroedinger, run_quench]
1600 titles = ['Van der Pol', 'Lorenz attractor', r'Schr\"odinger', 'Quench']
1602 for i in range(len(problems)):
1603 ax = axs.flatten()[i]
1605 ax.set_title(titles[i])
1607 stats_analyser = FaultStats(
1608 prob=problems[i],
1609 strategies=[AdaptivityStrategy()],
1610 faults=[False, True],
1611 reload=True,
1612 recovery_thresh=1.1,
1613 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1614 num_procs=1,
1615 mode='default',
1616 stats_path='data/stats-jusuf',
1617 )
1618 stats_analyser.get_recovered()
1620 for strategy in stats_analyser.strategies:
1621 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1622 stats_analyser.plot_things_per_things(
1623 'recovered',
1624 'bit',
1625 False,
1626 strategies=[strategy],
1627 op=stats_analyser.rec_rate,
1628 mask=fixable,
1629 args={'ylabel': 'recovery rate', 'label': 'bla'},
1630 name='fixable_recovery',
1631 ax=ax,
1632 )
1633 single_proc = ax.lines[-1]
1634 single_proc.set_label('single process')
1635 single_proc.set_color('black')
1637 stats_analyser = FaultStats(
1638 prob=problems[i],
1639 strategies=[AdaptivityStrategy(), AdaptivityRestartFirstStep()],
1640 faults=[False, True],
1641 reload=True,
1642 recovery_thresh=1.5,
1643 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(problems[i], 0),
1644 num_procs=4,
1645 mode='default',
1646 stats_path='data/stats-jusuf',
1647 )
1648 stats_analyser.get_recovered()
1650 for strategy in stats_analyser.strategies:
1651 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1652 stats_analyser.plot_things_per_things(
1653 'recovered',
1654 'bit',
1655 False,
1656 strategies=[strategy],
1657 op=stats_analyser.rec_rate,
1658 mask=fixable,
1659 args={'ylabel': 'recovery rate'},
1660 name='fixable_recovery',
1661 ax=ax,
1662 )
1663 fig.suptitle('Comparison of restart modes with adaptivity with 4 ranks')
1664 fig.tight_layout()
1665 plt.show()
1668def main():
1669 kwargs = {
1670 'prob': run_RBC,
1671 'num_procs': 1,
1672 'mode': 'default',
1673 'runs': 2000,
1674 'reload': False,
1675 **parse_args(),
1676 }
1678 strategy_args = {
1679 'stop_at_nan': True,
1680 }
1682 from pySDC.projects.Resilience.strategies import AdaptivityPolynomialError, kAdaptivityStrategy
1684 stats_analyser = FaultStats(
1685 strategies=[
1686 BaseStrategy(**strategy_args),
1687 AdaptivityStrategy(**strategy_args),
1688 kAdaptivityStrategy(**strategy_args),
1689 HotRodStrategy(**strategy_args),
1690 AdaptivityPolynomialError(**strategy_args),
1691 ],
1692 faults=[False, True],
1693 recovery_thresh=1.15,
1694 recovery_thresh_abs=RECOVERY_THRESH_ABS.get(kwargs.get('prob', None), 0),
1695 stats_path='data/stats-jusuf',
1696 **kwargs,
1697 )
1698 stats_analyser.run_stats_generation(runs=kwargs['runs'], step=25)
1700 if MPI.COMM_WORLD.rank > 0: # make sure only one rank accesses the data
1701 return None
1703 stats_analyser.get_recovered()
1704 mask = None
1706 # stats_analyser.compare_strategies()
1707 stats_analyser.plot_things_per_things(
1708 'recovered', 'node', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1709 )
1710 stats_analyser.plot_things_per_things(
1711 'recovered', 'iteration', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1712 )
1714 stats_analyser.plot_things_per_things(
1715 'recovered', 'bit', False, op=stats_analyser.rec_rate, mask=mask, args={'ylabel': 'recovery rate'}
1716 )
1718 # make a plot for only the faults that can be recovered
1719 fig, ax = plt.subplots(1, 1)
1720 for strategy in stats_analyser.strategies:
1721 fixable = stats_analyser.get_fixable_faults_only(strategy=strategy)
1722 stats_analyser.plot_things_per_things(
1723 'recovered',
1724 'bit',
1725 False,
1726 strategies=[strategy],
1727 op=stats_analyser.rec_rate,
1728 mask=fixable,
1729 args={'ylabel': 'recovery rate'},
1730 name='fixable_recovery',
1731 ax=ax,
1732 )
1733 fig.tight_layout()
1734 fig.savefig(f'data/{stats_analyser.get_name()}-recoverable.pdf', transparent=True)
1736 fig, ax = plt.subplots(1, 1, figsize=(13, 4))
1737 stats_analyser.plot_recovery_thresholds(ax=ax, thresh_range=np.logspace(-1, 1, 1000))
1738 # ax.axvline(stats_analyser.get_thresh(BaseStrategy()), color='grey', ls=':', label='recovery threshold')
1739 ax.set_xscale('log')
1740 ax.legend(frameon=False)
1741 fig.tight_layout()
1742 fig.savefig(f'data/{stats_analyser.get_name()}-threshold.pdf', transparent=True)
1744 stats_analyser.plot_things_per_things(
1745 'total_iteration',
1746 'bit',
1747 True,
1748 op=stats_analyser.mean,
1749 mask=mask,
1750 args={'yscale': 'log', 'ylabel': 'total iterations'},
1751 )
1752 stats_analyser.plot_things_per_things(
1753 'total_iteration',
1754 'bit',
1755 True,
1756 op=stats_analyser.extra_mean,
1757 mask=mask,
1758 args={'yscale': 'linear', 'ylabel': 'extra iterations'},
1759 name='extra_iter',
1760 )
1761 stats_analyser.plot_things_per_things(
1762 'error', 'bit', True, op=stats_analyser.mean, mask=mask, args={'yscale': 'log'}
1763 )
1765 stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.9, 1.3, 100))
1766 plt.show()
1769if __name__ == "__main__":
1770 main()
1771 # compare_adaptivity_modes()
1772 # check_local_error()