Coverage for pySDC / projects / Resilience / fault_injection.py: 91%
184 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-27 07:06 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-27 07:06 +0000
1import struct
2import numpy as np
4from pySDC.core.hooks import Hooks
5from pySDC.implementations.datatype_classes.mesh import mesh
6from pySDC.helpers.pysdc_helper import FrozenClass
def get_combination_from_index(index, options):
    """
    Transform an index into a set of combinations. This is used when trying all possible combinations for fault
    insertion.

    For instance, if you want to insert a fault in any iteration in any node, then you have k options for iterations
    and M options for the node for a total of M * k possibilities. You can then pass an index between 0 and M * k - 1
    to this function together with the options [k, M], which will return a unique value for both k and M from the
    index. For instance, index = 0 will return [0, 0] and index = k * M - 1 will return [k-1, M-1].

    The index is decoded like a mixed-radix number: the first entry of `options` varies fastest. Indices beyond the
    total number of combinations wrap around in the last entry because of the modulo.

    Args:
        index (int): Index of the combination
        options (list): The number of options for each entry of the combination.

    Returns:
        list: One value per entry in `options`, each in the range [0, options[i]). Empty if `options` is empty.
    """
    combination = []
    for num_options in options:
        # peel off the fastest varying "digit" and carry the rest on to the next entry
        index, digit = divmod(index, num_options)
        combination.append(digit)
    return combination
class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        Args:
            params (dict): Parameters regarding when the fault will be inserted. Every item is set as an attribute of
                the same name and overrides the defaults assigned here.
        '''
        params = {} if params is None else params

        self.time = None
        self.timestep = None
        self.level_number = None
        self.iteration = None
        self.node = None
        self.problem_pos = None
        self.bit = None
        self.rank = None
        self.target = 0
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        A value is drawn for every randomized attribute, in a fixed order, even if it is overridden by `args`
        afterwards. Upper bounds for `node` and `iteration` are inclusive, all other upper bounds are exclusive.

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here. Needs the keys
                `level_number`, `node`, `iteration`, `problem_pos`, `bit` and `rank`; `min_node` is optional.
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''
        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # do not reorder the entries: the order of the draws determines the values for a given seed
        random = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }
        return cls({**random, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of options for each fault parameter, and then decode the index into one value per
        parameter via repeated modulo and integer division (plus an offset to make sure we get a sensible value).

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here. Needs the keys
                `level_number`, `node`, `iteration`, `problem_pos`, `bit` and `rank`; `min_node` is optional.
            generator (int): Index for specific combination

        Returns:
            Fault: Generated from a specific combination of parameters
        '''
        # half-open (start, stop) interval of admissible values for each parameter
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [stop - start for start, stop in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # Shift every decoded value into its own interval. Each problem_pos dimension has to be looked up in its own
        # range; indexing all of them into the range of the first dimension fails as soon as a later dimension is
        # larger than the first one.
        values = [range(start, stop)[offset] for (start, stop), offset in zip(ranges, combinations)]

        # translate the combinations into a fault that we want to add
        combination = {
            'level_number': values[0],
            'node': values[1],
            'iteration': values[2],
            'bit': values[3],
            'rank': values[4],
            'problem_pos': values[5:],
        }

        return cls({**combination, **args})
class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # note that this is the very same dictionary object as `args`, not a copy
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault, choosing how to generate it from the type of `self.random_generator`: an integer is used as an
        index into all possible combinations, a random state is used to draw a random fault.

        Args:
            args (dict): parameters for fault initialization that should not be generated
            rnd_args (dict): special parameters for the generation other than the default ones

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        generator = self.random_generator
        # isinstance also accepts numpy integers, e.g. indices that come out of a numpy array
        if isinstance(generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(f"Don't know how to add fault with generator of type {type(generator)}")

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None
        '''
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f'I don\'t know how to add stored fault of kind {f["kind"]}')

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''
        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'random'}]
        else:
            self.faults += [
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            ]

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from a specific combination of fault parameters and add it to the list of faults
        to be injected at some point. The combination is selected by using `self.random_generator` as an index.

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''
        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'combination'}]
        else:
            self.faults += [
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            ]

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything but 0
        '''
        L = step.levels[f.level_number]

        # insert the fault in some target
        if f.target != 0:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # Target 0 means we flip a bit in the solution.
        #
        # To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
        # fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
        # tampered with after computing f(t, u).
        #
        # To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
        # fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
        # stopped.
        pos = tuple(f.problem_pos)
        abs_before = abs(L.u[f.node][pos])
        L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
        L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max([0, f.node - 1])])
        L.sweep.compute_residual()
        abs_after = abs(L.u[f.node][pos])

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. '
            f'Target: {f.target}. Abs: {abs_before:.4e} -> {abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if the number of bits of the data type is
                unknown, or if there is more than one level
        '''
        super().pre_run(step, level_number)

        level = step.levels[level_number]

        if type(level.u[0]) is not mesh:
            raise NotImplementedError(f'Fault insertion is only implemented for type mesh, not {type(level.u[0])}')

        dtype = level.prob.u_exact(t=0).dtype
        if dtype in [float, np.float64]:
            bit = 64
        elif dtype in [complex]:
            bit = 128
        else:
            raise NotImplementedError(f'Don\'t know how many bits type {dtype} has')

        # define parameters for randomization; anything that was set before this hook is called takes precedence
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': level.u[0].shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults that have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError('I don\'t know how to insert faults in this multi-level madness :(')

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and not self.timestep_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and not self.iter_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        self._inject_scheduled_faults(step, when='before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().post_iteration(step, level_number)

        self._inject_scheduled_faults(step, when='after')

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all faults that have not yet happened and are scheduled for the current iteration.

        Args:
            step (pySDC.Step.step): the current step
            when (str): Only faults whose `when` attribute equals this value are considered

        Returns:
            None
        '''
        # iterate over a copy, because injecting a fault removes it from `self.faults`
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not one of the supported types
        '''
        if isinstance(f, complex):
            # Covers np.complex128 as well as the builtin complex, which is what `to_float` returns for strings of
            # length 128. Both parts are double precision.
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'
        elif isinstance(f, float):
            # covers np.float64 as well as the builtin float
            conversion_code = '>d'  # big endian, double
        elif isinstance(f, np.float32):
            conversion_code = '>f'  # big endian, float
        else:
            raise NotImplementedError(f'Don\'t know how to convert number of type {type(f)} to binary')

        return ''.join('{:0>8b}'.format(c) for c in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                number as 128 bits, where the real part comes first

        Returns:
            (float) floating point representation of the binary string, or a complex number for strings of length 128

        Raises:
            NotImplementedError: If the length of the string is not 32, 64 or 128
        '''
        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        elif len(s) == 128:  # complex floats
            real = s[0:64]
            imag = s[64:128]
            return cls.to_float(real) + cls.to_float(imag) * 1j
        else:
            raise NotImplementedError(f'Don\'t know how to convert string of length {len(s)} to float')

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in a target using the bitwise xor operator

        Args:
            target (float, np.float64, np.float32, complex, np.complex128): the number in which you want to flip a bit
            bit (int): the bit which you intend to flip, counted from the start of the big endian binary string

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target
        '''
        binary = cls.to_binary(target)
        return cls.to_float(f'{binary[:bit]}{int(binary[bit]) ^ 1}{binary[bit+1:]}')
def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up a controller for a run with faults: make sure it carries a fault injection hook and hand the hook the
    generator, the fault to add and the parameters for randomization.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults. The key `rng` is required, the keys
            `fault_frequency_iter`, `rnd_params` and `args` are optional.
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    if args is None:
        args = {}
    if rnd_args is None:
        rnd_args = {}

    # entries in fault_stuff take precedence over the problem specific defaults
    custom_rnd_params = fault_stuff.get('rnd_params', {})
    custom_args = fault_stuff.get('args', {})

    faultHook = get_fault_injector_hook(controller)
    faultHook.random_generator = fault_stuff['rng']

    # copy optional settings straight into the attributes of the hook
    for key in ('fault_frequency_iter',):
        if key in fault_stuff:
            vars(faultHook)[key] = fault_stuff[key]

    # only add a fault as long as the hook has not received any parameters for randomization
    if not faultHook.rnd_params:
        faultHook.add_fault(
            rnd_args={**rnd_args, **custom_rnd_params},
            args={**args, **custom_args},
        )

    faultHook.rnd_params.update(custom_rnd_params)

    # the number of steps in the controller is the fallback for the number of ranks
    rank_options = {'rank': len(controller.MS), **rnd_args, **custom_rnd_params}
    faultHook.rnd_params['rank'] = rank_options.get('rank', 1)
def get_fault_injector_hook(controller):
    """
    Look up the fault injector hook among the hooks of the controller.
    If the controller has none so far, one is added first.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    # only exact type matches count, subclasses of FaultInjector are not recognized
    positions = [idx for idx, hook in enumerate(controller.hooks) if type(hook) is FaultInjector]

    if not positions:
        # register a new hook and search again
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(positions) == 1, f'Expected exactly one FaultInjector, got {len(positions)}!'
    return controller.hooks[positions[0]]