Coverage for pySDC/projects/Resilience/fault_injection.py: 91%
184 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-19 09:13 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-19 09:13 +0000
1import struct
2import numpy as np
4from pySDC.core.hooks import Hooks
5from pySDC.implementations.datatype_classes.mesh import mesh
6from pySDC.helpers.pysdc_helper import FrozenClass
def get_combination_from_index(index, options):
    """
    Transform a flat index into one choice per option. This is used when trying all possible combinations for fault
    insertion.

    For instance, if you want to insert a fault in any iteration in any node, then you have k options for the
    iteration and M options for the node for a total of k * M possibilities. You can then pass an index between 0 and
    k * M - 1 to this function together with the options [k, M], which will return a unique pair of values from the
    index. For instance, index = 0 will return [0, 0] and index = k * M - 1 will return [k - 1, M - 1].

    The first option varies fastest. Indices of k * M or larger wrap around, i.e. index = k * M returns [0, 0] again.

    Args:
        index (int): Index of the combination
        options (list): The number of options for each parameter

    Returns:
        list: One value per entry in `options`, the i-th value being in [0, options[i]). Empty if `options` is empty.
    """
    combination = []
    remainder = index
    for num_options in options:
        # peel off the "digit" belonging to this option and carry the rest to the next one
        combination.append(remainder % num_options)
        remainder //= num_options
    return combination
class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        All attributes are first set to their defaults, then overwritten with the entries of `params` and finally the
        object is frozen via `_freeze`.

        Args:
            params (dict): Parameters regarding when the fault will be inserted
        '''

        params = {} if params is None else params

        self.time = None  # the fault injector compares this to the time of the step
        self.timestep = None  # the fault injector compares this to its time step counter
        self.level_number = None  # index into the levels of the step
        self.iteration = None  # the fault injector compares this to the iteration count of the step
        self.node = None  # index of the node in the solution on the level
        self.problem_pos = None  # index into the solution at the node
        self.bit = None  # position of the bit that is flipped
        self.rank = None  # the fault injector compares this to the slot of the step
        self.target = 0  # 0 means the bit is flipped in the solution
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''

        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # NOTE: the order of the draws determines which values a given generator state produces, so keep it as is
        random = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }

        # explicitly supplied values take precedence over the random ones
        return cls({**random, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of options for each fault parameter, then the index is split into one offset per
        parameter and finally each offset is shifted by the lower bound of the respective parameter.

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            generator (int): Index for specific combination

        Returns:
            Fault: Generated from a specific combination of parameters
        '''

        # half-open intervals [low, high) of admissible values, the dimensions of the problem position come last
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [me[1] - me[0] for me in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # shift every offset by the lower bound of its own interval. In particular, every dimension of the problem
        # position is looked up in its own range rather than in the range of the first dimension.
        values = [range(*bounds)[offset] for bounds, offset in zip(ranges, combinations)]

        # translate the combinations into a fault that we want to add
        combination = {
            'level_number': values[0],
            'node': values[1],
            'iteration': values[2],
            'bit': values[3],
            'rank': values[4],
            'problem_pos': values[5:],
        }

        # explicitly supplied values take precedence over the ones from the combination
        return cls({**combination, **args})
class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # NOTE: 'rnd_args' refers to the very same dictionary as 'args', not to a copy
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault with the method matching the type of `self.random_generator`: An integer is used as index of a
        combination of fault parameters, whereas a random state is used to draw a random fault.

        Args:
            args (dict): parameters for fault initialization that are set explicitly
            rnd_args (dict): parameters restricting the values that the remaining fault parameters can take

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        generator = self.random_generator

        # numpy integers are valid indices, too
        if isinstance(generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(f'Don\'t know how to add fault with generator of type {type(generator)}')

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None
        '''
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f'I don\'t know how to add stored fault of kind {f["kind"]}')

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if self.rnd_params == {}:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'random'}]
        else:
            self.faults += [
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            ]

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from a specific combination of fault parameters and add it to the list of faults to
        be injected at some point. The combination is selected by `self.random_generator`, which is passed on as the
        index of the combination.

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if self.rnd_params == {}:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'combination'}]
        else:
            self.faults += [
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            ]

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything other than 0
        '''
        L = step.levels[f.level_number]
        _abs_before = None
        _abs_after = None

        # insert the fault in some target
        if f.target == 0:
            '''
            Target 0 means we flip a bit in the solution.

            To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
            fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
            tempered with after computing f(t, u).

            To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
            fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
            stopped.
            '''
            pos = tuple(f.problem_pos)
            _abs_before = abs(L.u[f.node][pos])
            L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
            L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max([0, f.node - 1])])
            L.sweep.compute_residual()
            _abs_after = abs(L.u[f.node][pos])
        else:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. Target: {f.target}. Abs: {_abs_before:.4e} -> {_abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if the number of bits of its data type is
                unknown or if there is more than one level
        '''

        super().pre_run(step, level_number)

        # the type has to match exactly, i.e. subclasses of mesh are rejected as well
        u0 = step.levels[level_number].u[0]
        if type(u0) is not mesh:
            raise NotImplementedError(f'Fault insertion is only implemented for type mesh, not {type(u0)}')

        dtype = step.levels[level_number].prob.u_exact(t=0).dtype
        if dtype in [float, np.float64]:
            bit = 64
        elif dtype in [complex]:
            bit = 128
        else:
            raise NotImplementedError(f'Don\'t know how many bits type {dtype} has')

        # define parameters for randomization, values that have been set before take precedence over these defaults
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': step.levels[level_number].u[0].shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults that have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError('I don\'t know how to insert faults in this multi-level madness :(')

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and not self.timestep_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all pending faults with matching `when` attribute that are scheduled for the current iteration

        Args:
            step (pySDC.Step.step): the current step
            when (str): 'before' or 'after', only faults whose `when` attribute equals this are considered

        Returns:
            None
        '''
        # loop through a copy of the pending faults because injecting a fault removes it from the list
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

        return None

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and not self.iter_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        # inject all faults that have not yet happened and are scheduled before this iteration
        self._inject_scheduled_faults(step, 'before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().post_iteration(step, level_number)

        # inject all faults that have not yet happened and are scheduled after this iteration
        self._inject_scheduled_faults(step, 'after')

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings, real part first.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not supported
        '''
        if isinstance(f, float):  # np.float64 is a subclass of float
            conversion_code = '>d'  # big endian, double
        elif isinstance(f, np.float32):
            conversion_code = '>f'  # big endian, float
        elif isinstance(f, complex):  # np.complex128 is a subclass of complex
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'
        else:
            raise NotImplementedError(f'Don\'t know how to convert number of type {type(f)} to binary')

        return ''.join('{:0>8b}'.format(c) for c in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                number of 128 bit length, consisting of the 64 bit representations of the real and imaginary parts

        Returns:
            (float) floating point representation of the binary string, (complex) for strings of length 128

        Raises:
            NotImplementedError: If the length of the string is not supported
        '''
        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        elif len(s) == 128:  # complex floats
            real = s[0:64]
            imag = s[64:128]
            # Build the number from its parts rather than computing `real + imag * 1j`: The multiplication produces a
            # NaN real part if the imaginary part is inf or NaN, and the addition loses the sign of a negative zero.
            return complex(cls.to_float(real), cls.to_float(imag))
        else:
            raise NotImplementedError(f'Don\'t know how to convert string of length {len(s)} to float')

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in a target using the bitwise xor operator

        Args:
            target (float, np.float64, np.float32): the floating point number in which you want to flip a bit
            bit (int): the bit which you intend to flip, counted from the start of the string returned by `to_binary`

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target
        '''
        binary = cls.to_binary(target)
        return cls.to_float(f'{binary[:bit]}{int(binary[bit]) ^ 1}{binary[bit+1:]}')
def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up the fault injection hook of a controller for a run with faults.

    The hook is fetched from the controller (and added if it is not there yet), receives the generator from
    `fault_stuff` and a fault is registered if the hook has no parameters for randomization yet.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    if args is None:
        args = {}
    if rnd_args is None:
        rnd_args = {}

    # entries in fault_stuff take precedence over the problem specific defaults
    custom_rnd_params = fault_stuff.get('rnd_params', {})
    merged_rnd_args = {**rnd_args, **custom_rnd_params}

    hook = get_fault_injector_hook(controller)
    hook.random_generator = fault_stuff['rng']

    # optional settings are written directly to the attributes of the hook
    if 'fault_frequency_iter' in fault_stuff:
        vars(hook)['fault_frequency_iter'] = fault_stuff['fault_frequency_iter']

    # only register a fault as long as the hook has no parameters for randomization
    if len(hook.rnd_params) == 0:
        hook.add_fault(rnd_args=merged_rnd_args, args={**args, **fault_stuff.get('args', {})})

    hook.rnd_params.update(custom_rnd_params)

    # the number of ranks defaults to the size of the controller's MS list unless it has been set explicitly
    hook.rnd_params['rank'] = {'rank': len(controller.MS), **merged_rnd_args}['rank']
def get_fault_injector_hook(controller):
    """
    Look up the fault injector hook among the hooks of the controller.
    If the controller has none so far, one is added first.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    # only hooks of exactly this type count, not hooks of derived types
    injectors = [hook for hook in controller.hooks if type(hook) == FaultInjector]

    if len(injectors) == 0:
        # register the hook and search again
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(injectors) == 1, f'Expected exactly one FaultInjector, got {len(injectors)}!'
    return injectors[0]