Coverage for pySDC/projects/Resilience/fault_injection.py: 91%
184 statements
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-20 14:51 +0000
« prev ^ index » next coverage.py v7.6.9, created at 2024-12-20 14:51 +0000
1import struct
2import numpy as np
4from pySDC.core.hooks import Hooks
5from pySDC.implementations.datatype_classes.mesh import mesh
6from pySDC.helpers.pysdc_helper import FrozenClass
def get_combination_from_index(index, options):
    """
    Transform an index into a set of combinations. This is used when trying all possible combinations for fault
    insertion.

    For instance, if you want to insert a fault in any iteration in any node, then you have k options for iterations
    and M options for the node for a total of M * k possibilities. You can then pass an index between 0 and
    M * k - 1 to this function together with the options [k, M], which will return a unique value for both k and M
    from the index. For instance, index = 0 will return [0, 0] and index = k * M - 1 will return [k-1, M-1].
    Indices outside of this range wrap around, i.e. index = k * M returns [0, 0] again.

    The index is decomposed like a number in a mixed radix system: the first entry of `options` is the fastest
    running "digit".

    Args:
        index (int): Index of the combination
        options (list): The number of options for each entry of the combination.

    Returns:
        list: One value per entry in `options`; the i-th value lies in the interval [0, options[i]). Empty if
            `options` is empty.
    """
    combination = []
    remainder = index

    for num_options in options:
        combination.append(remainder % num_options)
        # strip the "digit" we just extracted before moving on to the next option
        remainder = remainder // num_options

    return combination
class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        All attributes are first set to their defaults and then overridden by the entries of `params`. Afterwards,
        the instance is frozen via `_freeze`.

        Args:
            params (dict): Parameters regarding when the fault will be inserted
        '''

        params = {} if params is None else params

        self.time = None
        self.timestep = None
        self.level_number = None
        self.iteration = None
        self.node = None
        self.problem_pos = None
        self.bit = None
        self.rank = None
        self.target = 0
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        The values are drawn with `randint`, i.e. the upper bounds are exclusive. Entries in `args` take precedence
        over the randomly drawn values.

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''

        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # NOTE: the order of the entries determines the order in which numbers are drawn from the generator, so
        # changing it changes which faults are generated from a given seed
        random = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }
        return cls({**random, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of possible combinations m, and then get a value for each fault parameter as
        i = m % i_max (plus modifications to make sure we get a sensible value)

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            generator (int): Index for specific combination

        Returns:
            Fault: Generated from a specific combination of parameters
        '''

        # half-open intervals [low, high) of the values each parameter may take
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [me[1] - me[0] for me in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # translate the combinations into a fault that we want to add
        combination = {
            'level_number': range(*ranges[0])[combinations[0]],
            'node': range(*ranges[1])[combinations[1]],
            'iteration': range(*ranges[2])[combinations[2]],
            'bit': range(*ranges[3])[combinations[3]],
            'rank': range(*ranges[4])[combinations[4]],
            # every dimension of the problem has its own range, so pair each one with its own combination value
            # instead of indexing the range of the first dimension for all of them
            'problem_pos': [range(*r)[c] for r, c in zip(ranges[5:], combinations[5:])],
        }

        return cls({**combination, **args})
class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # the very same dictionary is stored under both keys
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault using the method that matches the type of `self.random_generator`: an integer is interpreted as
        the index of a combination, a random state is used to draw a random fault.

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        if isinstance(self.random_generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(self.random_generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(
                f'Don\'t know how to add fault with generator of type {type(self.random_generator)}'
            )

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None

        Raises:
            NotImplementedError: If a stored fault is of a kind other than 'random' or 'combination'
        '''
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f'I don\'t know how to add stored fault of kind {f["kind"]}')

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'random'})
        else:
            self.faults.append(
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            )

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from the index of a combination of fault parameters and add it to the list of
        faults to be injected at some point. The index is taken from `self.random_generator`.

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'combination'})
        else:
            self.faults.append(
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            )

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything but 0
        '''
        L = step.levels[f.level_number]

        if f.target != 0:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # Target 0 means we flip a bit in the solution.
        #
        # To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
        # fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
        # tampered with after computing f(t, u).
        #
        # To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
        # fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
        # stopped.
        pos = tuple(f.problem_pos)
        _abs_before = abs(L.u[f.node][pos])
        L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
        L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max(0, f.node - 1)])
        L.sweep.compute_residual()
        _abs_after = abs(L.u[f.node][pos])

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. Target: {f.target}. Abs: {_abs_before:.4e} -> {_abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if the number of bits of the data type is
                unknown or if there is more than one level
        '''

        super().pre_run(step, level_number)

        if type(step.levels[level_number].u[0]) is not mesh:
            raise NotImplementedError(
                f'Fault insertion is only implemented for type mesh, not {type(step.levels[level_number].u[0])}'
            )

        dtype = step.levels[level_number].prob.u_exact(t=0).dtype
        if dtype in [float, np.float64]:
            bit = 64
        elif dtype in [complex]:
            bit = 128
        else:
            raise NotImplementedError(f'Don\'t know how many bits type {dtype} has')

        # define parameters for randomization; anything that was already set in self.rnd_params takes precedence
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': step.levels[level_number].u[0].shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults that have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError('I don\'t know how to insert faults in this multi-level madness :(')

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and not self.timestep_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all pending faults with matching `when` attribute that are scheduled for the current iteration

        Args:
            step (pySDC.Step.step): the current step
            when (str): Only faults whose `when` attribute equals this value are considered

        Returns:
            None
        '''
        # loop over a copy of the list because injecting a fault removes it from self.faults
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and not self.iter_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        # inject all faults that have not yet happened and are scheduled before this iteration
        self._inject_scheduled_faults(step, 'before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().post_iteration(step, level_number)

        # inject all faults that have not yet happened and are scheduled after this iteration
        self._inject_scheduled_faults(step, 'after')

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings, real part first.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not supported
        '''
        if type(f) in [np.float64, float]:
            conversion_code = '>d'  # big endian, double
        elif type(f) in [np.float32]:
            conversion_code = '>f'  # big endian, float
        elif type(f) in [np.complex128, complex]:
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'
        else:
            raise NotImplementedError(f'Don\'t know how to convert number of type {type(f)} to binary')

        return ''.join(f'{byte:0>8b}' for byte in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                number of 128 bit length, made up of two consecutive 64 bit floats

        Returns:
            (float) floating point representation of the binary string (complex for strings of length 128)

        Raises:
            NotImplementedError: If the length of the string is not supported
        '''
        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        elif len(s) == 128:  # complex floats
            real = s[0:64]
            imag = s[64:128]
            # Build the number from its parts rather than via `real + imag * 1j`: the multiplication is a complex
            # one, so an infinite or NaN imaginary part would leak a NaN into the real part and the sign of a
            # negative zero in the imaginary part would be lost.
            return complex(cls.to_float(real), cls.to_float(imag))
        else:
            raise NotImplementedError(f'Don\'t know how to convert string of length {len(s)} to float')

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in the binary representation of a target

        The position refers to the big endian string returned by `to_binary`, i.e. bit 0 is the sign bit. Negative
        positions count from the end of the string, like Python indices.

        Args:
            target (float, np.float64, np.float32): the floating point number in which you want to flip a bit
            bit (int): the bit which you intend to flip

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target

        Raises:
            IndexError: If the bit is outside of the binary representation of target
        '''
        binary = cls.to_binary(target)
        num_bits = len(binary)

        if not -num_bits <= bit < num_bits:
            raise IndexError(f'Cannot flip bit {bit} in a number with {num_bits} bits')

        # Normalize negative positions. Without this, `bit = -1` would append the whole string after the flipped bit
        # because the slice `binary[bit + 1:]` would start at 0.
        bit = bit % num_bits

        flipped = '1' if binary[bit] == '0' else '0'
        return cls.to_float(f'{binary[:bit]}{flipped}{binary[bit + 1:]}')
def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up the controller for a run with faults: The fault injection hook is fetched (it is added if needed) and
    configured with the random generator, the faults and the randomization parameters.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults. The key 'rng' is required, the keys
            'fault_frequency_iter', 'args' and 'rnd_params' are optional.
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    if args is None:
        args = {}
    if rnd_args is None:
        rnd_args = {}

    injector = get_fault_injector_hook(controller)
    injector.random_generator = fault_stuff['rng']

    if 'fault_frequency_iter' in fault_stuff:
        vars(injector)['fault_frequency_iter'] = fault_stuff['fault_frequency_iter']

    # entries in fault_stuff override the problem specific defaults
    custom_rnd_params = fault_stuff.get('rnd_params', {})

    # Only add a fault as long as the hook has no randomization parameters yet. This has to happen before the
    # parameters are set below.
    if not injector.rnd_params:
        injector.add_fault(
            rnd_args={**rnd_args, **custom_rnd_params},
            args={**args, **fault_stuff.get('args', {})},
        )

    injector.rnd_params.update(custom_rnd_params)

    # the number of ranks defaults to the number of steps in the controller unless specified otherwise
    rank_options = {'rank': len(controller.MS), **rnd_args, **custom_rnd_params}
    injector.rnd_params['rank'] = rank_options['rank']
def get_fault_injector_hook(controller):
    """
    Look up the fault injector hook among the hooks of the controller.
    If the controller has no hook of exactly the type `FaultInjector` yet, one is added first.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    injectors = [hook for hook in controller.hooks if type(hook) is FaultInjector]

    if not injectors:
        # register the hook and search again
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(injectors) == 1, f'Expected exactly one FaultInjector, got {len(injectors)}!'
    return injectors[0]