Coverage for pySDC / projects / Resilience / fault_injection.py: 91%

184 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-27 07:06 +0000

1import struct 

2import numpy as np 

3 

4from pySDC.core.hooks import Hooks 

5from pySDC.implementations.datatype_classes.mesh import mesh 

6from pySDC.helpers.pysdc_helper import FrozenClass 

7 

8 

def get_combination_from_index(index, options):
    """
    Transform an index into a set of combinations. This is used when trying all possible combinations for fault
    insertion.

    For instance, if you want to insert a fault in any iteration in any node, then you have k options for iterations
    and M options for the node for a total of M * k possibilities. You can then pass an index between 0 and
    M * k - 1 to this function together with the options [k, M], which will return a unique value for both k and M
    from the index. For instance, index = 0 will return [0, 0] and index = k * M - 1 will return [k-1, M-1].

    The index is decomposed like a number in a mixed-radix system: the first entry of `options` varies fastest.
    Indices beyond the total number of combinations wrap around, because the last entry is also taken modulo its
    number of options.

    Args:
        index (int): Index of the combination
        options (list): The number of options for each entry of the combination

    Returns:
        list: One value per entry in `options`, with the i-th value in the range [0, options[i]). An empty
        `options` gives an empty list.
    """
    combination = []
    remainder = index
    for num_options in options:
        # split off the value belonging to this entry and carry the rest over to the next one
        remainder, value = divmod(remainder, num_options)
        combination.append(value)
    return combination

27 

28 

class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        Args:
            params (dict): Parameters regarding when the fault will be inserted. Every key is set as an attribute,
                overriding the defaults below.
        '''

        params = {} if params is None else params

        self.time = None
        self.timestep = None
        self.level_number = None
        self.iteration = None
        self.node = None
        self.problem_pos = None
        self.bit = None
        self.rank = None
        self.target = 0
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''

        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # The order of the entries fixes the order in which numbers are drawn from the generator, so do not reorder
        # them if you want to keep previously generated faults reproducible.
        random = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }

        # values in args take precedence over the randomly drawn ones
        return cls({**random, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of options for each fault parameter, and then get a value for each fault parameter
        by decomposing the index with `get_combination_from_index` and shifting the result to the lower bound of the
        respective parameter.

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            generator (int): Index for specific combination

        Returns:
            Fault: Generated from a specific combination of parameters
        '''

        # half-open intervals [low, high) of admissible values; the spatial dimensions follow the five scalar ones
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [high - low for low, high in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # Translate each combination into the value inside its own interval. Every spatial dimension uses its own
        # range, so problems with differently sized dimensions are handled correctly.
        values = [range(*limits)[idx] for limits, idx in zip(ranges, combinations)]

        combination = {
            'level_number': values[0],
            'node': values[1],
            'iteration': values[2],
            'bit': values[3],
            'rank': values[4],
            'problem_pos': values[5:],
        }

        # values in args take precedence over the ones derived from the index
        return cls({**combination, **args})

130 

131 

class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # 'rnd_args' refers to the very same dictionary object as 'args'
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault using the strategy that matches the type of `self.random_generator`: An integer is interpreted as
        the index of a combination of fault parameters, a random state is used to draw a random fault.

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        # numpy integers are accepted as well, since indices often come out of numpy computations
        if isinstance(self.random_generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(self.random_generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(
                f'Don\'t know how to add fault with generator of type {type(self.random_generator)}'
            )

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None

        Raises:
            NotImplementedError: If a stored fault is of a kind other than 'random' or 'combination'
        '''
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f'I don\'t know how to add stored fault of kind {f["kind"]}')

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if self.rnd_params == {}:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'random'}]
        else:
            self.faults += [
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            ]

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from the index of a combination of fault parameters, which is stored in
        `self.random_generator`, and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if self.rnd_params == {}:
            self.fault_init += [{'args': args, 'rnd_args': rnd_args, 'kind': 'combination'}]
        else:
            self.faults += [
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            ]

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything but 0
        '''
        L = step.levels[f.level_number]

        # only target 0, i.e. the solution, is available at the moment
        if f.target != 0:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # Target 0 means we flip a bit in the solution.
        #
        # To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
        # fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
        # tampered with after computing f(t, u).
        #
        # To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
        # fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
        # stopped.
        pos = tuple(f.problem_pos)
        _abs_before = abs(L.u[f.node][pos])
        L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
        # NOTE(review): for f.node == 0 this evaluates at the first collocation node rather than at L.time - confirm
        # that this is intended
        L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max([0, f.node - 1])])
        L.sweep.compute_residual()
        _abs_after = abs(L.u[f.node][pos])

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. Target: {f.target}. Abs: {_abs_before:.4e} -> {_abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if the number of bits of the data type is
                unknown or if there is more than one level
        '''

        super().pre_run(step, level_number)

        # exact type match: subclasses of mesh are rejected as well
        u0 = step.levels[level_number].u[0]
        if type(u0) is not mesh:
            raise NotImplementedError(f'Fault insertion is only implemented for type mesh, not {type(u0)}')

        dtype = step.levels[level_number].prob.u_exact(t=0).dtype
        if dtype in (float, np.float64):
            bit = 64
        elif dtype in (complex,):
            bit = 128
        else:
            raise NotImplementedError(f'Don\'t know how many bits type {dtype} has')

        # define parameters for randomization; anything that was set before takes precedence over these defaults
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': step.levels[level_number].u[0].shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults that have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError('I don\'t know how to insert faults in this multi-level madness :(')

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and self.timestep_idx != 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all pending faults with matching `when` attribute that are scheduled for the current iteration.

        Args:
            step (pySDC.Step.step): the current step
            when (str): Only faults whose `when` attribute equals this value are considered

        Returns:
            None
        '''
        # iterate over a copy, because injecting a fault removes it from self.faults
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and self.iter_idx != 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        # inject all faults that have not yet happened and are scheduled before this iteration
        self._inject_scheduled_faults(step, 'before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().post_iteration(step, level_number)

        # inject all faults that have not yet happened and are scheduled after this iteration
        self._inject_scheduled_faults(step, 'after')

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings, real part first.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not one of the above
        '''
        if type(f) in [np.complex128, complex]:
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'

        if type(f) in [np.float64, float]:
            conversion_code = '>d'  # big endian, double
        elif type(f) in [np.float32]:
            conversion_code = '>f'  # big endian, float
        else:
            raise NotImplementedError(f'Don\'t know how to convert number of type {type(f)} to binary')

        return ''.join(f'{byte:08b}' for byte in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                number of 128 bit length, where the first 64 bit are the real part

        Returns:
            (float) floating point representation of the binary string

        Raises:
            NotImplementedError: If the length of the string is none of 32, 64 or 128
        '''
        if len(s) == 128:  # complex floats
            return cls.to_float(s[0:64]) + cls.to_float(s[64:128]) * 1j

        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        else:
            raise NotImplementedError(f'Don\'t know how to convert string of length {len(s)} to float')

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in a target using the bitwise xor operator

        Args:
            target (float, np.float64, np.float32): the floating point number in which you want to flip a bit
            bit (int): the bit which you intend to flip, counted from the start of the big endian representation

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target
        '''
        binary = cls.to_binary(target)
        return cls.to_float(f'{binary[:bit]}{int(binary[bit]) ^ 1}{binary[bit+1:]}')

517 

518 

def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up a controller for a run with faults: Make sure it carries a fault injection hook and hand the relevant
    parameters from `fault_stuff` to that hook.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    if args is None:
        args = {}
    if rnd_args is None:
        rnd_args = {}

    # entries of fault_stuff take precedence over the problem specific defaults
    custom_rnd_params = fault_stuff.get('rnd_params', {})
    merged_rnd_args = {**rnd_args, **custom_rnd_params}

    hook = get_fault_injector_hook(controller)
    hook.random_generator = fault_stuff['rng']

    # copy over optional settings directly into the attributes of the hook
    for key in ('fault_frequency_iter',):
        if key in fault_stuff:
            vars(hook)[key] = fault_stuff[key]

    # a fault is only added as long as the hook has no parameters for randomization
    if not hook.rnd_params:
        hook.add_fault(
            rnd_args=merged_rnd_args,
            args={**args, **fault_stuff.get('args', {})},
        )

    for key, val in custom_rnd_params.items():
        hook.rnd_params[key] = val

    # the number of steps in the controller serves as default for the number of ranks
    hook.rnd_params['rank'] = {'rank': len(controller.MS), **merged_rnd_args}.get('rank', 1)

555 

556 

def get_fault_injector_hook(controller):
    """
    Look up the fault injector hook among the hooks of the controller.
    If the controller has none so far, one is added first.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    # only hooks of exactly this type count, not ones of derived classes
    injectors = [hook for hook in controller.hooks if type(hook) is FaultInjector]

    if not injectors:
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(injectors) == 1, f'Expected exactly one FaultInjector, got {len(injectors)}!'
    return injectors[0]