Coverage for pySDC/projects/Resilience/fault_injection.py: 91%

184 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-20 14:51 +0000

1import struct 

2import numpy as np 

3 

4from pySDC.core.hooks import Hooks 

5from pySDC.implementations.datatype_classes.mesh import mesh 

6from pySDC.helpers.pysdc_helper import FrozenClass 

7 

8 

def get_combination_from_index(index, options):
    """
    Transform an index into a unique combination of choices (a mixed-radix decomposition of the index).

    This is used when trying all possible combinations for fault insertion. For instance, if you want to insert a
    fault in any iteration in any node, then you have k options for the iteration and M options for the node, for a
    total of M * k possibilities. You can then pass an index between 0 and M * k - 1 to this function together with
    the options [k, M], which will return a unique value for both k and M from the index. For instance, index = 0
    will return [0, 0] and index = k * M - 1 will return [k-1, M-1]. Larger indices wrap around, i.e. index = k * M
    returns [0, 0] again.

    Args:
        index (int): Index of the combination
        options (list): The number of options for each entry of the combination

    Returns:
        list: One value per entry in `options`; entry i lies in `range(options[i])`. Empty if `options` is empty.
    """
    combination = []
    remainder = index

    # the first entry of `options` varies fastest with the index, the last one slowest
    for num_options in options:
        combination.append(remainder % num_options)
        remainder //= num_options

    return combination

27 

28 

class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        Args:
            params (dict): Parameters regarding when the fault will be inserted. Every key is set as an attribute
                and overrides the default of the same name.
        '''

        params = {} if params is None else params

        self.time = None
        self.timestep = None
        self.level_number = None
        self.iteration = None
        self.node = None
        self.problem_pos = None
        self.bit = None
        self.rank = None
        self.target = 0
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''

        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # All values are drawn, even those overridden by `args`, and always in this order. Changing the order
        # changes which faults a given seed produces.
        randomized = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }
        return cls({**randomized, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of possible values for each fault parameter, and then get a value for each fault
        parameter as i = m % i_max (plus modifications to make sure we get a sensible value)

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            generator (int): Index for specific combination

        Returns:
            Fault: Generated from a specific combination of parameters
        '''

        # half-open (start, stop) intervals of admissible values, one per fault parameter
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        # one additional interval per dimension of the problem
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [stop - start for start, stop in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # Shift every offset into its own interval. Each spatial dimension has to use its own range: using the range
        # of the first dimension for all of them fails as soon as the dimensions differ in size.
        values = [range(start, stop)[offset] for (start, stop), offset in zip(ranges, combinations)]

        # translate the combinations into a fault that we want to add
        combination = {
            'level_number': values[0],
            'node': values[1],
            'iteration': values[2],
            'bit': values[3],
            'rank': values[4],
            'problem_pos': values[5:],
        }

        return cls({**combination, **args})

130 

131 

class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # NOTE(review): this is the very same dict object as 'args', not a copy. `prepare_controller_for_faults`
        # reads the key 'rnd_params' rather than 'rnd_args' - confirm which consumer relies on this entry.
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault, choosing the strategy based on the type of `self.random_generator`: an integer is interpreted as
        the index of a specific combination of fault parameters, a random state triggers a random fault.

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        if isinstance(self.random_generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(self.random_generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(
                f"Don't know how to add fault with generator of type {type(self.random_generator)}"
            )

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None

        Raises:
            NotImplementedError: If a stored fault has an unknown kind
        '''
        # NOTE(review): the stored faults are not removed from `self.fault_init` here, so they are added again
        # every time this is called - presumably intended for repeated runs with the same hook; confirm.
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f"I don't know how to add stored fault of kind {f['kind']}")

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'random'})
        else:
            self.faults.append(
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            )

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from a specific combination of fault parameters and add it to the list of faults
        to be injected at some point. The combination is selected by the integer stored in `self.random_generator`.

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'combination'})
        else:
            self.faults.append(
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            )

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything but 0
        '''
        L = step.levels[f.level_number]

        # insert the fault in some target
        if f.target != 0:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # Target 0 means we flip a bit in the solution.
        #
        # To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
        # fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
        # tampered with after computing f(t, u).
        #
        # To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
        # fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
        # stopped.
        pos = tuple(f.problem_pos)
        _abs_before = abs(L.u[f.node][pos])
        L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
        L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max([0, f.node - 1])])
        L.sweep.compute_residual()
        _abs_after = abs(L.u[f.node][pos])

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. Target: {f.target}. Abs: {_abs_before:.4e} -> {_abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if its data type is unknown or if there is more
                than one level
        '''

        super().pre_run(step, level_number)

        u0 = step.levels[level_number].u[0]
        # deliberately an exact type check: subclasses of mesh are rejected as well
        if type(u0) is not mesh:
            raise NotImplementedError(f'Fault insertion is only implemented for type mesh, not {type(u0)}')

        dtype = step.levels[level_number].prob.u_exact(t=0).dtype
        if dtype in [float, np.float64]:
            bit = 64
        elif dtype in [complex]:
            bit = 128
        else:
            raise NotImplementedError(f"Don't know how many bits type {dtype} has")

        # define parameters for randomization; values that were set before take precedence over these defaults
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': u0.shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults that have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError("I don't know how to insert faults in this multi-level madness :(")

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and not self.timestep_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all pending faults with matching `when` attribute that are scheduled for the current iteration.

        Args:
            step (pySDC.Step.step): the current step
            when (str): Select the faults to be injected 'before' or 'after' the iteration

        Returns:
            None
        '''
        # loop over a snapshot of the pending faults, because injecting a fault removes it from `self.faults`
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

        return None

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and not self.iter_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        # inject all faults that have not yet happened and are scheduled before this iteration
        self._inject_scheduled_faults(step, 'before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().post_iteration(step, level_number)

        # inject all faults that have not yet happened and are scheduled after this iteration
        self._inject_scheduled_faults(step, 'after')

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings, real part first.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not supported
        '''
        if type(f) in [np.float64, float]:
            conversion_code = '>d'  # big endian, double
        elif type(f) in [np.float32]:
            conversion_code = '>f'  # big endian, float
        elif type(f) in [np.complex128, complex]:
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'
        else:
            raise NotImplementedError(f"Don't know how to convert number of type {type(f)} to binary")

        return ''.join('{:0>8b}'.format(c) for c in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                number of 128 bit length (64 bit real part followed by 64 bit imaginary part)

        Returns:
            (float) floating point representation of the binary string, complex for strings of length 128

        Raises:
            NotImplementedError: If the length of the string is not supported
        '''
        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        elif len(s) == 128:  # complex floats
            real = s[0:64]
            imag = s[64:128]
            return cls.to_float(real) + cls.to_float(imag) * 1j

        else:
            raise NotImplementedError(f"Don't know how to convert string of length {len(s)} to float")

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in a target.

        Bits are counted from the start of the big endian binary representation, so bit 0 is the sign bit (of the
        real part for complex numbers). Negative values count from the end, like Python indices.

        Args:
            target (float, np.float64, np.float32, complex, np.complex128): the number in which you want to flip a bit
            bit (int): the bit which you intend to flip

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target

        Raises:
            IndexError: If the bit is outside of the binary representation of the target
        '''
        binary = cls.to_binary(target)
        num_bits = len(binary)

        if not -num_bits <= bit < num_bits:
            raise IndexError(f'Cannot flip bit {bit} in a number with {num_bits} bits')

        # Normalize negative indices. Without this, bit = -1 makes the slice `binary[bit + 1:]` cover the whole
        # string, which doubles the length and e.g. turns a 64 bit float into a complex number.
        bit = bit % num_bits

        flipped = '0' if binary[bit] == '1' else '1'
        return cls.to_float(f'{binary[:bit]}{flipped}{binary[bit + 1:]}')

521 

522 

def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up a controller for a run with faults: Make sure it carries a fault injection hook and hand the relevant
    parameters to that hook.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults. Needs the key 'rng' and may carry the
            keys 'fault_frequency_iter', 'rnd_params' and 'args'.
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    if args is None:
        args = {}
    if rnd_args is None:
        rnd_args = {}

    injector = get_fault_injector_hook(controller)
    injector.random_generator = fault_stuff['rng']

    # copy optional settings directly onto the hook
    for key in ['fault_frequency_iter']:
        if key in fault_stuff:
            vars(injector)[key] = fault_stuff[key]

    # a fault is only added as long as the hook has no randomization parameters
    if len(injector.rnd_params) == 0:
        injector.add_fault(
            rnd_args={**rnd_args, **fault_stuff.get('rnd_params', {})},
            args={**args, **fault_stuff.get('args', {})},
        )

    injector.rnd_params.update(fault_stuff.get('rnd_params', {}))

    # the number of ranks defaults to the number of steps in the controller, unless it is overridden by `rnd_args`
    # or by the 'rnd_params' in `fault_stuff`, with the latter taking precedence
    rank_candidates = {'rank': len(controller.MS), **rnd_args, **fault_stuff.get('rnd_params', {})}
    injector.rnd_params['rank'] = rank_candidates.get('rank', 1)

559 

560 

def get_fault_injector_hook(controller):
    """
    Return the fault injector hook of the controller. If the controller has no such hook yet, one is added first.

    Only hooks whose type is exactly `FaultInjector` are considered, subclasses do not count.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    injectors = [hook for hook in controller.hooks if type(hook) is FaultInjector]

    if not injectors:
        # register the hook and search again
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(injectors) == 1, f'Expected exactly one FaultInjector, got {len(injectors)}!'
    return injectors[0]