Coverage for pySDC/projects/Resilience/fault_injection.py: 91%

184 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-19 09:13 +0000

1import struct 

2import numpy as np 

3 

4from pySDC.core.hooks import Hooks 

5from pySDC.implementations.datatype_classes.mesh import mesh 

6from pySDC.helpers.pysdc_helper import FrozenClass 

7 

8 

def get_combination_from_index(index, options):
    """
    Transform an index into a combination of values, one value per entry in `options`. This is used when trying all
    possible combinations for fault insertion.

    The index is decomposed like a number in a mixed radix system: the first entry varies fastest and every entry is
    taken modulo the respective number of options.

    For instance if you want to insert a fault in any iteration in any node than you have k options for iterations and
    M options for the node for a total of M * k possibilities. You can then pass an index between 0 and M * k - 1 to
    this function together with the options [k, M] which will return a unique value for both k and M from the index.
    For instance, index = 0 will return [0, 0] and index = k * M - 1 will return [k-1, M-1]. Because the last entry is
    also taken modulo its number of options, index = k * M wraps around to [0, 0] again.

    Args:
        index (int): Index of the combination
        options (list): The number of options for each entry of the combination.

    Returns:
        list: One value for each entry in `options`, where the i-th value is the remainder modulo `options[i]`.
              An empty list if `options` is empty.
    """
    combination = []
    remainder = index
    for num_options in options:
        # peel off the fastest varying "digit" and carry the rest over to the next option
        remainder, digit = divmod(remainder, num_options)
        combination.append(digit)
    return combination

27 

28 

class Fault(FrozenClass):
    '''
    Class for storing all the data that belongs to a fault, i.e. when and where it happens
    '''

    def __init__(self, params=None):
        '''
        Initialization routine for faults

        Args:
            params (dict): Parameters regarding when the fault will be inserted. Every key is set as an attribute and
                           overrides the default assigned below.
        '''

        params = {} if params is None else params

        self.time = None
        self.timestep = None
        self.level_number = None
        self.iteration = None
        self.node = None
        self.problem_pos = None
        self.bit = None
        self.rank = None
        self.target = 0
        self.when = 'after'  # before or after an iteration?

        for k, v in params.items():
            setattr(self, k, v)

        self._freeze()

    @classmethod
    def random(cls, args, rnd_params, random_generator=None):
        '''
        Classmethod to initialize a random fault

        Args:
            args (dict): Supply variables that will be exempt from randomization here
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            random_generator (numpy.random.RandomState): Give a random generator to ensure repeatability

        Returns:
            Fault: Randomly generated fault
        '''

        if random_generator is None:
            random_generator = np.random.RandomState(2187)

        # NOTE: the order of the draws below determines which numbers of the random stream end up in which parameter,
        # so do not reorder the entries if you care about reproducing faults from a seed.
        random = {
            'level_number': random_generator.randint(low=0, high=rnd_params['level_number']),
            'node': random_generator.randint(low=rnd_params.get('min_node', 0), high=rnd_params['node'] + 1),
            'iteration': random_generator.randint(low=1, high=rnd_params['iteration'] + 1),
            'problem_pos': [random_generator.randint(low=0, high=i) for i in rnd_params['problem_pos']],
            'bit': random_generator.randint(low=0, high=rnd_params['bit']),
            'rank': random_generator.randint(low=0, high=rnd_params['rank']),
        }

        # values in args take precedence over the randomly drawn ones
        return cls({**random, **args})

    @classmethod
    def index_to_combination(cls, args, rnd_params, generator=None):
        '''
        Classmethod to initialize a fault based on an index to translate to a combination of fault parameters, in order
        to loop through all combinations. Probably only makes sense for ODEs.

        First, we get the number of options for each fault parameter from its range, then the index is decomposed into
        one offset per parameter using these numbers as moduli, and finally each offset is translated into a value
        within the range of the respective parameter.

        Args:
            args (dict): Supply variables that will be exempt from randomization here.
            rnd_params (dict): Supply attributes to the randomization such as maximum values here
            generator (int): Index for specific combination. The default of None is not a usable index.

        Returns:
            Fault: Generated from a specific combination of parameters
        '''

        # half-open ranges (lower, upper) of admissible values; the order here defines the order of `combinations`
        ranges = [
            (0, rnd_params['level_number']),
            (rnd_params.get('min_node', 0), rnd_params['node'] + 1),
            (1, rnd_params['iteration'] + 1),
            (0, rnd_params['bit']),
            (0, rnd_params['rank']),
        ]
        ranges += [(0, i) for i in rnd_params['problem_pos']]

        # get values for taking modulo later
        mods = [me[1] - me[0] for me in ranges]

        # get the combinations from the index
        combinations = get_combination_from_index(generator, mods)

        # translate every offset into a value within its own range. Each dimension of `problem_pos` has to be looked
        # up in its own range, otherwise dimensions of different size lead to index errors.
        values = [range(*bounds)[offset] for bounds, offset in zip(ranges, combinations)]

        # translate the combinations into a fault that we want to add
        combination = {
            'level_number': values[0],
            'node': values[1],
            'iteration': values[2],
            'bit': values[3],
            'rank': values[4],
            'problem_pos': values[5:],
        }

        # values in args take precedence over the ones from the combination
        return cls({**combination, **args})

130 

131 

class FaultInjector(Hooks):
    '''
    Class to use as base for hooks class instead of abstract hooks class to insert faults using hooks
    '''

    def __init__(self):
        '''
        Initialization routine
        '''
        super().__init__()
        self.fault_frequency_time = np.inf
        self.fault_frequency_iter = np.inf
        self.faults = []
        self.fault_init = []  # add faults to this list when the random parameters have not been set up yet
        self.rnd_params = {}
        self.random_generator = np.random.RandomState(2187)  # number of the cell in which Princess Leia is held

    @classmethod
    def generate_fault_stuff_single_fault(
        cls, bit=0, iteration=1, problem_pos=None, level_number=0, node=1, time=None, rank=0
    ):
        """
        Generate a fault stuff object which will insert a single fault at the supplied parameters. Because there will
        be some parameter set for everything, there is no randomization anymore.

        Args:
            bit (int): Which bit to flip
            iteration (int): After which iteration to flip
            problem_pos: Where in the problem to flip a bit, type depends on the problem
            level_number (int): In which level you want to flip
            node (int): In which node to flip
            time (float): The bitflip will occur in the time step after this time is reached
            rank (int): The rank you want to insert the fault into

        Returns:
            dict: Can be supplied to the run functions in the resilience project to generate the single fault
        """
        assert problem_pos is not None, "Please supply a spatial position for the fault as `problem_pos`!"
        assert time is not None, "Please supply a time for the fault as `time`!"
        fault_stuff = {
            'rng': np.random.RandomState(0),
            'args': {
                'bit': bit,
                'iteration': iteration,
                'level_number': level_number,
                'problem_pos': problem_pos,
                'node': node,
                'time': time,
                'rank': rank,
            },
        }
        # 'rnd_args' refers to the very same dictionary as 'args'
        fault_stuff['rnd_args'] = fault_stuff['args']
        return fault_stuff

    def add_fault(self, args, rnd_args):
        '''
        Add a fault with a strategy that depends on the type of `self.random_generator`: An integer is interpreted as
        the index of a combination of fault parameters, whereas a random state leads to a random fault.

        Args:
            args (dict): parameters for fault initialization that should not be generated
            rnd_args (dict): special parameters for generating the remaining fault parameters

        Returns:
            None

        Raises:
            NotImplementedError: If the generator is neither an integer nor a `numpy.random.RandomState`
        '''
        # numpy integers are accepted as well because indices often originate from numpy arrays or ranges
        if isinstance(self.random_generator, (int, np.integer)):
            self.add_fault_from_combination(args, rnd_args)
        elif isinstance(self.random_generator, np.random.RandomState):
            self.add_random_fault(args, rnd_args)
        else:
            raise NotImplementedError(
                f'Don\'t know how to add fault with generator of type {type(self.random_generator)}'
            )

    def add_stored_faults(self):
        '''
        Method to add faults that are recorded for later adding in the pre run hook

        Returns:
            None
        '''
        for f in self.fault_init:
            if f['kind'] == 'random':
                self.add_random_fault(args=f['args'], rnd_args=f['rnd_args'])
            elif f['kind'] == 'combination':
                self.add_fault_from_combination(args=f['args'], rnd_args=f['rnd_args'])
            else:
                raise NotImplementedError(f'I don\'t know how to add stored fault of kind {f["kind"]}')

    def add_random_fault(self, args=None, rnd_args=None):
        '''
        Method to generate a random fault and add it to the list of faults to be injected at some point

        Args:
            args (dict): parameters for fault initialization that should not be randomized
            rnd_args (dict): special parameters for randomization other than the default ones

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'random'})
        else:
            self.faults.append(
                Fault.random(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, random_generator=self.random_generator
                )
            )

        return None

    def add_fault_from_combination(self, args=None, rnd_args=None):
        '''
        Method to generate a fault from a combination of fault parameters and add it to the list of faults to be
        injected at some point. The index of the combination is taken from `self.random_generator`.

        Args:
            args (dict): parameters for fault initialization that override the combinations
            rnd_args (dict): possible values that the parameters can take

        Returns:
            None
        '''

        # replace args and rnd_args with empty dict if we didn't specify anything
        args = {} if args is None else args
        rnd_args = {} if rnd_args is None else rnd_args

        # check if we can add the fault directly, or if we have to store its parameters and add it in the pre_run hook
        if not self.rnd_params:
            self.fault_init.append({'args': args, 'rnd_args': rnd_args, 'kind': 'combination'})
        else:
            self.faults.append(
                Fault.index_to_combination(
                    args=args, rnd_params={**self.rnd_params, **rnd_args}, generator=self.random_generator
                )
            )

        return None

    def inject_fault(self, step, f):
        '''
        Method to inject a fault into a step.

        Args:
            step (pySDC.Step.step): Step to inject the fault into
            f (Fault): fault that should be injected

        Returns:
            None

        Raises:
            NotImplementedError: If the target of the fault is anything but 0
        '''
        L = step.levels[f.level_number]

        # insert the fault in some target
        if f.target != 0:
            raise NotImplementedError(f'Target {f.target} for faults not implemented!')

        # Target 0 means we flip a bit in the solution.
        #
        # To make sure the faults have some impact, we have to reevaluate the right hand side. Otherwise the fault is
        # fixed automatically in this implementation, as the right hand side is assembled only from f(t, u) and u is
        # tampered with after computing f(t, u).
        #
        # To be fair to iteration based resilience strategies, we also reevaluate the residual. Otherwise, when a
        # fault happens in the last iteration, it will not show up in the residual and the iteration is wrongly
        # stopped.
        pos = tuple(f.problem_pos)
        _abs_before = abs(L.u[f.node][pos])
        L.u[f.node][pos] = self.flip_bit(L.u[f.node][pos], f.bit)
        L.f[f.node] = L.prob.eval_f(L.u[f.node], L.time + L.dt * L.sweep.coll.nodes[max([0, f.node - 1])])
        L.sweep.compute_residual()
        _abs_after = abs(L.u[f.node][pos])

        # log what happened to stats and screen
        self.logger.info(
            f'Flipping bit {f.bit} {f.when} iteration {f.iteration} in node {f.node} on rank {f.rank}. Target: {f.target}. Abs: {_abs_before:.4e} -> {_abs_after:.4e}'
        )
        self.add_to_stats(
            process=step.status.slot,
            time=L.time,
            level=L.level_index,
            iter=step.status.iter,
            sweep=L.status.sweep,
            type='bitflip',
            value=(f.level_number, f.iteration, f.node, f.problem_pos, f.bit, f.target, f.rank),
        )

        # remove the fault from the list to make sure it happens only once
        self.faults.remove(f)

        return None

    def pre_run(self, step, level_number):
        '''
        Setup random parameters and add the faults that we couldn't before here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None

        Raises:
            NotImplementedError: If the solution is not of type mesh, if the number of bits of the data type is
                                 unknown or if there is more than one level
        '''

        super().pre_run(step, level_number)

        u0 = step.levels[level_number].u[0]

        # the check is deliberately for the exact type rather than for subclasses
        if type(u0) is not mesh:
            raise NotImplementedError(f'Fault insertion is only implemented for type mesh, not {type(u0)}')

        dtype = step.levels[level_number].prob.u_exact(t=0).dtype
        if dtype in [float, np.float64]:
            bit = 64
        elif dtype in [complex]:
            bit = 128
        else:
            raise NotImplementedError(f'Don\'t know how many bits type {dtype} has')

        # define parameters for randomization. Values that have been set before take precedence over these defaults.
        self.rnd_params = {
            'level_number': len(step.levels),
            'node': step.levels[0].sweep.params.num_nodes,
            'iteration': step.params.maxiter,
            'problem_pos': u0.shape,
            'bit': bit,  # change manually if you ever have something else
            'rank': 0,
            **self.rnd_params,
        }

        # initialize the faults have been added before we knew the random parameters
        if step.status.first:
            self.add_stored_faults()

        if self.rnd_params['level_number'] > 1:
            raise NotImplementedError('I don\'t know how to insert faults in this multi-level madness :(')

        # initialize parameters for periodic fault injection
        self.timestep_idx = 0
        self.iter_idx = 0

        return None

    def pre_step(self, step, level_number):
        '''
        Deal with periodic fault injection here:
         - Increment the index for counting time steps
         - Add a random fault in this time step if it is time for it based on the frequency

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''
        super().pre_step(step, level_number)

        self.timestep_idx += 1

        if self.timestep_idx % self.fault_frequency_time == 0 and not self.timestep_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx})

        return None

    def _inject_scheduled_faults(self, step, when):
        '''
        Inject all faults that have not yet happened and that are scheduled for the current point in the iteration

        Args:
            step (pySDC.Step.step): the current step
            when (str): Only faults with this value of the `when` attribute are considered ('before' or 'after')

        Returns:
            None
        '''
        # loop over a copy of the list since injecting a fault removes it from `self.faults`
        for f in [me for me in self.faults if me.when == when]:
            # based on iteration number
            if self.timestep_idx == f.timestep and step.status.iter == f.iteration:
                self.inject_fault(step, f)
            # based on time
            elif f.time is not None:
                if step.time > f.time and step.status.iter == f.iteration and step.status.slot == f.rank:
                    self.inject_fault(step, f)

        return None

    def pre_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here and deal with periodic injection per iteration count

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().pre_iteration(step, level_number)

        # check if the fault-free iteration count period has elapsed
        if self.iter_idx % self.fault_frequency_iter == 0 and not self.iter_idx == 0:
            self.add_random_fault(args={'timestep': self.timestep_idx, 'iteration': step.status.iter})

        # loop though all faults that have not yet happened and check if they are scheduled now
        self._inject_scheduled_faults(step, when='before')

        self.iter_idx += 1

        return None

    def post_iteration(self, step, level_number):
        '''
        Check if we have a fault that should be inserted here

        Args:
            step (pySDC.Step.step): the current step
            level_number (int): the current level number

        Returns:
            None
        '''

        super().post_iteration(step, level_number)

        # loop though all unhappened faults and check if they are scheduled now
        self._inject_scheduled_faults(step, when='after')

        return None

    @classmethod
    def to_binary(cls, f):
        '''
        Converts a single float in a string containing its binary representation in memory following IEEE754
        The struct.pack function returns the input with the applied conversion code in 8 bit blocks, which are then
        concatenated as a string. Complex numbers will be returned as two consecutive strings, real part first.

        Args:
            f (float, np.float64, np.float32, complex, np.complex128): number to be converted to binary representation

        Returns:
            (str) Binary representation of f following IEEE754 as a string

        Raises:
            NotImplementedError: If the type of f is not supported
        '''
        # np.complex128 inherits from complex and np.float64 inherits from float, so they are covered by these checks.
        # Builtin complex numbers need to be supported because that is what `to_float` returns for 128 bit strings.
        if isinstance(f, complex):
            return f'{cls.to_binary(f.real)}{cls.to_binary(f.imag)}'

        if isinstance(f, float):
            conversion_code = '>d'  # big endian, double
        elif isinstance(f, np.float32):
            conversion_code = '>f'  # big endian, float
        else:
            raise NotImplementedError(f'Don\'t know how to convert number of type {type(f)} to binary')

        return ''.join('{:0>8b}'.format(c) for c in struct.pack(conversion_code, f))

    @classmethod
    def to_float(cls, s):
        '''
        Converts a string of a IEEE754 binary representation in a float. The string is converted to integer with base 2
        and converted to bytes, which can be unpacked into a Python float by the struct module.

        Args:
            s (str): binary representation of a float number of 32 or 64 bit length following IEEE754, or of a complex
                     number of 128 bit length consisting of the real part followed by the imaginary part

        Returns:
            (float) floating point representation of the binary string, or a complex number for 128 bit strings

        Raises:
            NotImplementedError: If the length of the string is not supported
        '''
        if len(s) == 64:
            conversion_code = '>d'  # big endian, double
            byte_count = 8
        elif len(s) == 32:
            conversion_code = '>f'  # big endian, float
            byte_count = 4
        elif len(s) == 128:  # complex floats
            real = s[0:64]
            imag = s[64:128]
            return cls.to_float(real) + cls.to_float(imag) * 1j

        else:
            raise NotImplementedError(f'Don\'t know how to convert string of length {len(s)} to float')

        return struct.unpack(conversion_code, int(s, 2).to_bytes(byte_count, 'big'))[0]

    @classmethod
    def flip_bit(cls, target, bit):
        '''
        Flips a bit at position bit in a target using the bitwise xor operator

        Args:
            target (float, np.float64, np.float32): the floating point number in which you want to flip a bit
            bit (int): the bit which you intend to flip, counted from the start of the big endian representation

        Returns:
            (float) The floating point number resulting from flipping the respective bit in target
        '''
        binary = cls.to_binary(target)
        return cls.to_float(f'{binary[:bit]}{int(binary[bit]) ^ 1}{binary[bit+1:]}')

523 

524 

def prepare_controller_for_faults(controller, fault_stuff, rnd_args=None, args=None):
    """
    Set up the fault injection hook of a controller for a run with faults: The hook is fetched (or added), receives
    the generator and parameters from `fault_stuff` and a fault is registered if none has been set up before.

    Args:
        controller (pySDC.controller): The controller
        fault_stuff (dict): A dictionary with information on how to add faults
        rnd_args (dict): Default arguments for how to add random faults in a specific problem
        args (dict): Default arguments for where to add faults in a specific problem

    Returns:
        None
    """
    default_args = args if args is not None else {}
    default_rnd_args = rnd_args if rnd_args is not None else {}

    injector = get_fault_injector_hook(controller)
    injector.random_generator = fault_stuff['rng']

    # only the iteration based frequency is forwarded to the hook
    if 'fault_frequency_iter' in fault_stuff:
        vars(injector)['fault_frequency_iter'] = fault_stuff['fault_frequency_iter']

    # an empty parameter dictionary means that no fault has been registered through this function yet
    if not injector.rnd_params:
        injector.add_fault(
            rnd_args={**default_rnd_args, **fault_stuff.get('rnd_params', {})},
            args={**default_args, **fault_stuff.get('args', {})},
        )

    injector.rnd_params.update(fault_stuff.get('rnd_params', {}))

    # the number of steps in the controller is the default for the rank, but the supplied parameters take precedence
    rank_sources = {'rank': len(controller.MS), **default_rnd_args, **fault_stuff.get('rnd_params', {})}
    injector.rnd_params['rank'] = rank_sources['rank']

561 

562 

def get_fault_injector_hook(controller):
    """
    Look up the fault injector hook among the hooks of the controller.
    If the controller does not have one yet, it is added first.

    Args:
        controller (pySDC.controller): The controller

    Returns:
        pySDC.hook.FaultInjector: The fault injecting hook
    """
    # only hooks of exactly this type count, not subclasses
    injectors = [hook for hook in controller.hooks if type(hook) is FaultInjector]

    if not injectors:
        controller.add_hook(FaultInjector)
        return get_fault_injector_hook(controller)

    assert len(injectors) == 1, f'Expected exactly one FaultInjector, got {len(injectors)}!'
    return injectors[0]