Cached activations:
['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized',
'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q',
'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z',
'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized',
'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_pre_linear', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out',
'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized',
'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q',
'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z',
'blocks.1.hook_attn_out', 'blocks.1.hook_resid_mid', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized',
'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_pre_linear', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out',
'blocks.1.hook_resid_post', 'blocks.2.hook_resid_pre', 'blocks.2.ln1.hook_scale', 'blocks.2.ln1.hook_normalized',
'blocks.2.attn.hook_q', 'blocks.2.attn.hook_k', 'blocks.2.attn.hook_v', 'blocks.2.attn.hook_rot_q',
'blocks.2.attn.hook_rot_k', 'blocks.2.attn.hook_attn_scores', 'blocks.2.attn.hook_pattern', 'blocks.2.attn.hook_z',
'blocks.2.hook_attn_out', 'blocks.2.hook_resid_mid', 'blocks.2.ln2.hook_scale', 'blocks.2.ln2.hook_normalized',
'blocks.2.mlp.hook_pre', 'blocks.2.mlp.hook_pre_linear', 'blocks.2.mlp.hook_post', 'blocks.2.hook_mlp_out',
'blocks.2.hook_resid_post', 'blocks.3.hook_resid_pre', 'blocks.3.ln1.hook_scale', 'blocks.3.ln1.hook_normalized',
'blocks.3.attn.hook_q', 'blocks.3.attn.hook_k', 'blocks.3.attn.hook_v', 'blocks.3.attn.hook_rot_q',
'blocks.3.attn.hook_rot_k', 'blocks.3.attn.hook_attn_scores', 'blocks.3.attn.hook_pattern', 'blocks.3.attn.hook_z',
'blocks.3.hook_attn_out', 'blocks.3.hook_resid_mid', 'blocks.3.ln2.hook_scale', 'blocks.3.ln2.hook_normalized',
'blocks.3.mlp.hook_pre', 'blocks.3.mlp.hook_pre_linear', 'blocks.3.mlp.hook_post', 'blocks.3.hook_mlp_out',
'blocks.3.hook_resid_post', 'blocks.4.hook_resid_pre', 'blocks.4.ln1.hook_scale', 'blocks.4.ln1.hook_normalized',
'blocks.4.attn.hook_q', 'blocks.4.attn.hook_k', 'blocks.4.attn.hook_v', 'blocks.4.attn.hook_rot_q',
'blocks.4.attn.hook_rot_k', 'blocks.4.attn.hook_attn_scores', 'blocks.4.attn.hook_pattern', 'blocks.4.attn.hook_z',
'blocks.4.hook_attn_out', 'blocks.4.hook_resid_mid', 'blocks.4.ln2.hook_scale', 'blocks.4.ln2.hook_normalized',
'blocks.4.mlp.hook_pre', 'blocks.4.mlp.hook_pre_linear', 'blocks.4.mlp.hook_post', 'blocks.4.hook_mlp_out',
'blocks.4.hook_resid_post', 'blocks.5.hook_resid_pre', 'blocks.5.ln1.hook_scale', 'blocks.5.ln1.hook_normalized',
'blocks.5.attn.hook_q', 'blocks.5.attn.hook_k', 'blocks.5.attn.hook_v', 'blocks.5.attn.hook_rot_q',
'blocks.5.attn.hook_rot_k', 'blocks.5.attn.hook_attn_scores', 'blocks.5.attn.hook_pattern', 'blocks.5.attn.hook_z',
'blocks.5.hook_attn_out', 'blocks.5.hook_resid_mid', 'blocks.5.ln2.hook_scale', 'blocks.5.ln2.hook_normalized',
'blocks.5.mlp.hook_pre', 'blocks.5.mlp.hook_pre_linear', 'blocks.5.mlp.hook_post', 'blocks.5.hook_mlp_out',
'blocks.5.hook_resid_post', 'blocks.6.hook_resid_pre', 'blocks.6.ln1.hook_scale', 'blocks.6.ln1.hook_normalized',
'blocks.6.attn.hook_q', 'blocks.6.attn.hook_k', 'blocks.6.attn.hook_v', 'blocks.6.attn.hook_rot_q',
'blocks.6.attn.hook_rot_k', 'blocks.6.attn.hook_attn_scores', 'blocks.6.attn.hook_pattern', 'blocks.6.attn.hook_z',
'blocks.6.hook_attn_out', 'blocks.6.hook_resid_mid', 'blocks.6.ln2.hook_scale', 'blocks.6.ln2.hook_normalized',
'blocks.6.mlp.hook_pre', 'blocks.6.mlp.hook_pre_linear', 'blocks.6.mlp.hook_post', 'blocks.6.hook_mlp_out',
'blocks.6.hook_resid_post', 'blocks.7.hook_resid_pre', 'blocks.7.ln1.hook_scale', 'blocks.7.ln1.hook_normalized',
'blocks.7.attn.hook_q', 'blocks.7.attn.hook_k', 'blocks.7.attn.hook_v', 'blocks.7.attn.hook_rot_q',
'blocks.7.attn.hook_rot_k', 'blocks.7.attn.hook_attn_scores', 'blocks.7.attn.hook_pattern', 'blocks.7.attn.hook_z',
'blocks.7.hook_attn_out', 'blocks.7.hook_resid_mid', 'blocks.7.ln2.hook_scale', 'blocks.7.ln2.hook_normalized',
'blocks.7.mlp.hook_pre', 'blocks.7.mlp.hook_pre_linear', 'blocks.7.mlp.hook_post', 'blocks.7.hook_mlp_out',
'blocks.7.hook_resid_post', 'blocks.8.hook_resid_pre', 'blocks.8.ln1.hook_scale', 'blocks.8.ln1.hook_normalized',
'blocks.8.attn.hook_q', 'blocks.8.attn.hook_k', 'blocks.8.attn.hook_v', 'blocks.8.attn.hook_rot_q',
'blocks.8.attn.hook_rot_k', 'blocks.8.attn.hook_attn_scores', 'blocks.8.attn.hook_pattern', 'blocks.8.attn.hook_z',
'blocks.8.hook_attn_out', 'blocks.8.hook_resid_mid', 'blocks.8.ln2.hook_scale', 'blocks.8.ln2.hook_normalized',
'blocks.8.mlp.hook_pre', 'blocks.8.mlp.hook_pre_linear', 'blocks.8.mlp.hook_post', 'blocks.8.hook_mlp_out',
'blocks.8.hook_resid_post', 'blocks.9.hook_resid_pre', 'blocks.9.ln1.hook_scale', 'blocks.9.ln1.hook_normalized',
'blocks.9.attn.hook_q', 'blocks.9.attn.hook_k', 'blocks.9.attn.hook_v', 'blocks.9.attn.hook_rot_q',
'blocks.9.attn.hook_rot_k', 'blocks.9.attn.hook_attn_scores', 'blocks.9.attn.hook_pattern', 'blocks.9.attn.hook_z',
'blocks.9.hook_attn_out', 'blocks.9.hook_resid_mid', 'blocks.9.ln2.hook_scale', 'blocks.9.ln2.hook_normalized',
'blocks.9.mlp.hook_pre', 'blocks.9.mlp.hook_pre_linear', 'blocks.9.mlp.hook_post', 'blocks.9.hook_mlp_out',
'blocks.9.hook_resid_post', 'blocks.10.hook_resid_pre', 'blocks.10.ln1.hook_scale',
'blocks.10.ln1.hook_normalized', 'blocks.10.attn.hook_q', 'blocks.10.attn.hook_k', 'blocks.10.attn.hook_v',
'blocks.10.attn.hook_rot_q', 'blocks.10.attn.hook_rot_k', 'blocks.10.attn.hook_attn_scores',
'blocks.10.attn.hook_pattern', 'blocks.10.attn.hook_z', 'blocks.10.hook_attn_out', 'blocks.10.hook_resid_mid',
'blocks.10.ln2.hook_scale', 'blocks.10.ln2.hook_normalized', 'blocks.10.mlp.hook_pre',
'blocks.10.mlp.hook_pre_linear', 'blocks.10.mlp.hook_post', 'blocks.10.hook_mlp_out', 'blocks.10.hook_resid_post',
'blocks.11.hook_resid_pre', 'blocks.11.ln1.hook_scale', 'blocks.11.ln1.hook_normalized', 'blocks.11.attn.hook_q',
'blocks.11.attn.hook_k', 'blocks.11.attn.hook_v', 'blocks.11.attn.hook_rot_q', 'blocks.11.attn.hook_rot_k',
'blocks.11.attn.hook_attn_scores', 'blocks.11.attn.hook_pattern', 'blocks.11.attn.hook_z',
'blocks.11.hook_attn_out', 'blocks.11.hook_resid_mid', 'blocks.11.ln2.hook_scale', 'blocks.11.ln2.hook_normalized',
'blocks.11.mlp.hook_pre', 'blocks.11.mlp.hook_pre_linear', 'blocks.11.mlp.hook_post', 'blocks.11.hook_mlp_out',
'blocks.11.hook_resid_post', 'blocks.12.hook_resid_pre', 'blocks.12.ln1.hook_scale',
'blocks.12.ln1.hook_normalized', 'blocks.12.attn.hook_q', 'blocks.12.attn.hook_k', 'blocks.12.attn.hook_v',
'blocks.12.attn.hook_rot_q', 'blocks.12.attn.hook_rot_k', 'blocks.12.attn.hook_attn_scores',
'blocks.12.attn.hook_pattern', 'blocks.12.attn.hook_z', 'blocks.12.hook_attn_out', 'blocks.12.hook_resid_mid',
'blocks.12.ln2.hook_scale', 'blocks.12.ln2.hook_normalized', 'blocks.12.mlp.hook_pre',
'blocks.12.mlp.hook_pre_linear', 'blocks.12.mlp.hook_post', 'blocks.12.hook_mlp_out', 'blocks.12.hook_resid_post',
'blocks.13.hook_resid_pre', 'blocks.13.ln1.hook_scale', 'blocks.13.ln1.hook_normalized', 'blocks.13.attn.hook_q',
'blocks.13.attn.hook_k', 'blocks.13.attn.hook_v', 'blocks.13.attn.hook_rot_q', 'blocks.13.attn.hook_rot_k',
'blocks.13.attn.hook_attn_scores', 'blocks.13.attn.hook_pattern', 'blocks.13.attn.hook_z',
'blocks.13.hook_attn_out', 'blocks.13.hook_resid_mid', 'blocks.13.ln2.hook_scale', 'blocks.13.ln2.hook_normalized',
'blocks.13.mlp.hook_pre', 'blocks.13.mlp.hook_pre_linear', 'blocks.13.mlp.hook_post', 'blocks.13.hook_mlp_out',
'blocks.13.hook_resid_post', 'blocks.14.hook_resid_pre', 'blocks.14.ln1.hook_scale',
'blocks.14.ln1.hook_normalized', 'blocks.14.attn.hook_q', 'blocks.14.attn.hook_k', 'blocks.14.attn.hook_v',
'blocks.14.attn.hook_rot_q', 'blocks.14.attn.hook_rot_k', 'blocks.14.attn.hook_attn_scores',
'blocks.14.attn.hook_pattern', 'blocks.14.attn.hook_z', 'blocks.14.hook_attn_out', 'blocks.14.hook_resid_mid',
'blocks.14.ln2.hook_scale', 'blocks.14.ln2.hook_normalized', 'blocks.14.mlp.hook_pre',
'blocks.14.mlp.hook_pre_linear', 'blocks.14.mlp.hook_post', 'blocks.14.hook_mlp_out', 'blocks.14.hook_resid_post',
'blocks.15.hook_resid_pre', 'blocks.15.ln1.hook_scale', 'blocks.15.ln1.hook_normalized', 'blocks.15.attn.hook_q',
'blocks.15.attn.hook_k', 'blocks.15.attn.hook_v', 'blocks.15.attn.hook_rot_q', 'blocks.15.attn.hook_rot_k',
'blocks.15.attn.hook_attn_scores', 'blocks.15.attn.hook_pattern', 'blocks.15.attn.hook_z',
'blocks.15.hook_attn_out', 'blocks.15.hook_resid_mid', 'blocks.15.ln2.hook_scale', 'blocks.15.ln2.hook_normalized',
'blocks.15.mlp.hook_pre', 'blocks.15.mlp.hook_pre_linear', 'blocks.15.mlp.hook_post', 'blocks.15.hook_mlp_out',
'blocks.15.hook_resid_post', 'ln_final.hook_scale', 'ln_final.hook_normalized']
Residual activations' shape:
torch.Size([520, 29, 2048])
# GPU used after: 2940
# GPU cached after: 2970
Duration: 16.63542079925537