diff mbox

[RFC,net-next,2/6] x86: bpf_jit_comp: support BPF_S_ANC_SECCOMP_LD_W instruction

Message ID 1366962706-24204-3-git-send-email-xi.wang@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Xi Wang April 26, 2013, 7:51 a.m. UTC
This patch implements the seccomp BPF_S_ANC_SECCOMP_LD_W instruction
in x86 JIT.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
---
 arch/x86/net/bpf_jit_comp.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

Comments

Eric Dumazet April 26, 2013, 2:18 p.m. UTC | #1
On Fri, 2013-04-26 at 03:51 -0400, Xi Wang wrote:

> +#ifdef CONFIG_SECCOMP_FILTER
> +			case BPF_S_ANC_SECCOMP_LD_W:
> +				if (K == offsetof(struct seccomp_data, arch)) {
> +					int arch = syscall_get_arch(current, NULL);
> +
> +					EMIT1_off32(0xb8, arch); /* mov arch,%eax */
> +					break;
> +				}
> +				func = (u8 *)seccomp_bpf_load;
> +				t_offset = func - (image + addrs[i]);
> +				EMIT1_off32(0xbf, K); /* mov imm32,%edi */
> +				EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
> +				break;
> +#endif

This seems seriously wrong to me.

This cannot have been tested at all.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xi Wang April 26, 2013, 2:50 p.m. UTC | #2
On Fri, Apr 26, 2013 at 10:18 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-04-26 at 03:51 -0400, Xi Wang wrote:
>
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +                     case BPF_S_ANC_SECCOMP_LD_W:
>> +                             if (K == offsetof(struct seccomp_data, arch)) {
>> +                                     int arch = syscall_get_arch(current, NULL);
>> +
>> +                                     EMIT1_off32(0xb8, arch); /* mov arch,%eax */
>> +                                     break;
>> +                             }
>> +                             func = (u8 *)seccomp_bpf_load;
>> +                             t_offset = func - (image + addrs[i]);
>> +                             EMIT1_off32(0xbf, K); /* mov imm32,%edi */
>> +                             EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
>> +                             break;
>> +#endif
>
> This seems seriously wrong to me.

Can you elaborate?

> This cannot have been tested at all.

Thanks to QEMU for hiding bugs then. :)

- xi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2013, 3:11 p.m. UTC | #3
On Fri, 2013-04-26 at 10:50 -0400, Xi Wang wrote:
> On Fri, Apr 26, 2013 at 10:18 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Fri, 2013-04-26 at 03:51 -0400, Xi Wang wrote:
> >
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +                     case BPF_S_ANC_SECCOMP_LD_W:
> >> +                             if (K == offsetof(struct seccomp_data, arch)) {
> >> +                                     int arch = syscall_get_arch(current, NULL);
> >> +
> >> +                                     EMIT1_off32(0xb8, arch); /* mov arch,%eax */
> >> +                                     break;
> >> +                             }
> >> +                             func = (u8 *)seccomp_bpf_load;
> >> +                             t_offset = func - (image + addrs[i]);
> >> +                             EMIT1_off32(0xbf, K); /* mov imm32,%edi */
> >> +                             EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
> >> +                             break;
> >> +#endif
> >
> > This seems seriously wrong to me.
> 
> Can you elaborate?
> 
> > This cannot have been tested at all.
> 
> Thanks to QEMU for hiding bugs then. :)



1) 'current' at the time the code is jitted (compiled) is not the
'current' at the time the filter will be evaluated.

On x86_64, if CONFIG_IA32_EMULATION=y, syscall_get_arch() evaluates to :

if (task_thread_info(task)->status & TS_COMPAT)
	return AUDIT_ARCH_I386;
return AUDIT_ARCH_X86_64;

So your code is completely wrong.

2) Calling a function potentially destroys some registers.
   %rdi,%r8,%r9 for instance, so we are going to crash very easily.

I dont know, I feel a bit uncomfortable having to explain this to
someone sending security related patches...




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight April 26, 2013, 3:15 p.m. UTC | #4
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +                     case BPF_S_ANC_SECCOMP_LD_W:
> >> +                             if (K == offsetof(struct seccomp_data, arch)) {
> >> +                                     int arch = syscall_get_arch(current, NULL);
> >> +
> >> +                                     EMIT1_off32(0xb8, arch); /* mov arch,%eax */
> >> +                                     break;
> >> +                             }
> >> +                             func = (u8 *)seccomp_bpf_load;
> >> +                             t_offset = func - (image + addrs[i]);
> >> +                             EMIT1_off32(0xbf, K); /* mov imm32,%edi */
> >> +                             EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
> >> +                             break;
> >> +#endif
> >
> > This seems seriously wrong to me.
> 
> Can you elaborate?

The 'call seccomp_bpf_load' needs a pc-relative offset,
I assume that is what EMIT1_off32() generates.

The other two instructions want an absolute 32 bit value...

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2013, 3:27 p.m. UTC | #5
On Fri, 2013-04-26 at 16:15 +0100, David Laight wrote:
> > >> +#ifdef CONFIG_SECCOMP_FILTER
> > >> +                     case BPF_S_ANC_SECCOMP_LD_W:
> > >> +                             if (K == offsetof(struct seccomp_data, arch)) {
> > >> +                                     int arch = syscall_get_arch(current, NULL);
> > >> +
> > >> +                                     EMIT1_off32(0xb8, arch); /* mov arch,%eax */
> > >> +                                     break;
> > >> +                             }
> > >> +                             func = (u8 *)seccomp_bpf_load;
> > >> +                             t_offset = func - (image + addrs[i]);
> > >> +                             EMIT1_off32(0xbf, K); /* mov imm32,%edi */
> > >> +                             EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
> > >> +                             break;
> > >> +#endif
> > >
> > > This seems seriously wrong to me.
> > 
> > Can you elaborate?
> 
> The 'call seccomp_bpf_load' needs a pc-relative offset,
> I assume that is what EMIT1_off32() generates.
> 
> The other two instructions want an absolute 32 bit value...

Hmm, this part is fine, we perform the relative adjustments in 
t_offset = func - (image + addrs[i]);




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xi Wang April 26, 2013, 3:29 p.m. UTC | #6
On Fri, Apr 26, 2013 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 2) Calling a function potentially destroys some registers.
>    %rdi,%r8,%r9 for instance, so we are going to crash very easily.
>
> I dont know, I feel a bit uncomfortable having to explain this to
> someone sending security related patches...

My old code did save these registers.  But, do we really need that for
seccomp?  For example, %rdi (skb) is always NULL and never used by
seccomp filters.  Did I miss anything?

- xi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight April 26, 2013, 3:38 p.m. UTC | #7
> > > >> +                             func = (u8 *)seccomp_bpf_load;

> > > >> +                             t_offset = func - (image + addrs[i]);

> > > >> +                             EMIT1_off32(0xbf, K); /* mov imm32,%edi */

> > > >> +                             EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */

> > > >> +                             break;

> > > >> +#endif

> > > >

> > > > This seems seriously wrong to me.

> > >

> > > Can you elaborate?

> >

> > The 'call seccomp_bpf_load' needs a pc-relative offset,

> > I assume that is what EMIT1_off32() generates.

> >

> > The other two instructions want an absolute 32 bit value...

> 

> Hmm, this part is fine, we perform the relative adjustments in

> t_offset = func - (image + addrs[i]);


The call needs the displacement from the address of
the instruction following the call.
I can't imagine any way in which above can allow for the 5 byte
'mov imm32,%edi' instruction.

I'd have thought there would be an EMIT1_imm32().
(I've written a lot of x86 asm in my days!)

	David
Eric Dumazet April 26, 2013, 3:43 p.m. UTC | #8
On Fri, 2013-04-26 at 11:29 -0400, Xi Wang wrote:
> On Fri, Apr 26, 2013 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > 2) Calling a function potentially destroys some registers.
> >    %rdi,%r8,%r9 for instance, so we are going to crash very easily.
> >
> > I dont know, I feel a bit uncomfortable having to explain this to
> > someone sending security related patches...
> 
> My old code did save these registers.  But, do we really need that for
> seccomp?  For example, %rdi (skb) is always NULL and never used by
> seccomp filters.  Did I miss anything?

I do not know.

This is not explained in your changelog or in any comment.

You have to make the full analysis yourself and make us comfortable with
the results.

You send patches and ask us to spend hours on it, this is not how it
works.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2013, 3:46 p.m. UTC | #9
On Fri, 2013-04-26 at 16:38 +0100, David Laight wrote:

> The call needs the displacement from the address of
> the instruction following the call.
> I can't imagine any way in which above can allow for the 5 byte
> 'mov imm32,%edi' instruction.
> 
> I'd have thought there would be an EMIT1_imm32().
> (I've written a lot of x86 asm in my days!)
> 

Thats the same. 

off32 is a 32bit quantity, or immediate 32 if you prefer.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xi Wang April 26, 2013, 3:57 p.m. UTC | #10
On Fri, Apr 26, 2013 at 11:43 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> I do not know.
>
> This is not explained in your changelog or in any comment.
>
> You have to make the full analysis yourself and make us comfortable with
> the results.
>
> You send patches and ask us to spend hours on it, this is not how it
> works.

"do we really need that for seccomp?" is not asking.  I just tried to
explain in a gentle way.

%rdi,%r8,%r9 are not used by seccomp filters so I removed the push/pop
part.  I agree that I should explain the details in the code comments
or logs.  Thanks for catching that.

- xi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xi Wang April 26, 2013, 4:02 p.m. UTC | #11
On Fri, Apr 26, 2013 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 1) 'current' at the time the code is jitted (compiled) is not the
> 'current' at the time the filter will be evaluated.
>
> On x86_64, if CONFIG_IA32_EMULATION=y, syscall_get_arch() evaluates to :
>
> if (task_thread_info(task)->status & TS_COMPAT)
>         return AUDIT_ARCH_I386;
> return AUDIT_ARCH_X86_64;
>
> So your code is completely wrong.

Just to be clear, are you worrying about a process changing its
personality after installing seccomp filters?

- xi
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2013, 4:14 p.m. UTC | #12
On Fri, 2013-04-26 at 12:02 -0400, Xi Wang wrote:
> On Fri, Apr 26, 2013 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > 1) 'current' at the time the code is jitted (compiled) is not the
> > 'current' at the time the filter will be evaluated.
> >
> > On x86_64, if CONFIG_IA32_EMULATION=y, syscall_get_arch() evaluates to :
> >
> > if (task_thread_info(task)->status & TS_COMPAT)
> >         return AUDIT_ARCH_I386;
> > return AUDIT_ARCH_X86_64;
> >
> > So your code is completely wrong.
> 
> Just to be clear, are you worrying about a process changing its
> personality after installing seccomp filters?

You didn't explained how things worked.

Are you assuming we network guys know everything ?

Just to make it very clear :

We are very dumb and you must explain us everything.

If process would not change personality, why do we have get_arch() at
all ? Why isn't it optimized outside of the JIT itself, in the generic
seccomp checker, its a single "A = K" instruction after all.

Why this part is even in the x86 BPF JIT ?

To me it looks like _if_ get_arch() is provided in BPF, its for a
reason, and your implementation looks very suspicious, if not buggy.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xi Wang April 26, 2013, 6:25 p.m. UTC | #13
Not sure how many you are speaking for when you say "We are very dumb".  :)

Thanks for catching this.  I'l remove this arch thing in v2.

To address your other concern about registers, I'll add some comments
to the code, something like:

"%rdi,%r8,%r9 are not used by seccomp filters; it's safe to not save them."

- xi

On Fri, Apr 26, 2013 at 12:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2013-04-26 at 12:02 -0400, Xi Wang wrote:
>> On Fri, Apr 26, 2013 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > 1) 'current' at the time the code is jitted (compiled) is not the
>> > 'current' at the time the filter will be evaluated.
>> >
>> > On x86_64, if CONFIG_IA32_EMULATION=y, syscall_get_arch() evaluates to :
>> >
>> > if (task_thread_info(task)->status & TS_COMPAT)
>> >         return AUDIT_ARCH_I386;
>> > return AUDIT_ARCH_X86_64;
>> >
>> > So your code is completely wrong.
>>
>> Just to be clear, are you worrying about a process changing its
>> personality after installing seccomp filters?
>
> You didn't explained how things worked.
>
> Are you assuming we network guys know everything ?
>
> Just to make it very clear :
>
> We are very dumb and you must explain us everything.
>
> If process would not change personality, why do we have get_arch() at
> all ? Why isn't it optimized outside of the JIT itself, in the generic
> seccomp checker, its a single "A = K" instruction after all.
>
> Why this part is even in the x86 BPF JIT ?
>
> To me it looks like _if_ get_arch() is provided in BPF, its for a
> reason, and your implementation looks very suspicious, if not buggy.
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet April 26, 2013, 6:40 p.m. UTC | #14
On Fri, 2013-04-26 at 14:25 -0400, Xi Wang wrote:
> Not sure how many you are speaking for when you say "We are very dumb".  :)
> 
> Thanks for catching this.  I'l remove this arch thing in v2.
> 
> To address your other concern about registers, I'll add some comments
> to the code, something like:
> 
> "%rdi,%r8,%r9 are not used by seccomp filters; it's safe to not save them."

OK good

BTW, most of us prefer Bottom posting on lkml/netdev

 http://en.wikipedia.org/wiki/Posting_style#Bottom-posting

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2013, 6:48 p.m. UTC | #15
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 26 Apr 2013 08:43:41 -0700

> You send patches and ask us to spend hours on it, this is not how it
> works.

+1
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f66b540..03c9c81 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -8,10 +8,11 @@ 
  * of the License.
  */
 #include <linux/moduleloader.h>
-#include <asm/cacheflush.h>
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/syscall.h>
 
 /*
  * Conventions :
@@ -144,7 +145,7 @@  static int pkt_type_offset(void)
 	return -1;
 }
 
-void bpf_jit_compile(struct sk_filter *fp)
+bpf_func_t bpf_jit_compile(struct sock_filter *filter, unsigned int flen)
 {
 	u8 temp[64];
 	u8 *prog;
@@ -157,15 +158,14 @@  void bpf_jit_compile(struct sk_filter *fp)
 	int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
 	unsigned int cleanup_addr; /* epilogue code offset */
 	unsigned int *addrs;
-	const struct sock_filter *filter = fp->insns;
-	int flen = fp->len;
+	bpf_func_t bpf_func = sk_run_filter;
 
 	if (!bpf_jit_enable)
-		return;
+		return bpf_func;
 
 	addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
 	if (addrs == NULL)
-		return;
+		return bpf_func;
 
 	/* Before first pass, make a rough estimation of addrs[]
 	 * each bpf instruction is translated to less than 64 bytes
@@ -684,6 +684,20 @@  cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 				}
 				EMIT_COND_JMP(f_op, f_offset);
 				break;
+#ifdef CONFIG_SECCOMP_FILTER
+			case BPF_S_ANC_SECCOMP_LD_W:
+				if (K == offsetof(struct seccomp_data, arch)) {
+					int arch = syscall_get_arch(current, NULL);
+
+					EMIT1_off32(0xb8, arch); /* mov arch,%eax */
+					break;
+				}
+				func = (u8 *)seccomp_bpf_load;
+				t_offset = func - (image + addrs[i]);
+				EMIT1_off32(0xbf, K); /* mov imm32,%edi */
+				EMIT1_off32(0xe8, t_offset); /* call seccomp_bpf_load */
+				break;
+#endif
 			default:
 				/* hmm, too complex filter, give up with jit compiler */
 				goto out;
@@ -694,7 +708,7 @@  cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
 					module_free(NULL, image);
-					return;
+					return bpf_func;
 				}
 				memcpy(image + proglen, temp, ilen);
 			}
@@ -731,11 +745,11 @@  cond_branch:			f_offset = addrs[i + filter[i].jf] - addrs[i];
 
 	if (image) {
 		bpf_flush_icache(image, image + proglen);
-		fp->bpf_func = (void *)image;
+		bpf_func = (void *)image;
 	}
 out:
 	kfree(addrs);
-	return;
+	return bpf_func;
 }
 
 static void jit_free_defer(struct work_struct *arg)
@@ -746,10 +760,10 @@  static void jit_free_defer(struct work_struct *arg)
 /* run from softirq, we must use a work_struct to call
  * module_free() from process context
  */
-void bpf_jit_free(struct sk_filter *fp)
+void bpf_jit_free(bpf_func_t bpf_func)
 {
-	if (fp->bpf_func != sk_run_filter) {
-		struct work_struct *work = (struct work_struct *)fp->bpf_func;
+	if (bpf_func != sk_run_filter) {
+		struct work_struct *work = (struct work_struct *)bpf_func;
 
 		INIT_WORK(work, jit_free_defer);
 		schedule_work(work);