[RFC,5/5] Add sample for adding simple drop program to link

Message ID 1459560118-5582-6-git-send-email-bblanco@plumgrid.com
State RFC, archived
Delegated to: David Miller

Commit Message

Brenden Blanco April 2, 2016, 1:21 a.m. UTC
Add a sample program that only drops packets at the
BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
observed single core rate is ~14.6Mpps.

Other tests were run as well; for instance, without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.

$ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
proto 17:   14597724 drops/s

./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 6486875(c6485849+d1026) usec, 23689465 (60byte,0frags)
  3651906pps 1752Mb/sec (1752914880bps) errors: 0
Device: eth4@1
Result: OK: 6486874(c6485656+d1217) usec, 23689489 (60byte,0frags)
  3651911pps 1752Mb/sec (1752917280bps) errors: 0
Device: eth4@2
Result: OK: 6486851(c6485730+d1120) usec, 23687853 (60byte,0frags)
  3651672pps 1752Mb/sec (1752802560bps) errors: 0
Device: eth4@3
Result: OK: 6486879(c6485807+d1071) usec, 23688954 (60byte,0frags)
  3651825pps 1752Mb/sec (1752876000bps) errors: 0

perf report --no-children:
  18.36%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_process_rx_cq
  15.98%  swapper        [kernel.vmlinux]  [k] poll_idle
  12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
   6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
   4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
   4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
   3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
   2.39%  ksoftirqd/1    [mdio]            [k] 0x00000000000074cd
   2.23%  swapper        [mlx4_en]         [k] mlx4_en_alloc_frags
   2.20%  ksoftirqd/1    [kernel.vmlinux]  [k] free_pages_prepare
   2.08%  ksoftirqd/1    [mlx4_en]         [k] mlx4_call_bpf
   1.57%  ksoftirqd/1    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
   1.35%  ksoftirqd/1    [mdio]            [k] 0x00000000000074fa
   1.09%  ksoftirqd/1    [kernel.vmlinux]  [k] free_one_page
   1.02%  ksoftirqd/1    [kernel.vmlinux]  [k] bpf_map_lookup_elem
   0.90%  ksoftirqd/1    [kernel.vmlinux]  [k] __alloc_pages_nodemask
   0.88%  swapper        [kernel.vmlinux]  [k] intel_idle
   0.82%  ksoftirqd/1    [mdio]            [k] 0x00000000000074be
   0.80%  swapper        [mlx4_en]         [k] mlx4_en_free_frag

machine specs:
 receiver - Intel E5-1630 v3 @ 3.70GHz
 sender - Intel E5645 @ 2.40GHz
 Mellanox ConnectX-3 @40G

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 samples/bpf/Makefile        |   4 ++
 samples/bpf/bpf_load.c      |   8 +++
 samples/bpf/netdrvx1_kern.c |  26 ++++++++
 samples/bpf/netdrvx1_user.c | 155 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

Comments

Jesper Dangaard Brouer April 6, 2016, 7:48 p.m. UTC | #1
I'm testing with this program and these patches, after getting past the
challenge of compiling the samples/bpf files ;-)


On Fri,  1 Apr 2016 18:21:58 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add a sample program that only drops packets at the
> BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
> observed single core rate is ~14.6Mpps.

On my i7-4790K CPU @ 4.00GHz I'm seeing 9.7Mpps (single flow/cpu).
(generator: pktgen_sample03_burst_single_flow.sh)

 # ./netdrvx1 $(</sys/class/net/mlx4p1/ifindex)
 sh: /sys/kernel/debug/tracing/kprobe_events: No such file or directory
 Success: Loaded file ./netdrvx1_kern.o
 proto 17:    9776320 drops/s

These numbers are quite impressive. For comparison: delivering to a local
socket that drops the packets gives 1.7Mpps, and dropping with iptables
in the "raw" table gives 3.7Mpps.

If I do multiple flows via ./pktgen_sample05_flow_per_thread.sh,
then I hit this strange 14.5Mpps limit (proto 17:   14505558 drops/s),
and the 4 RX CPUs no longer run at 100% in softirq; they have
some cycles attributed to %idle. (I verified the generator is sending at
24Mpps.)


> Other tests were run, for instance without the dropcnt increment or
> without reading from the packet header, the packet rate was mostly
> unchanged.

If I change the program to not touch packet data (don't call
load_byte()), then the performance increases to 14.6Mpps (single
flow/cpu), and the RX CPU is mostly idle... mlx4_en_process_rx_cq()
and the page alloc/free functions take the remaining time.
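
For reference, that drop-only variant is essentially netdrvx1_kern.c from
this patch with the dropcnt map update and the load_byte() call removed;
a minimal sketch follows (illustration only, not part of the series, with
the section name, context type and return value copied from the sample):

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("phys_dev1")
  int bpf_drop_all(struct xdp_metadata *ctx)
  {
          /* never read packet data, never update a map; just drop */
          return 1; /* same return value the sample uses to drop */
  }

  char _license[] SEC("license") = "GPL";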

> $ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
> proto 17:   14597724 drops/s
> 
> ./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
> Running... ctrl^C to stop
> Device: eth4@0
> Result: OK: 6486875(c6485849+d1026) usec, 23689465 (60byte,0frags)
>   3651906pps 1752Mb/sec (1752914880bps) errors: 0
> Device: eth4@1
> Result: OK: 6486874(c6485656+d1217) usec, 23689489 (60byte,0frags)
>   3651911pps 1752Mb/sec (1752917280bps) errors: 0
> Device: eth4@2
> Result: OK: 6486851(c6485730+d1120) usec, 23687853 (60byte,0frags)
>   3651672pps 1752Mb/sec (1752802560bps) errors: 0
> Device: eth4@3
> Result: OK: 6486879(c6485807+d1071) usec, 23688954 (60byte,0frags)
>   3651825pps 1752Mb/sec (1752876000bps) errors: 0
> 
> perf report --no-children:
>   18.36%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_process_rx_cq
>   15.98%  swapper        [kernel.vmlinux]  [k] poll_idle
>   12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
>    6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
>    4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
>    4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
>    3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
>    2.39%  ksoftirqd/1    [mdio]            [k] 0x00000000000074cd
>    2.23%  swapper        [mlx4_en]         [k] mlx4_en_alloc_frags
>    2.20%  ksoftirqd/1    [kernel.vmlinux]  [k] free_pages_prepare
>    2.08%  ksoftirqd/1    [mlx4_en]         [k] mlx4_call_bpf
>    1.57%  ksoftirqd/1    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
>    1.35%  ksoftirqd/1    [mdio]            [k] 0x00000000000074fa
>    1.09%  ksoftirqd/1    [kernel.vmlinux]  [k] free_one_page
>    1.02%  ksoftirqd/1    [kernel.vmlinux]  [k] bpf_map_lookup_elem
>    0.90%  ksoftirqd/1    [kernel.vmlinux]  [k] __alloc_pages_nodemask
>    0.88%  swapper        [kernel.vmlinux]  [k] intel_idle
>    0.82%  ksoftirqd/1    [mdio]            [k] 0x00000000000074be
>    0.80%  swapper        [mlx4_en]         [k] mlx4_en_free_frag

My picture (single flow/cpu) looks a little bit different:

 +   64.33%  ksoftirqd/7    [kernel.vmlinux]  [k] __bpf_prog_run
 +    9.60%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_alloc_frags
 +    7.71%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_process_rx_cq
 +    5.47%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_free_frag
 +    1.68%  ksoftirqd/7    [kernel.vmlinux]  [k] get_page_from_freelist
 +    1.52%  ksoftirqd/7    [mlx4_en]         [k] mlx4_call_bpf
 +    1.02%  ksoftirqd/7    [kernel.vmlinux]  [k] free_pages_prepare
 +    0.72%  ksoftirqd/7    [mlx4_en]         [k] mlx4_alloc_pages.isra.20
 +    0.70%  ksoftirqd/7    [kernel.vmlinux]  [k] __rcu_read_unlock
 +    0.65%  ksoftirqd/7    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem

On my i7-4790K CPU I don't have DDIO, so I assume this high cost in
__bpf_prog_run is due to a cache miss on the packet data.

> machine specs:
>  receiver - Intel E5-1630 v3 @ 3.70GHz
>  sender - Intel E5645 @ 2.40GHz
>  Mellanox ConnectX-3 @40G

Jesper Dangaard Brouer April 6, 2016, 8:01 p.m. UTC | #2
On Wed, 6 Apr 2016 21:48:48 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> I'm testing with this program and these patches, after getting past the
> challenge of compiling the samples/bpf files ;-)
> 
> 
> On Fri,  1 Apr 2016 18:21:58 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:
> 
> > Add a sample program that only drops packets at the
> > BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
> > observed single core rate is ~14.6Mpps.  
> 
> On my i7-4790K CPU @ 4.00GHz I'm seeing 9.7Mpps (single flow/cpu).
> (generator: pktgen_sample03_burst_single_flow.sh)
> 
>  # ./netdrvx1 $(</sys/class/net/mlx4p1/ifindex)
>  sh: /sys/kernel/debug/tracing/kprobe_events: No such file or directory
>  Success: Loaded file ./netdrvx1_kern.o
>  proto 17:    9776320 drops/s
> 
> These numbers are quite impressive. Compared to: sending it to local
> socket that drop packets 1.7Mpps. Compared to: dropping with iptables
> in "raw" table 3.7Mpps.
> 
> If I do multiple flows, via ./pktgen_sample05_flow_per_thread.sh
> then I hit this strange 14.5Mpps limit (proto 17:   14505558 drops/s).
> And the RX 4x CPUs are starting to NOT use 100% in softirq, they have
> some cycles attributed to %idle. (I verified generator is sending at
> 24Mpps).
> 
> 
> > Other tests were run, for instance without the dropcnt increment or
> > without reading from the packet header, the packet rate was mostly
> > unchanged.  
> 
> If I change the program to not touch packet data (don't call
> load_byte()) then the performance increase to 14.6Mpps (single
> flow/cpu).  And the RX CPU is mostly idle... mlx4_en_process_rx_cq()
> and page alloc/free functions taking the time.
> 
> > $ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
> > proto 17:   14597724 drops/s
> > 
> > ./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
> > Running... ctrl^C to stop
> > Device: eth4@0
> > Result: OK: 6486875(c6485849+d1026) usec, 23689465 (60byte,0frags)
> >   3651906pps 1752Mb/sec (1752914880bps) errors: 0
> > Device: eth4@1
> > Result: OK: 6486874(c6485656+d1217) usec, 23689489 (60byte,0frags)
> >   3651911pps 1752Mb/sec (1752917280bps) errors: 0
> > Device: eth4@2
> > Result: OK: 6486851(c6485730+d1120) usec, 23687853 (60byte,0frags)
> >   3651672pps 1752Mb/sec (1752802560bps) errors: 0
> > Device: eth4@3
> > Result: OK: 6486879(c6485807+d1071) usec, 23688954 (60byte,0frags)
> >   3651825pps 1752Mb/sec (1752876000bps) errors: 0
> > 
> > perf report --no-children:
> >   18.36%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_process_rx_cq
> >   15.98%  swapper        [kernel.vmlinux]  [k] poll_idle
> >   12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
> >    6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
> >    4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
> >    4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
> >    3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
> >    2.39%  ksoftirqd/1    [mdio]            [k] 0x00000000000074cd
> >    2.23%  swapper        [mlx4_en]         [k] mlx4_en_alloc_frags
> >    2.20%  ksoftirqd/1    [kernel.vmlinux]  [k] free_pages_prepare
> >    2.08%  ksoftirqd/1    [mlx4_en]         [k] mlx4_call_bpf
> >    1.57%  ksoftirqd/1    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
> >    1.35%  ksoftirqd/1    [mdio]            [k] 0x00000000000074fa
> >    1.09%  ksoftirqd/1    [kernel.vmlinux]  [k] free_one_page
> >    1.02%  ksoftirqd/1    [kernel.vmlinux]  [k] bpf_map_lookup_elem
> >    0.90%  ksoftirqd/1    [kernel.vmlinux]  [k] __alloc_pages_nodemask
> >    0.88%  swapper        [kernel.vmlinux]  [k] intel_idle
> >    0.82%  ksoftirqd/1    [mdio]            [k] 0x00000000000074be
> >    0.80%  swapper        [mlx4_en]         [k] mlx4_en_free_frag  
> 
> My picture (single flow/cpu) looks a little bit different:
> 
>  +   64.33%  ksoftirqd/7    [kernel.vmlinux]  [k] __bpf_prog_run
>  +    9.60%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_alloc_frags
>  +    7.71%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_process_rx_cq
>  +    5.47%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_free_frag
>  +    1.68%  ksoftirqd/7    [kernel.vmlinux]  [k] get_page_from_freelist
>  +    1.52%  ksoftirqd/7    [mlx4_en]         [k] mlx4_call_bpf
>  +    1.02%  ksoftirqd/7    [kernel.vmlinux]  [k] free_pages_prepare
>  +    0.72%  ksoftirqd/7    [mlx4_en]         [k] mlx4_alloc_pages.isra.20
>  +    0.70%  ksoftirqd/7    [kernel.vmlinux]  [k] __rcu_read_unlock
>  +    0.65%  ksoftirqd/7    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
> 
> On my i7-4790K CPU, I don't have DDIO, thus I assume this high cost in
> __bpf_prog_run is due to a cache-miss on the packet data.

Before someone else points out the obvious... I forgot to enable JIT.
Enable it:

 # echo 1 > /proc/sys/net/core/bpf_jit_enable

Performance increased to: 10.8Mpps (proto 17:   10819446 drops/s)

 Samples: 51K of event 'cycles', Event count (approx.): 56775706510
   Overhead  Command      Shared Object     Symbol
 +   55.90%  ksoftirqd/7  [kernel.vmlinux]  [k] sk_load_byte_positive_offset
 +   10.71%  ksoftirqd/7  [mlx4_en]         [k] mlx4_en_alloc_frags
 +    8.26%  ksoftirqd/7  [mlx4_en]         [k] mlx4_en_process_rx_cq
 +    5.94%  ksoftirqd/7  [mlx4_en]         [k] mlx4_en_free_frag
 +    2.04%  ksoftirqd/7  [kernel.vmlinux]  [k] get_page_from_freelist
 +    2.03%  ksoftirqd/7  [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
 +    1.42%  ksoftirqd/7  [mlx4_en]         [k] mlx4_call_bpf
 +    1.04%  ksoftirqd/7  [kernel.vmlinux]  [k] free_pages_prepare
 +    1.03%  ksoftirqd/7  [kernel.vmlinux]  [k] __rcu_read_unlock
 +    0.97%  ksoftirqd/7  [mlx4_en]         [k] mlx4_alloc_pages.isra.20
 +    0.95%  ksoftirqd/7  [devlink]         [k] 0x0000000000005f87
 +    0.58%  ksoftirqd/7  [devlink]         [k] 0x0000000000005f8f
 +    0.49%  ksoftirqd/7  [kernel.vmlinux]  [k] __free_pages_ok
 +    0.47%  ksoftirqd/7  [kernel.vmlinux]  [k] __rcu_read_lock
 +    0.46%  ksoftirqd/7  [kernel.vmlinux]  [k] free_one_page
 +    0.38%  ksoftirqd/7  [kernel.vmlinux]  [k] net_rx_action
 +    0.36%  ksoftirqd/7  [kernel.vmlinux]  [k] bpf_map_lookup_elem
 +    0.36%  ksoftirqd/7  [kernel.vmlinux]  [k] __mod_zone_page_state
 +    0.34%  ksoftirqd/7  [kernel.vmlinux]  [k] __alloc_pages_nodemask
 +    0.32%  ksoftirqd/7  [kernel.vmlinux]  [k] _raw_spin_lock
 +    0.31%  ksoftirqd/7  [devlink]         [k] 0x0000000000005f0a
 +    0.29%  ksoftirqd/7  [kernel.vmlinux]  [k] next_zones_zonelist

This is very likely a cache miss in sk_load_byte_positive_offset().

Daniel Borkmann April 6, 2016, 8:03 p.m. UTC | #3
On 04/06/2016 09:48 PM, Jesper Dangaard Brouer wrote:
>
> I'm testing with this program and these patches, after getting past the
> challenge of compiling the samples/bpf files ;-)
>
> On Fri,  1 Apr 2016 18:21:58 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:
>
>> Add a sample program that only drops packets at the
>> BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
>> observed single core rate is ~14.6Mpps.
>
> On my i7-4790K CPU @ 4.00GHz I'm seeing 9.7Mpps (single flow/cpu).
> (generator: pktgen_sample03_burst_single_flow.sh)
>
>   # ./netdrvx1 $(</sys/class/net/mlx4p1/ifindex)
>   sh: /sys/kernel/debug/tracing/kprobe_events: No such file or directory
>   Success: Loaded file ./netdrvx1_kern.o
>   proto 17:    9776320 drops/s
>
> These numbers are quite impressive. Compared to: sending it to local
> socket that drop packets 1.7Mpps. Compared to: dropping with iptables
> in "raw" table 3.7Mpps.
>
> If I do multiple flows, via ./pktgen_sample05_flow_per_thread.sh
> then I hit this strange 14.5Mpps limit (proto 17:   14505558 drops/s).
> And the RX 4x CPUs are starting to NOT use 100% in softirq, they have
> some cycles attributed to %idle. (I verified generator is sending at
> 24Mpps).
>
>> Other tests were run, for instance without the dropcnt increment or
>> without reading from the packet header, the packet rate was mostly
>> unchanged.
>
> If I change the program to not touch packet data (don't call
> load_byte()) then the performance increase to 14.6Mpps (single
> flow/cpu).  And the RX CPU is mostly idle... mlx4_en_process_rx_cq()
> and page alloc/free functions taking the time.
>
>> $ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
>> proto 17:   14597724 drops/s
>>
>> ./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
>> Running... ctrl^C to stop
>> Device: eth4@0
>> Result: OK: 6486875(c6485849+d1026) usec, 23689465 (60byte,0frags)
>>    3651906pps 1752Mb/sec (1752914880bps) errors: 0
>> Device: eth4@1
>> Result: OK: 6486874(c6485656+d1217) usec, 23689489 (60byte,0frags)
>>    3651911pps 1752Mb/sec (1752917280bps) errors: 0
>> Device: eth4@2
>> Result: OK: 6486851(c6485730+d1120) usec, 23687853 (60byte,0frags)
>>    3651672pps 1752Mb/sec (1752802560bps) errors: 0
>> Device: eth4@3
>> Result: OK: 6486879(c6485807+d1071) usec, 23688954 (60byte,0frags)
>>    3651825pps 1752Mb/sec (1752876000bps) errors: 0
>>
>> perf report --no-children:
>>    18.36%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_process_rx_cq
>>    15.98%  swapper        [kernel.vmlinux]  [k] poll_idle
>>    12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
>>     6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
>>     4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
>>     4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
>>     3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
>>     2.39%  ksoftirqd/1    [mdio]            [k] 0x00000000000074cd
>>     2.23%  swapper        [mlx4_en]         [k] mlx4_en_alloc_frags
>>     2.20%  ksoftirqd/1    [kernel.vmlinux]  [k] free_pages_prepare
>>     2.08%  ksoftirqd/1    [mlx4_en]         [k] mlx4_call_bpf
>>     1.57%  ksoftirqd/1    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
>>     1.35%  ksoftirqd/1    [mdio]            [k] 0x00000000000074fa
>>     1.09%  ksoftirqd/1    [kernel.vmlinux]  [k] free_one_page
>>     1.02%  ksoftirqd/1    [kernel.vmlinux]  [k] bpf_map_lookup_elem
>>     0.90%  ksoftirqd/1    [kernel.vmlinux]  [k] __alloc_pages_nodemask
>>     0.88%  swapper        [kernel.vmlinux]  [k] intel_idle
>>     0.82%  ksoftirqd/1    [mdio]            [k] 0x00000000000074be
>>     0.80%  swapper        [mlx4_en]         [k] mlx4_en_free_frag
>
> My picture (single flow/cpu) looks a little bit different:
>
>   +   64.33%  ksoftirqd/7    [kernel.vmlinux]  [k] __bpf_prog_run

Looks like 'echo 1 > /proc/sys/net/core/bpf_jit_enable' is missing?

>   +    9.60%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_alloc_frags
>   +    7.71%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_process_rx_cq
>   +    5.47%  ksoftirqd/7    [mlx4_en]         [k] mlx4_en_free_frag
>   +    1.68%  ksoftirqd/7    [kernel.vmlinux]  [k] get_page_from_freelist
>   +    1.52%  ksoftirqd/7    [mlx4_en]         [k] mlx4_call_bpf
>   +    1.02%  ksoftirqd/7    [kernel.vmlinux]  [k] free_pages_prepare
>   +    0.72%  ksoftirqd/7    [mlx4_en]         [k] mlx4_alloc_pages.isra.20
>   +    0.70%  ksoftirqd/7    [kernel.vmlinux]  [k] __rcu_read_unlock
>   +    0.65%  ksoftirqd/7    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
>
> On my i7-4790K CPU, I don't have DDIO, thus I assume this high cost in
> __bpf_prog_run is due to a cache-miss on the packet data.
>
>> machine specs:
>>   receiver - Intel E5-1630 v3 @ 3.70GHz
>>   sender - Intel E5645 @ 2.40GHz
>>   Mellanox ConnectX-3 @40G
>

Alexei Starovoitov April 6, 2016, 11:11 p.m. UTC | #4
On Wed, Apr 06, 2016 at 10:01:00PM +0200, Jesper Dangaard Brouer wrote:
> On Wed, 6 Apr 2016 21:48:48 +0200
> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > If I do multiple flows, via ./pktgen_sample05_flow_per_thread.sh
> > then I hit this strange 14.5Mpps limit (proto 17:   14505558 drops/s).
> > And the RX 4x CPUs are starting to NOT use 100% in softirq, they have
> > some cycles attributed to %idle. (I verified generator is sending at
> > 24Mpps).
...
> > If I change the program to not touch packet data (don't call
> > load_byte()) then the performance increase to 14.6Mpps (single
> > flow/cpu).  And the RX CPU is mostly idle... mlx4_en_process_rx_cq()
> > and page alloc/free functions taking the time.

Please try it with the module param log_num_mgm_entry_size=-1.
It should get to 20Mpps when BPF doesn't touch the packet.

> Before someone else point out the obvious... I forgot to enable JIT.
> Enable it::
> 
>  # echo 1 > /proc/sys/net/core/bpf_jit_enable
> 
> Performance increased to: 10.8Mpps (proto 17:   10819446 drops/s)
> 
>  Samples: 51K of event 'cycles', Event count (approx.): 56775706510
>    Overhead  Command      Shared Object     Symbol
>  +   55.90%  ksoftirqd/7  [kernel.vmlinux]  [k] sk_load_byte_positive_offset
>  +   10.71%  ksoftirqd/7  [mlx4_en]         [k] mlx4_en_alloc_frags
... 
> It is a very likely cache-miss in sk_load_byte_positive_offset().

Yes, likely due to the missing DDIO, as you said.

Patch

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 502c9fc..ad36bb8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -19,6 +19,7 @@  hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
+hostprogs-y += netdrvx1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -38,6 +39,7 @@  lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
+netdrvx1-objs := bpf_load.o libbpf.o netdrvx1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -56,6 +58,7 @@  always += lathist_kern.o
 always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
+always += netdrvx1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -75,6 +78,7 @@  HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
+HOSTLOADLIBES_netdrvx1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 58f86bd..9308fbc 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -49,6 +49,7 @@  static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_socket = strncmp(event, "socket", 6) == 0;
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	bool is_phys_dev = strncmp(event, "phys_dev", 8) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -63,6 +64,8 @@  static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 	} else if (is_kprobe || is_kretprobe) {
 		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else if (is_phys_dev) {
+		prog_type = BPF_PROG_TYPE_PHYS_DEV;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -76,6 +79,9 @@  static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_phys_dev)
+		return 0;
+
 	if (is_socket) {
 		event += 6;
 		if (*event != '/')
@@ -304,6 +310,7 @@  int load_bpf_file(char *path)
 
 			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
 			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
+			    memcmp(shname_prog, "phys_dev", 8) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -320,6 +327,7 @@  int load_bpf_file(char *path)
 
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
+		    memcmp(shname, "phys_dev", 8) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
diff --git a/samples/bpf/netdrvx1_kern.c b/samples/bpf/netdrvx1_kern.c
new file mode 100644
index 0000000..9837d8a
--- /dev/null
+++ b/samples/bpf/netdrvx1_kern.c
@@ -0,0 +1,26 @@ 
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 256,
+};
+
+SEC("phys_dev1")
+int bpf_prog1(struct xdp_metadata *ctx)
+{
+	int index = load_byte(ctx, ETH_HLEN + offsetof(struct iphdr, protocol));
+	long *value;
+
+	value = bpf_map_lookup_elem(&dropcnt, &index);
+	if (value)
+		*value += 1;
+
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/netdrvx1_user.c b/samples/bpf/netdrvx1_user.c
new file mode 100644
index 0000000..9e6ec9a
--- /dev/null
+++ b/samples/bpf/netdrvx1_user.c
@@ -0,0 +1,155 @@ 
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+
+static int set_link_bpf_fd(int ifindex, int fd)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct rtattr *rta;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		printf("open netlink socket: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		printf("bind to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+	rta = (struct rtattr *)(((char *) &req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	rta->rta_type = 42/*IFLA_BPF_FD*/;
+	rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
+	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len)
+		+ RTA_LENGTH(sizeof(fd));
+	memcpy(RTA_DATA(rta), &fd, sizeof(fd));
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		printf("send to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		printf("recv from netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != getpid()) {
+			printf("Wrong pid %d, expected %d\n",
+			       nh->nlmsg_pid, getpid());
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			printf("Wrong seq %d, expected %d\n",
+			       nh->nlmsg_seq, seq);
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			printf("nlmsg error %s\n", strerror(-err->error));
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(int secs)
+{
+	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	__u64 values[nr_cpus];
+	__u32 key;
+	int i;
+
+	sleep(secs);
+
+	for (key = 0; key < 256; key++) {
+		__u64 sum = 0;
+
+		assert(bpf_lookup_elem(map_fd[0], &key, values) == 0);
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+		if (sum)
+			printf("proto %u: %10llu drops/s\n", key, sum/secs);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int ifindex;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (ac != 2) {
+		printf("usage: %s IFINDEX\n", argv[0]);
+		return 1;
+	}
+
+	ifindex = strtoul(argv[1], NULL, 0);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	if (set_link_bpf_fd(ifindex, prog_fd[0]) < 0) {
+		printf("link set bpf fd failed\n");
+		return 1;
+	}
+
+	poll_stats(5);
+
+	set_link_bpf_fd(ifindex, -1);
+
+	return 0;
+}