diff mbox series

[net,v4,01/12] net: core: limit nested device depth

Message ID 20190928164843.31800-2-ap420073@gmail.com
State Changes Requested
Delegated to: David Miller
Headers show
Series net: fix nested device bugs | expand

Commit Message

Taehee Yoo Sept. 28, 2019, 4:48 p.m. UTC
Current code doesn't limit the number of nested devices.
Nested devices would be handled recursively and this needs huge stack
memory. So, unlimited nested devices could make stack overflow.

This patch adds upper_level and lower_level, they are common variables
and represent maximum lower/upper depth.
When upper/lower device is attached or dettached,
{lower/upper}_level are updated. and if maximum depth is bigger than 8,
attach routine fails and returns -EMLINK.

In addition, this patch converts recursive routine of
netdev_walk_all_{lower/upper} to iterator routine.

Test commands:
    ip link add dummy0 type dummy
    ip link add link dummy0 name vlan1 type vlan id 1
    ip link set vlan1 up

    for i in {2..200}
    do
	    let A=$i-1

	    ip link add vlan$i link vlan$A type vlan id $i
    done
    ip link del vlan1

Splat looks like:
[  923.102992] Thread overran stack, or stack corrupted
[  923.103471] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[  923.104086] CPU: 0 PID: 1597 Comm: ip Not tainted 5.3.0+ #3
[  923.104771] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[  923.108837] RIP: 0010:stack_depot_fetch+0x10/0x30
[  923.109470] Code: 00 75 10 48 8b 73 18 48 89 ef 5b 5d e9 79 b1 83 ff 0f 0b e8 92 96 97 ff eb e9 89 f8 c1 ef 11 25 ff 0
[  923.111775] RSP: 0018:ffff8880541ceb78 EFLAGS: 00010006
[  923.112452] RAX: 00000000001fffff RBX: ffff8880541cee88 RCX: 0000000000000000
[  923.113399] RDX: 000000000000001d RSI: ffff8880541ceb80 RDI: 0000000000003ff0
[  923.114284] RBP: ffffea0001507380 R08: ffffed100d8fdf23 R09: ffffed100d8fdf23
[  923.115183] R10: 0000000000000001 R11: ffffed100d8fdf22 R12: ffff88806c240880
[  923.115986] R13: ffff8880541cec98 R14: ffff8880541cee88 R15: ffff8880541ced20
[  923.120477] FS:  00007ff38ab4f0c0(0000) GS:ffff88806c600000(0000) knlGS:0000000000000000
[  923.121486] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  923.122451] CR2: ffffffffa5be5658 CR3: 0000000053532004 CR4: 00000000000606f0
[  923.123303] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  923.128422] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  923.129399] Call Trace:
[  923.129710] Modules linked in: 8021q dummy ip_tables x_tables
[  923.130518] CR2: ffffffffa5be5658
[  923.130909] ---[ end trace 9568b7d36ab26094 ]---
[  923.131457] RIP: 0010:stack_depot_fetch+0x10/0x30
[  923.132006] Code: 00 75 10 48 8b 73 18 48 89 ef 5b 5d e9 79 b1 83 ff 0f 0b e8 92 96 97 ff eb e9 89 f8 c1 ef 11 25 ff 0
[  923.134219] RSP: 0018:ffff8880541ceb78 EFLAGS: 00010006
[  923.134834] RAX: 00000000001fffff RBX: ffff8880541cee88 RCX: 0000000000000000
[  923.135664] RDX: 000000000000001d RSI: ffff8880541ceb80 RDI: 0000000000003ff0
[  923.136514] RBP: ffffea0001507380 R08: ffffed100d8fdf23 R09: ffffed100d8fdf23
[  923.137276] R10: 0000000000000001 R11: ffffed100d8fdf22 R12: ffff88806c240880
[  923.138025] R13: ffff8880541cec98 R14: ffff8880541cee88 R15: ffff8880541ced20
[  923.138773] FS:  00007ff38ab4f0c0(0000) GS:ffff88806c600000(0000) knlGS:0000000000000000
[  923.140099] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  923.140763] CR2: ffffffffa5be5658 CR3: 0000000053532004 CR4: 00000000000606f0
[  923.141539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  923.144930] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  923.145942] Kernel panic - not syncing: Fatal exception

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---

v3 -> v4 :
 - This patch is not changed
v2 -> v3 :
 - Modify nesting infra code to use iterator instead of recursive
 v1 -> v2 :
  - This patch is not changed

 include/linux/netdevice.h |   4 +
 net/core/dev.c            | 286 ++++++++++++++++++++++++++++++++------
 2 files changed, 245 insertions(+), 45 deletions(-)

Comments

Johannes Berg Sept. 28, 2019, 7:36 p.m. UTC | #1
Hi,

>  int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
>  				  int (*fn)(struct net_device *dev,
>  					    void *data),
>  				  void *data)
>  {
[...]
>  	}
>  
>  	return 0;
> +
>  }

that seems like an oversight, probably from editing the patch in
different versions?

> +static int __netdev_update_upper_level(struct net_device *dev, void *data)
> +{
> +	dev->upper_level = __netdev_upper_depth(dev) + 1;
> +	return 0;
> +}
> +
> +static int __netdev_update_lower_level(struct net_device *dev, void *data)
> +{
> +	dev->lower_level = __netdev_lower_depth(dev) + 1;
> +	return 0;
> +}

Is there any point in the return value here? You don't really use it,
afaict? I guess I might see the point if it was used for tail-call
optimisation or such?


Also, I dunno, I guess netdevs aren't as much under pressure as SKBs :-)
but do we actually gain much from storing the nesting level at all? You
have to maintain it all the time anyway when adding/removing and that's
the only place where you also check it, so perhaps it wouldn't be that
bad to just count at that time?

But then again the counting would probably be recursive again ...

>  	return 0;
> +
>  }
>  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);

same nit as above
 
> +	__netdev_update_upper_level(dev, NULL);
> +	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
> +
> +	__netdev_update_lower_level(upper_dev, NULL);
> +	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);

Actually, if I'm reading this correctly you already walk all the levels
anyway? Then couldn't you calculate the depth at this time and return
it, instead of storing it? Though, if it actually overflowed then you'd
have to walk *again* to undo that?

Hmm, actually, if you don't store the value you don't even need to walk
here I guess, or at least you would only have to do it to verify you
*can* attach, but wouldn't have to in detach?

So it looks to me like on attach (i.e. this code, quoted from
__netdev_upper_dev_link) you're already walking the entire graph to
update the level values, and could probably instead calculate the
nesting depth to validate it?
And then on netdev_upper_dev_unlink() you wouldn't even have to walk the
graph at all, since you only need that to update the values that you
stored.

But maybe I'm misinterpreting this completely?

Thanks,
johannes
Taehee Yoo Sept. 29, 2019, 11:05 a.m. UTC | #2
On Sun, 29 Sep 2019 at 04:36, Johannes Berg <johannes@sipsolutions.net> wrote:
>
> Hi,
>
> >  int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
> >                                 int (*fn)(struct net_device *dev,
> >                                           void *data),
> >                                 void *data)
> >  {
> [...]
> >       }
> >
> >       return 0;
> > +
> >  }
>
> that seems like an oversight, probably from editing the patch in
> different versions?
>

I will fix this in a v5 patch.

> > +static int __netdev_update_upper_level(struct net_device *dev, void *data)
> > +{
> > +     dev->upper_level = __netdev_upper_depth(dev) + 1;
> > +     return 0;
> > +}
> > +
> > +static int __netdev_update_lower_level(struct net_device *dev, void *data)
> > +{
> > +     dev->lower_level = __netdev_lower_depth(dev) + 1;
> > +     return 0;
> > +}
>
> Is there any point in the return value here? You don't really use it,
> afaict? I guess I might see the point if it was used for tail-call
> optimisation or such?
>

These functions are used as a callback function of
netdev_walk_all_{upper/lower}_dev(). So these return types are needed.

>
> Also, I dunno, I guess netdevs aren't as much under pressure as SKBs :-)
> but do we actually gain much from storing the nesting level at all? You
> have to maintain it all the time anyway when adding/removing and that's
> the only place where you also check it, so perhaps it wouldn't be that
> bad to just count at that time?
>
> But then again the counting would probably be recursive again ...
>
> >       return 0;
> > +
> >  }
> >  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
>
> same nit as above
>

I will fix this in a v5 patch too.

> > +     __netdev_update_upper_level(dev, NULL);
> > +     netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
> > +
> > +     __netdev_update_lower_level(upper_dev, NULL);
> > +     netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
>
> Actually, if I'm reading this correctly you already walk all the levels
> anyway? Then couldn't you calculate the depth at this time and return
> it, instead of storing it? Though, if it actually overflowed then you'd
> have to walk *again* to undo that?
>
> Hmm, actually, if you don't store the value you don't even need to walk
> here I guess, or at least you would only have to do it to verify you
> *can* attach, but wouldn't have to in detach?
>
> So it looks to me like on attach (i.e. this code, quoted from
> __netdev_upper_dev_link) you're already walking the entire graph to
> update the level values, and could probably instead calculate the
> nesting depth to validate it?
> And then on netdev_upper_dev_unlink() you wouldn't even have to walk the
> graph at all, since you only need that to update the values that you
> stored.
>
> But maybe I'm misinterpreting this completely?
>

Without storing level storing, a walking graph routine is needed only
once. The routine would work as a nesting depth validator.
So that the detach routine doesn't need to walk the graph.
Whereas, in this patch, both attach and detach routine need to
walk graph. So, storing nesting variable way is slower than without
storing nesting variable way because of the detach routine's updating
upper and lower level routine.

But I'm sure that storing nesting variables is useful because other
modules already using nesting level values.
Please look at vlan_get_encap_level() and usecases.
If we don't provide nesting level variables, they should calculate
every time when they need it and this way is easier way to get a
nesting level. There are use-cases of lower_level variable
in the 11th patch.

Thank you

> Thanks,
> johannes
>
>
Johannes Berg Oct. 1, 2019, 7:11 a.m. UTC | #3
Hi,

Sorry for the delay.

> These functions are used as a callback function of
> netdev_walk_all_{upper/lower}_dev(). So these return types are needed.

Ah yes, I missed that, sorry.

> Without storing level storing, a walking graph routine is needed only
> once. The routine would work as a nesting depth validator.
> So that the detach routine doesn't need to walk the graph.
> Whereas, in this patch, both attach and detach routine need to
> walk graph. So, storing nesting variable way is slower than without
> storing nesting variable way because of the detach routine's updating
> upper and lower level routine.

Right, that's what I thought.

> But I'm sure that storing nesting variables is useful because other
> modules already using nesting level values.
> Please look at vlan_get_encap_level() and usecases.

Indeed, I noticed that later.

> If we don't provide nesting level variables, they should calculate
> every time when they need it and this way is easier way to get a
> nesting level. There are use-cases of lower_level variable
> in the 11th patch.

Yes, makes sense, agree. One could argue that you only ever need the
"lower_level" stored, not the "upper_level", but I guess that doesn't
really make a difference.

Placing these in a better position in the struct might make sense - a
cursory look suggested that they weren't filling any of the many holes
there, did you pay attention to that or was the placement more or less
random?

johannes
Taehee Yoo Oct. 1, 2019, 1:53 p.m. UTC | #4
On Tue, 1 Oct 2019 at 16:11, Johannes Berg <johannes@sipsolutions.net> wrote:
>
> Hi,
>

Hi!

> Sorry for the delay.
>
> > These functions are used as a callback function of
> > netdev_walk_all_{upper/lower}_dev(). So these return types are needed.
>
> Ah yes, I missed that, sorry.
>
> > Without storing level storing, a walking graph routine is needed only
> > once. The routine would work as a nesting depth validator.
> > So that the detach routine doesn't need to walk the graph.
> > Whereas, in this patch, both attach and detach routine need to
> > walk graph. So, storing nesting variable way is slower than without
> > storing nesting variable way because of the detach routine's updating
> > upper and lower level routine.
>
> Right, that's what I thought.
>
> > But I'm sure that storing nesting variables is useful because other
> > modules already using nesting level values.
> > Please look at vlan_get_encap_level() and usecases.
>
> Indeed, I noticed that later.
>
> > If we don't provide nesting level variables, they should calculate
> > every time when they need it and this way is easier way to get a
> > nesting level. There are use-cases of lower_level variable
> > in the 11th patch.
>
> Yes, makes sense, agree. One could argue that you only ever need the
> "lower_level" stored, not the "upper_level", but I guess that doesn't
> really make a difference.
>
> Placing these in a better position in the struct might make sense - a
> cursory look suggested that they weren't filling any of the many holes
> there, did you pay attention to that or was the placement more or less
> random?
>

If I understand correctly, you said about the alignment of
"lower_level" and "upper_level".
I thought this place is a fine position for variables as regards the
alignment and I didn't try to put each variable in different places.

If I misunderstood your mention, please let me know.

Thank you

> johannes
>
Johannes Berg Oct. 1, 2019, 1:57 p.m. UTC | #5
Hi,

(jumping out now, forgive me for being so brief)

> If I understand correctly, you said about the alignment of
> "lower_level" and "upper_level".
> I thought this place is a fine position for variables as regards the
> alignment and I didn't try to put each variable in different places.
> 
> If I misunderstood your mention, please let me know.

Not sure what you mean, alignment doesn't matter for them (they're u8).

I was thinking of the packing for the overall struct, we have:

        unsigned int            max_mtu;
        unsigned short          type;
        unsigned short          hard_header_len;
        unsigned char           min_header_len;

+	unsigned char		upper_level, lower_level;

        unsigned short          needed_headroom;
        unsigned short          needed_tailroom;


Previously, there was a one byte hole at that spot due to a single
"unsigned char" (after something aligned at least 4 bytes) followed by
"unsigned short" - now you push that out a bit.

If you place the variables a bit lower, below "name_assign_type", you
probably fill a hole instead.

Check out the 'pahole' tool.

johannes
Taehee Yoo Oct. 1, 2019, 6:23 p.m. UTC | #6
On Tue, 1 Oct 2019 at 22:57, Johannes Berg <johannes@sipsolutions.net> wrote:
>
> Hi,
>

Hi!

> (jumping out now, forgive me for being so brief)
>
> > If I understand correctly, you said about the alignment of
> > "lower_level" and "upper_level".
> > I thought this place is a fine position for variables as regards the
> > alignment and I didn't try to put each variable in different places.
> >
> > If I misunderstood your mention, please let me know.
>
> Not sure what you mean, alignment doesn't matter for them (they're u8).
>
> I was thinking of the packing for the overall struct, we have:
>
>         unsigned int            max_mtu;
>         unsigned short          type;
>         unsigned short          hard_header_len;
>         unsigned char           min_header_len;
>
> +       unsigned char           upper_level, lower_level;
>
>         unsigned short          needed_headroom;
>         unsigned short          needed_tailroom;
>
>
> Previously, there was a one byte hole at that spot due to a single
> "unsigned char" (after something aligned at least 4 bytes) followed by
> "unsigned short" - now you push that out a bit.
>
> If you place the variables a bit lower, below "name_assign_type", you
> probably fill a hole instead.
>
> Check out the 'pahole' tool.
>

Thank you for the detailed explanation.
I tested the pahole and found holes.

$ pahole ./vmlinux.o -C net_device
        unsigned char              addr_assign_type;     /*   598     1 */
        unsigned char              addr_len;             /*   599     1 */
        short unsigned int         neigh_priv_len;       /*   600     2 */
        short unsigned int         dev_id;               /*   602     2 */
        short unsigned int         dev_port;             /*   604     2 */

        /* XXX 2 bytes hole, try to pack */

I will place the variables here.

> johannes
>

Thank you so much!
Taehee
Sabrina Dubroca Oct. 10, 2019, 10:19 a.m. UTC | #7
2019-09-28, 16:48:32 +0000, Taehee Yoo wrote:
> @@ -6790,23 +6878,45 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
>  					void *data),
>  			      void *data)
>  {
> -	struct net_device *ldev;
> -	struct list_head *iter;
> -	int ret;
> +	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
> +	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
> +	int ret, cur = 0;
>  
> -	for (iter = &dev->adj_list.lower,
> -	     ldev = netdev_next_lower_dev(dev, &iter);
> -	     ldev;
> -	     ldev = netdev_next_lower_dev(dev, &iter)) {
> -		/* first is the lower device itself */
> -		ret = fn(ldev, data);
> -		if (ret)
> -			return ret;
> +	now = dev;
> +	iter = &dev->adj_list.lower;
>  
> -		/* then look at all of its lower devices */
> -		ret = netdev_walk_all_lower_dev(ldev, fn, data);
> -		if (ret)
> -			return ret;
> +	while (1) {
> +		if (now != dev) {
> +			ret = fn(now, data);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		next = NULL;
> +		while (1) {
> +			ldev = netdev_next_lower_dev(now, &iter);
> +			if (!ldev)
> +				break;
> +
> +			if (!next) {
> +				next = ldev;
> +				niter = &ldev->adj_list.lower;
> +			} else {
> +				dev_stack[cur] = ldev;
> +				iter_stack[cur++] = &ldev->adj_list.lower;
> +				break;
> +			}
> +		}
> +
> +		if (!next) {
> +			if (!cur)
> +				return 0;

Hmm, I don't think this condition is correct.

If we have this topology:


                bridge0
                /  |  \
               /   |   \
              /    |    \
        dummy0   vlan1   vlan2
                   |       \
                 dummy1    dummy2

We end up with the expected lower/upper levels for all devices:

    | device  | upper | lower |
    |---------+-------+-------|
    | dummy0  |     2 |     1 |
    | dummy1  |     3 |     1 |
    | dummy2  |     3 |     1 |
    | vlan1   |     2 |     2 |
    | vlan2   |     2 |     2 |
    | bridge0 |     1 |     3 |


If we then add macvlan0 on top of bridge0:


                macvlan0
                   |
                   |
                bridge0
                /  |  \
               /   |   \
              /    |    \
        dummy0   vlan1   vlan2
                   |       \
                 dummy1    dummy2


we can observe that __netdev_update_upper_level is only called for
some of the devices under bridge0. I added a perf probe:

 # perf probe -a '__netdev_update_upper_level dev->name:string'

which gets hit for bridge0 (called directly by
__netdev_upper_dev_link) and then dummy0, vlan1, dummy1. It is never
called for vlan2 and dummy2.

After this, we have the following levels (*):

    | device   | upper | lower |
    |----------+-------+-------|
    | dummy0   |     3 |     1 |
    | dummy1   |     4 |     1 |
    | dummy2   |     3 |     1 |
    | vlan1    |     3 |     2 |
    | vlan2    |     2 |     2 |
    | bridge0  |     2 |     3 |
    | macvlan0 |     1 |     4 |

For dummy0, dummy1, vlan1, the upper level has increased by 1, as
expected. For dummy2 and vlan2, it's still the same, which is wrong.


(*) observed easily by adding another probe:

 # perf probe -a 'dev_get_stats dev->name:string dev->upper_level dev->lower_level'

and running "ip link"

Or you can just add prints and recompile, of course :)

> +			next = dev_stack[--cur];
> +			niter = iter_stack[cur];
> +		}
> +
> +		now = next;
> +		iter = niter;
>  	}
>  
>  	return 0;
Taehee Yoo Oct. 12, 2019, 11:42 a.m. UTC | #8
On Thu, 10 Oct 2019 at 19:19, Sabrina Dubroca <sd@queasysnail.net> wrote:
>

Hi Sabrina,

Thank you for review and testing!

> 2019-09-28, 16:48:32 +0000, Taehee Yoo wrote:
> > @@ -6790,23 +6878,45 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
> >                                       void *data),
> >                             void *data)
> >  {
> > -     struct net_device *ldev;
> > -     struct list_head *iter;
> > -     int ret;
> > +     struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
> > +     struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
> > +     int ret, cur = 0;
> >
> > -     for (iter = &dev->adj_list.lower,
> > -          ldev = netdev_next_lower_dev(dev, &iter);
> > -          ldev;
> > -          ldev = netdev_next_lower_dev(dev, &iter)) {
> > -             /* first is the lower device itself */
> > -             ret = fn(ldev, data);
> > -             if (ret)
> > -                     return ret;
> > +     now = dev;
> > +     iter = &dev->adj_list.lower;
> >
> > -             /* then look at all of its lower devices */
> > -             ret = netdev_walk_all_lower_dev(ldev, fn, data);
> > -             if (ret)
> > -                     return ret;
> > +     while (1) {
> > +             if (now != dev) {
> > +                     ret = fn(now, data);
> > +                     if (ret)
> > +                             return ret;
> > +             }
> > +
> > +             next = NULL;
> > +             while (1) {
> > +                     ldev = netdev_next_lower_dev(now, &iter);
> > +                     if (!ldev)
> > +                             break;
> > +
> > +                     if (!next) {
> > +                             next = ldev;
> > +                             niter = &ldev->adj_list.lower;
> > +                     } else {
> > +                             dev_stack[cur] = ldev;
> > +                             iter_stack[cur++] = &ldev->adj_list.lower;
> > +                             break;
> > +                     }
> > +             }
> > +
> > +             if (!next) {
> > +                     if (!cur)
> > +                             return 0;
>
> Hmm, I don't think this condition is correct.
>
> If we have this topology:
>
>
>                 bridge0
>                 /  |  \
>                /   |   \
>               /    |    \
>         dummy0   vlan1   vlan2
>                    |       \
>                  dummy1    dummy2
>
> We end up with the expected lower/upper levels for all devices:
>
>     | device  | upper | lower |
>     |---------+-------+-------|
>     | dummy0  |     2 |     1 |
>     | dummy1  |     3 |     1 |
>     | dummy2  |     3 |     1 |
>     | vlan1   |     2 |     2 |
>     | vlan2   |     2 |     2 |
>     | bridge0 |     1 |     3 |
>
>
> If we then add macvlan0 on top of bridge0:
>
>
>                 macvlan0
>                    |
>                    |
>                 bridge0
>                 /  |  \
>                /   |   \
>               /    |    \
>         dummy0   vlan1   vlan2
>                    |       \
>                  dummy1    dummy2
>
>
> we can observe that __netdev_update_upper_level is only called for
> some of the devices under bridge0. I added a perf probe:
>
>  # perf probe -a '__netdev_update_upper_level dev->name:string'
>
> which gets hit for bridge0 (called directly by
> __netdev_upper_dev_link) and then dummy0, vlan1, dummy1. It is never
> called for vlan2 and dummy2.
>
> After this, we have the following levels (*):
>
>     | device   | upper | lower |
>     |----------+-------+-------|
>     | dummy0   |     3 |     1 |
>     | dummy1   |     4 |     1 |
>     | dummy2   |     3 |     1 |
>     | vlan1    |     3 |     2 |
>     | vlan2    |     2 |     2 |
>     | bridge0  |     2 |     3 |
>     | macvlan0 |     1 |     4 |
>
> For dummy0, dummy1, vlan1, the upper level has increased by 1, as
> expected. For dummy2 and vlan2, it's still the same, which is wrong.
>
>
> (*) observed easily by adding another probe:
>
>  # perf probe -a 'dev_get_stats dev->name:string dev->upper_level dev->lower_level'
>
> and running "ip link"
>
> Or you can just add prints and recompile, of course :)
>

Thank you so much, I found a bug very easily with your test config.
I will fix this bug in a v5 patch.

> > +                     next = dev_stack[--cur];
> > +                     niter = iter_stack[cur];
> > +             }
> > +
> > +             now = next;
> > +             iter = niter;
> >       }
> >
> >       return 0;
>
> --
> Sabrina

Thank you,
Taehee Yoo
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9eda1c31d1f7..613007aa5986 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1637,6 +1637,8 @@  enum netdev_priv_flags {
  *	@type:		Interface hardware type
  *	@hard_header_len: Maximum hardware header length.
  *	@min_header_len:  Minimum hardware header length
+ *	@upper_level:	Maximum depth level of upper devices.
+ *	@lower_level:	Maximum depth level of lower devices.
  *
  *	@needed_headroom: Extra headroom the hardware may need, but not in all
  *			  cases can this be guaranteed
@@ -1867,6 +1869,8 @@  struct net_device {
 	unsigned short		type;
 	unsigned short		hard_header_len;
 	unsigned char		min_header_len;
+	unsigned char		upper_level;
+	unsigned char		lower_level;
 
 	unsigned short		needed_headroom;
 	unsigned short		needed_tailroom;
diff --git a/net/core/dev.c b/net/core/dev.c
index bf3ed413abaf..13cb646fb98f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,7 @@ 
 #include "net-sysfs.h"
 
 #define MAX_GRO_SKBS 8
+#define MAX_NEST_DEV 8
 
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -6644,6 +6645,21 @@  struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 
+static struct net_device *netdev_next_upper_dev(struct net_device *dev,
+						struct list_head **iter)
+{
+	struct netdev_adjacent *upper;
+
+	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+	if (&upper->list == &dev->adj_list.upper)
+		return NULL;
+
+	*iter = &upper->list;
+
+	return upper->dev;
+}
+
 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 						    struct list_head **iter)
 {
@@ -6661,31 +6677,103 @@  static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 	return upper->dev;
 }
 
+int netdev_walk_all_upper_dev(struct net_device *dev,
+			      int (*fn)(struct net_device *dev,
+					void *data),
+			      void *data)
+{
+	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+	int ret, cur = 0;
+
+	now = dev;
+	iter = &dev->adj_list.upper;
+
+	while (1) {
+		if (now != dev) {
+			ret = fn(now, data);
+			if (ret)
+				return ret;
+		}
+
+		next = NULL;
+		while (1) {
+			udev = netdev_next_upper_dev(now, &iter);
+			if (!udev)
+				break;
+
+			if (!next) {
+				next = udev;
+				niter = &udev->adj_list.upper;
+			} else {
+				dev_stack[cur] = udev;
+				iter_stack[cur++] = &udev->adj_list.upper;
+				break;
+			}
+		}
+
+		if (!next) {
+			if (!cur)
+				return 0;
+			next = dev_stack[--cur];
+			niter = iter_stack[cur];
+		}
+
+		now = next;
+		iter = niter;
+	}
+
+	return 0;
+}
+
 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 				  int (*fn)(struct net_device *dev,
 					    void *data),
 				  void *data)
 {
-	struct net_device *udev;
-	struct list_head *iter;
-	int ret;
+	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+	int ret, cur = 0;
 
-	for (iter = &dev->adj_list.upper,
-	     udev = netdev_next_upper_dev_rcu(dev, &iter);
-	     udev;
-	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
-		/* first is the upper device itself */
-		ret = fn(udev, data);
-		if (ret)
-			return ret;
+	now = dev;
+	iter = &dev->adj_list.upper;
 
-		/* then look at all of its upper devices */
-		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
-		if (ret)
-			return ret;
+	while (1) {
+		if (now != dev) {
+			ret = fn(now, data);
+			if (ret)
+				return ret;
+		}
+
+		next = NULL;
+		while (1) {
+			udev = netdev_next_upper_dev_rcu(now, &iter);
+			if (!udev)
+				break;
+
+			if (!next) {
+				next = udev;
+				niter = &udev->adj_list.upper;
+			} else {
+				dev_stack[cur] = udev;
+				iter_stack[cur++] = &udev->adj_list.upper;
+				break;
+			}
+		}
+
+		if (!next) {
+			if (!cur)
+				return 0;
+			next = dev_stack[--cur];
+			niter = iter_stack[cur];
+		}
+
+		now = next;
+		iter = niter;
 	}
 
 	return 0;
+
 }
 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 
@@ -6790,23 +6878,45 @@  int netdev_walk_all_lower_dev(struct net_device *dev,
 					void *data),
 			      void *data)
 {
-	struct net_device *ldev;
-	struct list_head *iter;
-	int ret;
+	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+	int ret, cur = 0;
 
-	for (iter = &dev->adj_list.lower,
-	     ldev = netdev_next_lower_dev(dev, &iter);
-	     ldev;
-	     ldev = netdev_next_lower_dev(dev, &iter)) {
-		/* first is the lower device itself */
-		ret = fn(ldev, data);
-		if (ret)
-			return ret;
+	now = dev;
+	iter = &dev->adj_list.lower;
 
-		/* then look at all of its lower devices */
-		ret = netdev_walk_all_lower_dev(ldev, fn, data);
-		if (ret)
-			return ret;
+	while (1) {
+		if (now != dev) {
+			ret = fn(now, data);
+			if (ret)
+				return ret;
+		}
+
+		next = NULL;
+		while (1) {
+			ldev = netdev_next_lower_dev(now, &iter);
+			if (!ldev)
+				break;
+
+			if (!next) {
+				next = ldev;
+				niter = &ldev->adj_list.lower;
+			} else {
+				dev_stack[cur] = ldev;
+				iter_stack[cur++] = &ldev->adj_list.lower;
+				break;
+			}
+		}
+
+		if (!next) {
+			if (!cur)
+				return 0;
+			next = dev_stack[--cur];
+			niter = iter_stack[cur];
+		}
+
+		now = next;
+		iter = niter;
 	}
 
 	return 0;
@@ -6827,31 +6937,100 @@  static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 	return lower->dev;
 }
 
-int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
-				  int (*fn)(struct net_device *dev,
-					    void *data),
-				  void *data)
+static u8 __netdev_upper_depth(struct net_device *dev)
+{
+	struct net_device *udev;
+	struct list_head *iter;
+	u8 max_depth = 0;
+
+	for (iter = &dev->adj_list.upper,
+	     udev = netdev_next_upper_dev(dev, &iter);
+	     udev;
+	     udev = netdev_next_upper_dev(dev, &iter)) {
+		if (max_depth < udev->upper_level)
+			max_depth = udev->upper_level;
+	}
+
+	return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
 {
 	struct net_device *ldev;
 	struct list_head *iter;
-	int ret;
+	u8 max_depth = 0;
 
 	for (iter = &dev->adj_list.lower,
-	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
+	     ldev = netdev_next_lower_dev(dev, &iter);
 	     ldev;
-	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
-		/* first is the lower device itself */
-		ret = fn(ldev, data);
-		if (ret)
-			return ret;
+	     ldev = netdev_next_lower_dev(dev, &iter)) {
+		if (max_depth < ldev->lower_level)
+			max_depth = ldev->lower_level;
+	}
 
-		/* then look at all of its lower devices */
-		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
-		if (ret)
-			return ret;
+	return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev, void *data)
+{
+	dev->upper_level = __netdev_upper_depth(dev) + 1;
+	return 0;
+}
+
+static int __netdev_update_lower_level(struct net_device *dev, void *data)
+{
+	dev->lower_level = __netdev_lower_depth(dev) + 1;
+	return 0;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *dev,
+					    void *data),
+				  void *data)
+{
+	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+	int ret, cur = 0;
+
+	now = dev;
+	iter = &dev->adj_list.lower;
+
+	while (1) {
+		if (now != dev) {
+			ret = fn(now, data);
+			if (ret)
+				return ret;
+		}
+
+		next = NULL;
+		while (1) {
+			ldev = netdev_next_lower_dev_rcu(now, &iter);
+			if (!ldev)
+				break;
+
+			if (!next) {
+				next = ldev;
+				niter = &ldev->adj_list.lower;
+			} else {
+				dev_stack[cur] = ldev;
+				iter_stack[cur++] = &ldev->adj_list.lower;
+				break;
+			}
+		}
+
+		if (!next) {
+			if (!cur)
+				return 0;
+			next = dev_stack[--cur];
+			niter = iter_stack[cur];
+		}
+
+		now = next;
+		iter = niter;
 	}
 
 	return 0;
+
 }
 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 
@@ -7105,6 +7284,9 @@  static int __netdev_upper_dev_link(struct net_device *dev,
 	if (netdev_has_upper_dev(upper_dev, dev))
 		return -EBUSY;
 
+	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+		return -EMLINK;
+
 	if (!master) {
 		if (netdev_has_upper_dev(dev, upper_dev))
 			return -EEXIST;
@@ -7131,6 +7313,12 @@  static int __netdev_upper_dev_link(struct net_device *dev,
 	if (ret)
 		goto rollback;
 
+	__netdev_update_upper_level(dev, NULL);
+	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+	__netdev_update_lower_level(upper_dev, NULL);
+	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+
 	return 0;
 
 rollback:
@@ -7213,6 +7401,12 @@  void netdev_upper_dev_unlink(struct net_device *dev,
 
 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 				      &changeupper_info.info);
+
+	__netdev_update_upper_level(dev, NULL);
+	netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+	__netdev_update_lower_level(upper_dev, NULL);
+	netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
@@ -9212,6 +9406,8 @@  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 	dev->gso_max_segs = GSO_MAX_SEGS;
+	dev->upper_level = 1;
+	dev->lower_level = 1;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);