Message ID | 50858ABD.2000206@hp.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On 10/22/2012 10:04 PM, Brian Haley wrote: > On 10/19/2012 05:55 AM, Pavel Emelyanov wrote: >> The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but >> cannot be get via sockopt API. The only way we can find the device id a >> socket is bound to is via sock-diag interface. But the diag works only on >> hashed sockets, while the opt in question can be set for yet unhashed one. >> >> That said, in order to know what device a socket is bound to (we do want >> to know this in checkpoint-restore project) I propose to make this option >> getsockopt-able and report the respective device index. >> >> Another solution to the problem might be to teach the sock-diag reporting >> info on unhashed sockets. Should I go this way instead? >> >> Signed-off-by: Pavel Emelyanov <xemul@parallels.com> >> >> --- >> >> diff --git a/net/core/sock.c b/net/core/sock.c >> index 8a146cf..c49412c 100644 >> --- a/net/core/sock.c >> +++ b/net/core/sock.c >> @@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, >> case SO_NOFCS: >> v.val = sock_flag(sk, SOCK_NOFCS); >> break; >> + case SO_BINDTODEVICE: >> + v.val = sk->sk_bound_dev_if; >> + break; >> default: >> return -ENOPROTOOPT; >> } > > Doesn't this make the set and get non-symmetrical? For example, setsockopt() > would take "eth0", but getsockopt() would return 2. It will, but since device name and index are two equal device "IDs" I assumed it would be OK. However, some comments inline. > The following patch would return a string, or -ENODEV if not set. > > -Brian > > --- > > Change getsockopt(SO_BINDTODEVICE) to be symmetrical with setsockopt() by > returning the interface name as a string. 
> > Signed-off-by: Brian Haley <brian.haley@hp.com> > > diff --git a/net/core/sock.c b/net/core/sock.c > index c49412c..69b9d92 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) > } > EXPORT_SYMBOL(sk_dst_check); > > -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) > +static int sock_setbindtodevice(struct sock *sk, char __user *optval, > + int optlen) > { > int ret = -ENOPROTOOPT; > #ifdef CONFIG_NETDEVICES > @@ -562,6 +563,49 @@ out: > return ret; > } > > +static int sock_getbindtodevice(struct sock *sk, char __user *optval, > + int __user *optlen, int len) > +{ > + int ret = -ENOPROTOOPT; > +#ifdef CONFIG_NETDEVICES > + struct net *net = sock_net(sk); > + struct net_device *dev; > + char devname[IFNAMSIZ]; > + > + ret = 0; > + if (sk->sk_bound_dev_if == 0) > + goto out; It will return 0 if device is not set, thus making it impossible to detect this situation. > + ret = -EINVAL; > + if (len < IFNAMSIZ) > + goto out; > + if (len > IFNAMSIZ) > + len = IFNAMSIZ; > + > + rcu_read_lock(); > + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); > + if (dev) > + strcpy(dev->name, devname); > + rcu_read_unlock(); > + ret = -ENODEV; > + if (!dev) > + goto out; > + > + ret = -EFAULT; > + if (copy_to_user(optval, devname, len)) > + goto out; > + > + if (put_user(len, optlen)) > + goto out; What's the point in reporting IFNAMSIZ to the userspace always, taking into account that this constant is exported there anyway? 
> + ret = 0; > + > +out: > +#endif > + > + return ret; > +} > + > static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) > { > if (valbool) > @@ -589,7 +633,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, > */ > > if (optname == SO_BINDTODEVICE) > - return sock_bindtodevice(sk, optval, optlen); > + return sock_setbindtodevice(sk, optval, optlen); > > if (optlen < sizeof(int)) > return -EINVAL; > @@ -1074,9 +1118,10 @@ int sock_getsockopt(struct socket *sock, int level, int > optname, > case SO_NOFCS: > v.val = sock_flag(sk, SOCK_NOFCS); > break; > + > case SO_BINDTODEVICE: > - v.val = sk->sk_bound_dev_if; > - break; > + return sock_getbindtodevice(sk, optval, optlen, len); > + > default: > return -ENOPROTOOPT; > } > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > . > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote: > + char devname[IFNAMSIZ]; > + > + ret = 0; > + if (sk->sk_bound_dev_if == 0) > + goto out; > + > + ret = -EINVAL; > + if (len < IFNAMSIZ) > + goto out; > + if (len > IFNAMSIZ) > + len = IFNAMSIZ; > + > + rcu_read_lock(); > + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); > + if (dev) > + strcpy(dev->name, devname); > + rcu_read_unlock(); > + ret = -ENODEV; You probably meant strcpy(devname, dev->name) By the way, this is not really safe in case device is renamed -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/23/2012 12:45 AM, Eric Dumazet wrote: > On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote: > >> + char devname[IFNAMSIZ]; >> + >> + ret = 0; >> + if (sk->sk_bound_dev_if == 0) >> + goto out; >> + >> + ret = -EINVAL; >> + if (len < IFNAMSIZ) >> + goto out; >> + if (len > IFNAMSIZ) >> + len = IFNAMSIZ; >> + >> + rcu_read_lock(); >> + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); >> + if (dev) >> + strcpy(dev->name, devname); >> + rcu_read_unlock(); >> + ret = -ENODEV; > > You probably meant > > strcpy(devname, dev->name) > > By the way, this is not really safe in case device is renamed Good point, actually. Getting a device name may not be very safe: once we have the name, there's no 100% guarantee that this name still corresponds to the actual device the socket is bound to (it could be renamed after we strcpy-ed its name). This problem doesn't exist when we get the device index, as it cannot be changed. Thanks, Pavel -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/22/2012 04:45 PM, Eric Dumazet wrote: > On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote: > >> + char devname[IFNAMSIZ]; >> + >> + ret = 0; >> + if (sk->sk_bound_dev_if == 0) >> + goto out; >> + >> + ret = -EINVAL; >> + if (len < IFNAMSIZ) >> + goto out; >> + if (len > IFNAMSIZ) >> + len = IFNAMSIZ; >> + >> + rcu_read_lock(); >> + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); >> + if (dev) >> + strcpy(dev->name, devname); >> + rcu_read_unlock(); >> + ret = -ENODEV; > > You probably meant > > strcpy(devname, dev->name) Yes, that was a stupid mistake, I'll fix it. > By the way, this is not really safe in case device is renamed It's not much different from what's there: setsockopt("foo"); rename foo -> bar index = getsockopt(); if_indextoname(index) -> "bar" I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *' to getsockopt(), I don't think any other value is non-symmetrical like this. -Brian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/22/2012 04:28 PM, Pavel Emelyanov wrote: > On 10/22/2012 10:04 PM, Brian Haley wrote: >> On 10/19/2012 05:55 AM, Pavel Emelyanov wrote: >>> The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but >>> cannot be get via sockopt API. The only way we can find the device id a >>> socket is bound to is via sock-diag interface. But the diag works only on >>> hashed sockets, while the opt in question can be set for yet unhashed one. >>> >>> That said, in order to know what device a socket is bound to (we do want >>> to know this in checkpoint-restore project) I propose to make this option >>> getsockopt-able and report the respective device index. >>> >>> Another solution to the problem might be to teach the sock-diag reporting >>> info on unhashed sockets. Should I go this way instead? >>> >>> Signed-off-by: Pavel Emelyanov <xemul@parallels.com> >>> >>> --- >>> >>> diff --git a/net/core/sock.c b/net/core/sock.c >>> index 8a146cf..c49412c 100644 >>> --- a/net/core/sock.c >>> +++ b/net/core/sock.c >>> @@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, >>> case SO_NOFCS: >>> v.val = sock_flag(sk, SOCK_NOFCS); >>> break; >>> + case SO_BINDTODEVICE: >>> + v.val = sk->sk_bound_dev_if; >>> + break; >>> default: >>> return -ENOPROTOOPT; >>> } >> >> Doesn't this make the set and get non-symmetrical? For example, setsockopt() >> would take "eth0", but getsockopt() would return 2. > > It will, but since device name and index are two equal device "IDs" I assumed > it would be OK. > > However, some comments inline. > >> The following patch would return a string, or -ENODEV if not set. 
Sorry, my description is not quite right; to be correct, this should return something like this: 0 on success (with optlen set to zero if no interface is set, or optlen > zero if one is set and optval is filled in), or -errno on failure. >> +static int sock_getbindtodevice(struct sock *sk, char __user *optval, >> + int __user *optlen, int len) >> +{ >> + int ret = -ENOPROTOOPT; >> +#ifdef CONFIG_NETDEVICES >> + struct net *net = sock_net(sk); >> + struct net_device *dev; >> + char devname[IFNAMSIZ]; >> + >> + ret = 0; >> + if (sk->sk_bound_dev_if == 0) >> + goto out; > > It will return 0 if device is not set, thus making it impossible to detect > this situation. See below. >> + ret = -EINVAL; >> + if (len < IFNAMSIZ) >> + goto out; >> + if (len > IFNAMSIZ) >> + len = IFNAMSIZ; >> + >> + rcu_read_lock(); >> + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); >> + if (dev) >> + strcpy(dev->name, devname); >> + rcu_read_unlock(); >> + ret = -ENODEV; >> + if (!dev) >> + goto out; >> + >> + ret = -EFAULT; >> + if (copy_to_user(optval, devname, len)) >> + goto out; >> + >> + if (put_user(len, optlen)) >> + goto out; > > What's the point in reporting IFNAMSIZ to the userspace always, taking > into account that this constant is exported there anyway? I should have put an RFC on the patch :) In the case that there is no interface, the length returned would be zero, indicating nothing was there. I can post another version. -Brian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote: > It's not much different from what's there: > > setsockopt("foo"); > > rename foo -> bar > > index = getsockopt(); > if_indextoname(index) -> "bar" > > I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *' > to getsockopt(), I don't think any other value is non-symmetrical like this. > > -Brian I meant another CPU can be changing dev->name[] content while the strcpy() is done, and you get a mangled devname, like "for" or "bao" instead of "foo" or "bar". But yes, I obviously understood your point about "char *" and "int *". -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/22/2012 05:37 PM, Eric Dumazet wrote: > On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote: > >> It's not much different from what's there: >> >> setsockopt("foo"); >> >> rename foo -> bar >> >> index = getsockopt(); >> if_indextoname(index) -> "bar" >> >> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *' >> to getsockopt(), I don't think any other value is non-symmetrical like this. >> >> -Brian > > I meant another cpu can be changing dev->name[] content while the > strcpy() is done, and you get a mangled devname, like "for" or "bao" > instead of "foo" or "bar" Even when holding the rcu_read_lock()? I'd have to hold the rtnl lock there? -Brian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2012-10-22 at 17:47 -0400, Brian Haley wrote: > On 10/22/2012 05:37 PM, Eric Dumazet wrote: > > On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote: > > > >> It's not much different from what's there: > >> > >> setsockopt("foo"); > >> > >> rename foo -> bar > >> > >> index = getsockopt(); > >> if_indextoname(index) -> "bar" > >> > >> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *' > >> to getsockopt(), I don't think any other value is non-symmetrical like this. > >> > >> -Brian > > > > I meant another cpu can be changing dev->name[] content while the > > strcpy() is done, and you get a mangled devname, like "for" or "bao" > > instead of "foo" or "bar" > > Even when holding the rcu_read_lock()? I'd have to hold the rtnl lock there? Yes, rcu_read_lock() only makes sure the device doesn't disappear. But its name can be changed. You could use a seqcount_t, so that readers don't have to lock rtnl. But do we really want to return a name here? I am not yet convinced. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2012-10-22 at 23:52 +0200, Eric Dumazet wrote: > On Mon, 2012-10-22 at 17:47 -0400, Brian Haley wrote: > > On 10/22/2012 05:37 PM, Eric Dumazet wrote: > > > On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote: > > > > > >> It's not much different from what's there: > > >> > > >> setsockopt("foo"); > > >> > > >> rename foo -> bar > > >> > > >> index = getsockopt(); > > >> if_indextoname(index) -> "bar" > > >> > > >> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *' > > >> to getsockopt(), I don't think any other value is non-symmetrical like this. > > >> > > >> -Brian > > > > > > I meant another cpu can be changing dev->name[] content while the > > > strcpy() is done, and you get a mangled devname, like "for" or "bao" > > > instead of "foo" or "bar" > > > > Even when holding the rcu_read_lock()? I'd have to hold the rtnl lock there? > > Yes, rcu_read_lock() only makes sure the device doesnt disappear. > > But its name can be changed. > > You could use a seqcount_t, so that readers dont have to lock rtnl. > > But do we really want to return a name here, I am not yet convinced. If setsockopt() takes a name then it makes no sense that getsockopt() would return an index. Perhaps an SO_BINDTOIFINDEX would be useful, but let's not make SO_BINDTODEVICE mean two different things. Ben.
On 10/22/2012 05:52 PM, Eric Dumazet wrote: >>> I meant another cpu can be changing dev->name[] content while the >>> strcpy() is done, and you get a mangled devname, like "for" or "bao" >>> instead of "foo" or "bar" >> >> Even when holding the rcu_read_lock()? I'd have to hold the rtnl lock there? > > Yes, rcu_read_lock() only makes sure the device doesnt disappear. > > But its name can be changed. There's a similar bug in the SIOCGIFNAME/dev_ifname() code too then, but I would think this rare enough that it doesn't happen in practice. -Brian -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/core/sock.c b/net/core/sock.c index c49412c..69b9d92 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +static int sock_setbindtodevice(struct sock *sk, char __user *optval, + int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -562,6 +563,49 @@ out: return ret; } +static int sock_getbindtodevice(struct sock *sk, char __user *optval, + int __user *optlen, int len) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + struct net_device *dev; + char devname[IFNAMSIZ]; + + ret = 0; + if (sk->sk_bound_dev_if == 0) + goto out; + + ret = -EINVAL; + if (len < IFNAMSIZ) + goto out; + if (len > IFNAMSIZ) + len = IFNAMSIZ; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (dev) + strcpy(dev->name, devname); + rcu_read_unlock(); + ret = -ENODEV; + if (!dev) + goto out; + + ret = -EFAULT; + if (copy_to_user(optval, devname, len)) + goto out; + + if (put_user(len, optlen)) + goto out; + + ret = 0; + +out: +#endif + + return ret; +} + static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) { if (valbool) @@ -589,7 +633,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, */ if (optname == SO_BINDTODEVICE) - return sock_bindtodevice(sk, optval, optlen); + return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -1074,9 +1118,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; + case SO_BINDTODEVICE: - v.val = sk->sk_bound_dev_if; - break; + return sock_getbindtodevice(sk, optval, optlen, len); + default: return -ENOPROTOOPT;