Message ID | 1266271241-6293-4-git-send-email-opurdila@ixiacom.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
Octavian Purdila wrote: > This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports > (bitmap type) which allows users to reserve ports for third-party > applications. > > The reserved ports will not be used by automatic port assignments > (e.g. when calling connect() or bind() with port number 0). Explicit > port allocation behavior is unchanged. > > Signed-off-by: Octavian Purdila <opurdila@ixiacom.com> > Signed-off-by: WANG Cong <amwang@redhat.com> > Cc: Neil Horman <nhorman@tuxdriver.com> > Cc: Eric Dumazet <eric.dumazet@gmail.com> > --- > Documentation/networking/ip-sysctl.txt | 12 ++++++++++++ > drivers/infiniband/core/cma.c | 7 ++++++- > include/net/ip.h | 6 ++++++ > net/ipv4/af_inet.c | 8 +++++++- > net/ipv4/inet_connection_sock.c | 6 ++++++ > net/ipv4/inet_hashtables.c | 2 ++ > net/ipv4/sysctl_net_ipv4.c | 17 +++++++++++++++++ > net/ipv4/udp.c | 3 ++- > net/sctp/socket.c | 2 ++ > 9 files changed, 60 insertions(+), 3 deletions(-) > > diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt > index 2dc7a1d..23be7a4 100644 > --- a/Documentation/networking/ip-sysctl.txt > +++ b/Documentation/networking/ip-sysctl.txt > @@ -564,6 +564,18 @@ ip_local_port_range - 2 INTEGERS > (i.e. by default) range 1024-4999 is enough to issue up to > 2000 connections per second to systems supporting timestamps. > > +ip_local_reserved_ports - BITMAP of 65536 ports > + Specify the ports which are reserved for known third-party > + applications. These ports will not be used by automatic port assignments > + (e.g. when calling connect() or bind() with port number 0). Explicit > + port allocation behavior is unchanged. > + > + Reserving ports is done by writing positive numbers in this proc entry, > + clearing them is done by writing negative numbers (e.g. 8080 reserves > + port number, -8080 makes it available for automatic assignment again). > + > + Default: Empty > + > ip_nonlocal_bind - BOOLEAN > If set, allows processes to bind() to non-local IP addresses, > which can be quite useful - but may break some applications. > diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c > index cc9b594..8248fc6 100644 > --- a/drivers/infiniband/core/cma.c > +++ b/drivers/infiniband/core/cma.c > @@ -1979,6 +1979,8 @@ retry: > /* FIXME: add proper port randomization per like inet_csk_get_port */ > do { > ret = idr_get_new_above(ps, bind_list, next_port, &port); > + if (inet_is_reserved_local_port(port)) > + ret = -EAGAIN; > } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); > > if (ret) > @@ -2997,10 +2999,13 @@ static int __init cma_init(void) > { > int ret, low, high, remaining; > > - get_random_bytes(&next_port, sizeof next_port); > inet_get_local_port_range(&low, &high); > +again: > + get_random_bytes(&next_port, sizeof next_port); > remaining = (high - low) + 1; > next_port = ((unsigned int) next_port % remaining) + low; > + if (inet_is_reserved_local_port(next_port)) > + goto again; > > cma_wq = create_singlethread_workqueue("rdma_cm"); > if (!cma_wq) > diff --git a/include/net/ip.h b/include/net/ip.h > index fb63371..2e24256 100644 > --- a/include/net/ip.h > +++ b/include/net/ip.h > @@ -184,6 +184,12 @@ extern struct local_ports { > } sysctl_local_ports; > extern void inet_get_local_port_range(int *low, int *high); > > +extern unsigned long *sysctl_local_reserved_ports; > +static inline int inet_is_reserved_local_port(int port) > +{ > + return test_bit(port, sysctl_local_reserved_ports); > +} > + > extern int sysctl_ip_default_ttl; > extern int sysctl_ip_nonlocal_bind; > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index 7d12c6a..06810b0 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -1546,9 +1546,13 @@ static int __init inet_init(void) > > BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); > > + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); > + if (!sysctl_local_reserved_ports) > + goto out; > + I think we should also consider the ports in ip_local_port_range, since we can only reserve the ports in that range. > rc = proto_register(&tcp_prot, 1); > if (rc) > - goto out; > + goto out_free_reserved_ports; > > rc = proto_register(&udp_prot, 1); > if (rc) > @@ -1647,6 +1651,8 @@ out_unregister_udp_proto: > proto_unregister(&udp_prot); > out_unregister_tcp_proto: > proto_unregister(&tcp_prot); > +out_free_reserved_ports: > + kfree(sysctl_local_reserved_ports); > goto out; > } > > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > index 8da6429..1acb462 100644 > --- a/net/ipv4/inet_connection_sock.c > +++ b/net/ipv4/inet_connection_sock.c > @@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = { > .range = { 32768, 61000 }, > }; > > +unsigned long *sysctl_local_reserved_ports; > +EXPORT_SYMBOL(sysctl_local_reserved_ports); > + > void inet_get_local_port_range(int *low, int *high) > { > unsigned seq; > @@ -108,6 +111,8 @@ again: > > smallest_size = -1; > do { > + if (inet_is_reserved_local_port(rover)) > + goto next_nolock; > head = &hashinfo->bhash[inet_bhashfn(net, rover, > hashinfo->bhash_size)]; > spin_lock(&head->lock); > @@ -130,6 +135,7 @@ again: > break; > next: > spin_unlock(&head->lock); > + next_nolock: > if (++rover > high) > rover = low; > } while (--remaining > 0); > diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c > index 2b79377..d3e160a 100644 > --- a/net/ipv4/inet_hashtables.c > +++ b/net/ipv4/inet_hashtables.c > @@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, > local_bh_disable(); > for (i = 1; i <= remaining; i++) { > port = low + (i + offset) % remaining; > + if (inet_is_reserved_local_port(port)) > + continue; > head = &hinfo->bhash[inet_bhashfn(net, port, > hinfo->bhash_size)]; > spin_lock(&head->lock); > diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c > index 7e3712c..ce3597a 100644 > --- a/net/ipv4/sysctl_net_ipv4.c > +++ b/net/ipv4/sysctl_net_ipv4.c > @@ -298,6 +298,13 @@ static struct ctl_table ipv4_table[] = { > .mode = 0644, > .proc_handler = ipv4_local_port_range, > }, > + { > + .procname = "ip_local_reserved_ports", > + .data = NULL, /* initialized in sysctl_ipv4_init */ > + .maxlen = 65536, > + .mode = 0644, > + .proc_handler = proc_dobitmap, > + }, Isn't there an off-by-one here? In patch 2/3, you use 0 to set the fist bit, then how about 65535 which writes 65536th bit? This is beyond the range of port number. > #ifdef CONFIG_IP_MULTICAST > { > .procname = "igmp_max_memberships", > @@ -721,6 +728,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { > static __init int sysctl_ipv4_init(void) > { > struct ctl_table_header *hdr; > + struct ctl_table *i; > + > + for (i = ipv4_table; i->procname; i++) { > + if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { > + i->data = sysctl_local_reserved_ports; > + break; > + } > + } > + if (!i->procname[0]) > + return -EINVAL; > > hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); > if (hdr == NULL) > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index 608a544..bfd0a6a 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -232,7 +232,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, > */ > do { > if (low <= snum && snum <= high && > - !test_bit(snum >> udptable->log, bitmap)) > + !test_bit(snum >> udptable->log, bitmap) && > + !inet_is_reserved_local_port(snum)) > goto found; > snum += rand; > } while (snum != first); > diff --git a/net/sctp/socket.c b/net/sctp/socket.c > index f6d1e59..1f839d0 100644 > --- a/net/sctp/socket.c > +++ b/net/sctp/socket.c > @@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) > rover++; > if ((rover < low) || (rover > high)) > rover = low; > + if (inet_is_reserved_local_port(rover)) > + continue; > index = sctp_phashfn(rover); > head = &sctp_port_hashtable[index]; > sctp_spin_lock(&head->lock); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tuesday 16 February 2010 11:37:04 you wrote: > > BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); > > > > + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); > > + if (!sysctl_local_reserved_ports) > > + goto out; > > + > > I think we should also consider the ports in ip_local_port_range, > since we can only reserve the ports in that range. > That is subject to changes at runtime, which means we will have to readjust the bitmap at runtime which introduces the need for additional synchronization operations which I would rather avoid. > > + { > > + .procname = "ip_local_reserved_ports", > > + .data = NULL, /* initialized in sysctl_ipv4_init */ > > + .maxlen = 65536, > > + .mode = 0644, > > + .proc_handler = proc_dobitmap, > > + }, > > Isn't there an off-by-one here? > > In patch 2/3, you use 0 to set the fist bit, then how about 65535 which > writes 65536th bit? This is beyond the range of port number. > This seems fine to me, 65535 is the value used by both the port checking function and the proc read/write function. And it translates indeed to 65536th bit, but that is also bit 65535 if you start counting bits from 0 instead of 1. The usual computing/natural arithmetic confusion for the meaning of first :) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Octavian Purdila wrote: > On Tuesday 16 February 2010 11:37:04 you wrote: >>> BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); >>> >>> + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); >>> + if (!sysctl_local_reserved_ports) >>> + goto out; >>> + >> I think we should also consider the ports in ip_local_port_range, >> since we can only reserve the ports in that range. >> > > That is subject to changes at runtime, which means we will have to readjust > the bitmap at runtime which introduces the need for additional synchronization > operations which I would rather avoid. Why? As long as the bitmap is global, this will not be hard. Consider that if one user writes a port number which is beyond the ip_local_port_range into ip_local_reserved_ports, we should not accept this, because it doesn't make any sense. But with your patch, we do. > >>> + { >>> + .procname = "ip_local_reserved_ports", >>> + .data = NULL, /* initialized in sysctl_ipv4_init */ >>> + .maxlen = 65536, >>> + .mode = 0644, >>> + .proc_handler = proc_dobitmap, >>> + }, >> Isn't there an off-by-one here? >> >> In patch 2/3, you use 0 to set the fist bit, then how about 65535 which >> writes 65536th bit? This is beyond the range of port number. >> > > This seems fine to me, 65535 is the value used by both the port checking > function and the proc read/write function. And it translates indeed to > 65536th bit, but that is also bit 65535 if you start counting bits from 0 > instead of 1. The usual computing/natural arithmetic confusion for the meaning > of first :) > Oh, I see. Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le mardi 16 février 2010 à 21:06 +0800, Cong Wang a écrit : > Octavian Purdila wrote: > > On Tuesday 16 February 2010 11:37:04 you wrote: > >>> BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); > >>> > >>> + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); > >>> + if (!sysctl_local_reserved_ports) > >>> + goto out; > >>> + > >> I think we should also consider the ports in ip_local_port_range, > >> since we can only reserve the ports in that range. > >> > > > > That is subject to changes at runtime, which means we will have to readjust > > the bitmap at runtime which introduces the need for additional synchronization > > operations which I would rather avoid. > > Why? As long as the bitmap is global, this will not be hard. > > Consider that if one user writes a port number which is beyond > the ip_local_port_range into ip_local_reserved_ports, we should > not accept this, because it doesn't make any sense. But with your > patch, we do. I disagree with you. This is perfectly OK. A port not being flagged in ip_local_reserved_ports doesnt mean it can be used for allocation. If you want to really block ports from being used at boot, you could for example : # temporarly reduce the ip_local_port_range echo "61000 61001" >/proc/sys/net/ipv4/ip_local_port_range # Build our bitmap (could be slow, if a remote database is read) for port in $LIST_RESERVED_PORT do echo $port >/proc/sys/net/ipv4/ip_local_reserved_ports done echo "10000 61000" >/proc/sys/net/ipv4/ip_local_port_range -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tuesday 16 February 2010 15:06:26 you wrote: > Octavian Purdila wrote: > > On Tuesday 16 February 2010 11:37:04 you wrote: > >>> BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); > >>> > >>> + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); > >>> + if (!sysctl_local_reserved_ports) > >>> + goto out; > >>> + > >> > >> I think we should also consider the ports in ip_local_port_range, > >> since we can only reserve the ports in that range. > > > > That is subject to changes at runtime, which means we will have to > > readjust the bitmap at runtime which introduces the need for additional > > synchronization operations which I would rather avoid. > > Why? As long as the bitmap is global, this will not be hard. > For the more important point see bellow, but with regard to reallocation, this means we need to at least use rcu_read_lock() in the fast path to avoid races between freeing the old bitmap and doing a read in progress. Granted, that is a light operation, but would it makes things so much more complicated just so that we save one memory page (assuming the range is the default [32000 64000] one). > Consider that if one user writes a port number which is beyond > the ip_local_port_range into ip_local_reserved_ports, we should > not accept this, because it doesn't make any sense. But with your > patch, we do. > I think it should be allowed. I see ip_local_reserved_ports and ip_local_range as independent settings that can be change at any time. That way I can flag port 8080 even if the current range is [32000, 64000] and then later I can expand the range to [1024, 64000] without loosing the 8080 reservation. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday 17 February 2010 18:39:28 you wrote: > Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit : > > I don't think so, if you want to avoid race condition, you just need to > > write the reserved ports before any networking application starts, IOW, > > as early as possible during boot. > > Sure, but I was thinking retrieving the list of reserved port by a > database query, using network :) > > Anyway, I just feel your argument is not applicable. > > Our kernel is capable of doing an intersection for us, we dont need > to forbid user to mark a port as 'reserved' if this port is already > blacklisted by another mechanism (for example, if this port is already > in use) > Also I believe that ip_local_port_range purpose is not to reserve *specific* ports. Changing this setting helps with things like increasing the port space for NAT or for a higher connection rate. We add the new option for reserving *specific* ports. So, even from a functional perspective, it makes more sense to me to keep them independent, as they serve different purposes. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Octavian Purdila wrote: > On Tuesday 16 February 2010 15:06:26 you wrote: >> Octavian Purdila wrote: >>> On Tuesday 16 February 2010 11:37:04 you wrote: >>>>> BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); >>>>> >>>>> + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); >>>>> + if (!sysctl_local_reserved_ports) >>>>> + goto out; >>>>> + >>>> I think we should also consider the ports in ip_local_port_range, >>>> since we can only reserve the ports in that range. >>> That is subject to changes at runtime, which means we will have to >>> readjust the bitmap at runtime which introduces the need for additional >>> synchronization operations which I would rather avoid. >> Why? As long as the bitmap is global, this will not be hard. >> > > For the more important point see bellow, but with regard to reallocation, this > means we need to at least use rcu_read_lock() in the fast path to avoid races > between freeing the old bitmap and doing a read in progress. > > Granted, that is a light operation, but would it makes things so much more > complicated just so that we save one memory page (assuming the range is the > default [32000 64000] one). Why not just allocate the bitmap for all ports? 65535/8 bytes are needed. > >> Consider that if one user writes a port number which is beyond >> the ip_local_port_range into ip_local_reserved_ports, we should >> not accept this, because it doesn't make any sense. But with your >> patch, we do. >> > > I think it should be allowed. I see ip_local_reserved_ports and ip_local_range > as independent settings that can be change at any time. According to the original purpose, they are not. > > That way I can flag port 8080 even if the current range is [32000, 64000] and > then later I can expand the range to [1024, 64000] without loosing the 8080 > reservation. Then its meaning is changed, bind(0) will never have chance to get 8080, thus reserving 8080 for this purpose fails. I want to always keep its original meaning, if the local_port_range goes out, then local_reserved_port should be empty at the same time, you have to reset it after changing local_port_range. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Dumazet wrote: > Le mardi 16 février 2010 à 21:06 +0800, Cong Wang a écrit : >> Octavian Purdila wrote: >>> On Tuesday 16 February 2010 11:37:04 you wrote: >>>>> BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); >>>>> >>>>> + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); >>>>> + if (!sysctl_local_reserved_ports) >>>>> + goto out; >>>>> + >>>> I think we should also consider the ports in ip_local_port_range, >>>> since we can only reserve the ports in that range. >>>> >>> That is subject to changes at runtime, which means we will have to readjust >>> the bitmap at runtime which introduces the need for additional synchronization >>> operations which I would rather avoid. >> Why? As long as the bitmap is global, this will not be hard. >> >> Consider that if one user writes a port number which is beyond >> the ip_local_port_range into ip_local_reserved_ports, we should >> not accept this, because it doesn't make any sense. But with your >> patch, we do. > > I disagree with you. This is perfectly OK. > > A port not being flagged in ip_local_reserved_ports doesnt mean it can > be used for allocation. > > If you want to really block ports from being used at boot, you could for > example : > > # temporarly reduce the ip_local_port_range > echo "61000 61001" >/proc/sys/net/ipv4/ip_local_port_range > # Build our bitmap (could be slow, if a remote database is read) > for port in $LIST_RESERVED_PORT > do > echo $port >/proc/sys/net/ipv4/ip_local_reserved_ports > done > echo "10000 61000" >/proc/sys/net/ipv4/ip_local_port_range > > I don't think so, if you want to avoid race condition, you just need to write the reserved ports before any networking application starts, IOW, as early as possible during boot. Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit : > I don't think so, if you want to avoid race condition, you just need to > write the reserved ports before any networking application starts, IOW, > as early as possible during boot. > Sure, but I was thinking retrieving the list of reserved port by a database query, using network :) Anyway, I just feel your argument is not applicable. Our kernel is capable of doing an intersection for us, we dont need to forbid user to mark a port as 'reserved' if this port is already blacklisted by another mechanism (for example, if this port is already in use) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Eric Dumazet wrote: > Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit : > >> I don't think so, if you want to avoid race condition, you just need to >> write the reserved ports before any networking application starts, IOW, >> as early as possible during boot. >> > > Sure, but I was thinking retrieving the list of reserved port by a > database query, using network :) > > Anyway, I just feel your argument is not applicable. > > Our kernel is capable of doing an intersection for us, we dont need > to forbid user to mark a port as 'reserved' if this port is already > blacklisted by another mechanism (for example, if this port is already > in use) > Oh, I see your points. But this still could make people confused, like me. I think we'd better document this. Thanks. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2dc7a1d..23be7a4 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -564,6 +564,18 @@ ip_local_port_range - 2 INTEGERS (i.e. by default) range 1024-4999 is enough to issue up to 2000 connections per second to systems supporting timestamps. +ip_local_reserved_ports - BITMAP of 65536 ports + Specify the ports which are reserved for known third-party + applications. These ports will not be used by automatic port assignments + (e.g. when calling connect() or bind() with port number 0). Explicit + port allocation behavior is unchanged. + + Reserving ports is done by writing positive numbers in this proc entry, + clearing them is done by writing negative numbers (e.g. 8080 reserves + port number, -8080 makes it available for automatic assignment again). + + Default: Empty + ip_nonlocal_bind - BOOLEAN If set, allows processes to bind() to non-local IP addresses, which can be quite useful - but may break some applications. diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cc9b594..8248fc6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1979,6 +1979,8 @@ retry: /* FIXME: add proper port randomization per like inet_csk_get_port */ do { ret = idr_get_new_above(ps, bind_list, next_port, &port); + if (inet_is_reserved_local_port(port)) + ret = -EAGAIN; } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) @@ -2997,10 +2999,13 @@ static int __init cma_init(void) { int ret, low, high, remaining; - get_random_bytes(&next_port, sizeof next_port); inet_get_local_port_range(&low, &high); +again: + get_random_bytes(&next_port, sizeof next_port); remaining = (high - low) + 1; next_port = ((unsigned int) next_port % remaining) + low; + if (inet_is_reserved_local_port(next_port)) + goto again; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) diff --git a/include/net/ip.h b/include/net/ip.h index fb63371..2e24256 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -184,6 +184,12 @@ extern struct local_ports { } sysctl_local_ports; extern void inet_get_local_port_range(int *low, int *high); +extern unsigned long *sysctl_local_reserved_ports; +static inline int inet_is_reserved_local_port(int port) +{ + return test_bit(port, sysctl_local_reserved_ports); +} + extern int sysctl_ip_default_ttl; extern int sysctl_ip_nonlocal_bind; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7d12c6a..06810b0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1546,9 +1546,13 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!sysctl_local_reserved_ports) + goto out; + rc = proto_register(&tcp_prot, 1); if (rc) - goto out; + goto out_free_reserved_ports; rc = proto_register(&udp_prot, 1); if (rc) @@ -1647,6 +1651,8 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); +out_free_reserved_ports: + kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8da6429..1acb462 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = { .range = { 32768, 61000 }, }; +unsigned long *sysctl_local_reserved_ports; +EXPORT_SYMBOL(sysctl_local_reserved_ports); + void inet_get_local_port_range(int *low, int *high) { unsigned seq; @@ -108,6 +111,8 @@ again: smallest_size = -1; do { + if (inet_is_reserved_local_port(rover)) + goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; spin_lock(&head->lock); @@ -130,6 +135,7 @@ again: break; next: spin_unlock(&head->lock); + next_nolock: if (++rover > high) rover = low; } while (--remaining > 0); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2b79377..d3e160a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; + if (inet_is_reserved_local_port(port)) + continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock(&head->lock); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e3712c..ce3597a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -298,6 +298,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = NULL, /* initialized in sysctl_ipv4_init */ + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_dobitmap, + }, #ifdef CONFIG_IP_MULTICAST { .procname = "igmp_max_memberships", @@ -721,6 +728,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; + struct ctl_table *i; + + for (i = ipv4_table; i->procname; i++) { + if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { + i->data = sysctl_local_reserved_ports; + break; + } + } + if (!i->procname[0]) + return -EINVAL; hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 608a544..bfd0a6a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -232,7 +232,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, */ do { if (low <= snum && snum <= high && - !test_bit(snum >> udptable->log, bitmap)) + !test_bit(snum >> udptable->log, bitmap) && + !inet_is_reserved_local_port(snum)) goto found; snum += rand; } while (snum != first); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f6d1e59..1f839d0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover++; if ((rover < low) || (rover > high)) rover = low; + if (inet_is_reserved_local_port(rover)) + continue; index = sctp_phashfn(rover); head = &sctp_port_hashtable[index]; sctp_spin_lock(&head->lock);