diff mbox

[net-next,v2,2/2] xfrm: configure policy hash table thresholds by netlink

Message ID 1406884348-12423-3-git-send-email-christophe.gouault@6wind.com
State Awaiting Upstream, archived
Delegated to: David Miller
Headers show

Commit Message

Christophe Gouault Aug. 1, 2014, 9:12 a.m. UTC
Allow local and remote prefix length thresholds for the policy hash
table to be specified via a netlink XFRM_MSG_NEWSPDINFO message.

Prefix length thresholds are specified by the optional
XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH attributes
(struct xfrmu_spdhthresh).

example:

    struct xfrmu_spdhthresh thresh4 = {
        .lbits = 0,
        .rbits = 24,
    };
    struct xfrmu_spdhthresh thresh6 = {
        .lbits = 0,
        .rbits = 56,
    };
    struct nlmsghdr *hdr;
    struct nl_msg *msg;

    msg = nlmsg_alloc();
    hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, XFRM_MSG_NEWSPDINFO, sizeof(__u32), NLM_F_REQUEST);
    nla_put(msg, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4);
    nla_put(msg, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6);
    nl_send_auto(sk, msg);

The numbers are the minimum policy selector prefix lengths required
for a policy to be put in the hash table.

- lbits is the local threshold (source address for out policies,
  destination address for in and fwd policies).

- rbits is the remote threshold (destination address for out
  policies, source address for in and fwd policies).

The default values are:

XFRMA_SPD_IPV4_HTHRESH: 32 32
XFRMA_SPD_IPV6_HTHRESH: 128 128

The SPD is dynamically rebuilt when the threshold values are
changed.

The kernel replies to XFRM_MSG_GETSPDINFO and XFRM_MSG_NEWSPDINFO
requests with an XFRM_MSG_NEWSPDINFO message carrying both the
XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH attributes.

Signed-off-by: Christophe Gouault <christophe.gouault@6wind.com>
---
v2:
- use netlink instead of /proc
---
 include/net/netns/xfrm.h  | 10 ++++++
 include/net/xfrm.h        |  1 +
 include/uapi/linux/xfrm.h |  7 ++++
 net/xfrm/xfrm_policy.c    | 90 ++++++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_user.c      | 91 +++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 196 insertions(+), 3 deletions(-)

Comments

Christophe Gouault Aug. 1, 2014, 1:01 p.m. UTC | #1
This patchset is provided in order to test the kernel patchset
"[net-next v2 0/2] xfrm: scalability enhancements for policy database"
for those who would like to play with these new knobs.

Please note that I will be on vacation starting next week, so I will
not be very responsive to comments during August.

Best Regards,
Christophe
---
 include/linux/xfrm.h |   7 +++++
 ip/xfrm_policy.c     | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 109 insertions(+), 4 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steffen Klassert Aug. 21, 2014, 6:09 a.m. UTC | #2
On Fri, Aug 01, 2014 at 11:12:28AM +0200, Christophe Gouault wrote:
> diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
> index 41902a8..9da7982 100644
> --- a/include/net/netns/xfrm.h
> +++ b/include/net/netns/xfrm.h
> @@ -19,6 +19,15 @@ struct xfrm_policy_hash {
>  	u8			sbits6;
>  };
>  
> +struct xfrm_policy_hthresh {
> +	struct work_struct	work;
> +	seqlock_t		lock;

This newly introduced lock is not initialized. It triggers an
inconsistent lock state warning when acquired for the first time.

>  
> +static void xfrm_hash_rebuild(struct work_struct *work)
> +{
> +	struct net *net = container_of(work, struct net,
> +				       xfrm.policy_hthresh.work);
> +	unsigned int hmask;
> +	struct xfrm_policy *pol;
> +	struct xfrm_policy *policy;
> +	struct hlist_head *chain;
> +	struct hlist_head *odst;
> +	struct hlist_node *newpos;
> +	int i;
> +	int dir;
> +	unsigned seq;
> +	u8 lbits4, rbits4, lbits6, rbits6;
> +
> +	mutex_lock(&hash_resize_mutex);
> +
> +	/* read selector prefixlen thresholds */
> +	do {
> +		seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);
> +
> +		lbits4 = net->xfrm.policy_hthresh.lbits4;
> +		rbits4 = net->xfrm.policy_hthresh.rbits4;
> +		lbits6 = net->xfrm.policy_hthresh.lbits6;
> +		rbits6 = net->xfrm.policy_hthresh.rbits6;
> +	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
> +
> +	write_lock_bh(&net->xfrm.xfrm_policy_lock);
> +
> +	pr_info("rebuilding SPD hash table: thresholds (%u,%u)(%u,%u)\n",
> +		lbits4, rbits4, lbits6, rbits6);

Do we really need to print this?

> +
> +	/* reset the bydst and inexact table in all directions */
> +	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
> +		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
> +		hmask = net->xfrm.policy_bydst[dir].hmask;
> +		odst = net->xfrm.policy_bydst[dir].table;
> +		for (i = hmask; i >= 0; i--)
> +			INIT_HLIST_HEAD(odst + i);
> +		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
> +			/* dir out => dst = remote, src = local */
> +			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
> +			net->xfrm.policy_bydst[dir].sbits4 = lbits4;
> +			net->xfrm.policy_bydst[dir].dbits6 = rbits6;
> +			net->xfrm.policy_bydst[dir].sbits6 = lbits6;
> +		} else {
> +			/* dir in/fwd => dst = local, src = remote */
> +			net->xfrm.policy_bydst[dir].dbits4 = lbits4;
> +			net->xfrm.policy_bydst[dir].sbits4 = rbits4;
> +			net->xfrm.policy_bydst[dir].dbits6 = lbits6;
> +			net->xfrm.policy_bydst[dir].sbits6 = rbits6;
> +		}
> +	}
> +
> +	/* re-insert all policies by order of creation */
> +	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
> +		newpos = NULL;
> +		chain = policy_hash_bysel(net, &policy->selector,
> +					  policy->family,
> +					  xfrm_policy_id2dir(policy->index));
> +		hlist_for_each_entry(pol, chain, bydst) {
> +			if (policy->priority >= pol->priority)
> +				newpos = &pol->bydst;
> +			else
> +				break;
> +		}
> +		if (newpos)
> +			hlist_add_after(newpos, &policy->bydst);

hlist_add_after() does not exist any more, it was replaced by
hlist_add_behind() recently.

>  
> +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
> +			    struct nlattr **attrs)
> +{
> +	struct net *net = sock_net(skb->sk);
> +	struct sk_buff *r_skb;
> +	u32 *flags = nlmsg_data(nlh);
> +	u32 sportid = NETLINK_CB(skb).portid;
> +	u32 seq = nlh->nlmsg_seq;
> +	struct xfrmu_spdhthresh *thresh4 = NULL;
> +	struct xfrmu_spdhthresh *thresh6 = NULL;
> +
> +	/* selector prefixlen thresholds to hash policies */
> +	if (attrs[XFRMA_SPD_IPV4_HTHRESH]) {
> +		struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH];
> +
> +		if (nla_len(rta) < sizeof(*thresh4))
> +			return -EINVAL;
> +		thresh4 = nla_data(rta);
> +		if (thresh4->lbits > 32 || thresh4->rbits > 32)
> +			return -EINVAL;
> +	}
> +	if (attrs[XFRMA_SPD_IPV6_HTHRESH]) {
> +		struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH];
> +
> +		if (nla_len(rta) < sizeof(*thresh6))
> +			return -EINVAL;
> +		thresh6 = nla_data(rta);
> +		if (thresh6->lbits > 128 || thresh6->rbits > 128)
> +			return -EINVAL;
> +	}
> +
> +	if (thresh4 || thresh6) {
> +		write_seqlock(&net->xfrm.policy_hthresh.lock);
> +		if (thresh4) {
> +			net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
> +			net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
> +		}
> +		if (thresh6) {
> +			net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
> +			net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
> +		}
> +		write_sequnlock(&net->xfrm.policy_hthresh.lock);
> +
> +		xfrm_policy_hash_rebuild(net);
> +	}
> +
> +	r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
> +	if (r_skb == NULL)
> +		return -ENOMEM;
> +
> +	if (build_spdinfo(r_skb, net, sportid, seq, *flags) < 0)
> +		BUG();
> +
> +	return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);

Why do you send this information to userspace? This is a set
operation, not a get.


The rest looks quite good, thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christophe Gouault Aug. 26, 2014, 7:27 a.m. UTC | #3
2014-08-21 8:09 GMT+02:00 Steffen Klassert <steffen.klassert@secunet.com>:
> On Fri, Aug 01, 2014 at 11:12:28AM +0200, Christophe Gouault wrote:
>> diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
>> index 41902a8..9da7982 100644
>> --- a/include/net/netns/xfrm.h
>> +++ b/include/net/netns/xfrm.h
>> @@ -19,6 +19,15 @@ struct xfrm_policy_hash {
>>       u8                      sbits6;
>>  };
>>
>> +struct xfrm_policy_hthresh {
>> +     struct work_struct      work;
>> +     seqlock_t               lock;
>
> This newly introduced lock is not initialized. It triggers an
> inconsistent lock state warning when acquired for the first time.

oops! I'll fix that.

>> +     pr_info("rebuilding SPD hash table: thresholds (%u,%u)(%u,%u)\n",
>> +             lbits4, rbits4, lbits6, rbits6);
>
> Do we really need to print this?

No, it's not necessary, I will remove it.

>> +             hlist_for_each_entry(pol, chain, bydst) {
>> +                     if (policy->priority >= pol->priority)
>> +                             newpos = &pol->bydst;
>> +                     else
>> +                             break;
>> +             }
>> +             if (newpos)
>> +                     hlist_add_after(newpos, &policy->bydst);
>
> hlist_add_after() does not exist any more, it was replaced by
> hlist_add_behind() recently.

OK, I'll update the code accordingly.

>> +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
>> +                         struct nlattr **attrs)
>> +{
>> +     struct net *net = sock_net(skb->sk);
>> +     struct sk_buff *r_skb;
>> +     u32 *flags = nlmsg_data(nlh);
>> +     u32 sportid = NETLINK_CB(skb).portid;
>> +     u32 seq = nlh->nlmsg_seq;
>> +     struct xfrmu_spdhthresh *thresh4 = NULL;
>> +     struct xfrmu_spdhthresh *thresh6 = NULL;
>> +
>> +     /* selector prefixlen thresholds to hash policies */
>> +     if (attrs[XFRMA_SPD_IPV4_HTHRESH]) {
>> +             struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH];
>> +
>> +             if (nla_len(rta) < sizeof(*thresh4))
>> +                     return -EINVAL;
>> +             thresh4 = nla_data(rta);
>> +             if (thresh4->lbits > 32 || thresh4->rbits > 32)
>> +                     return -EINVAL;
>> +     }
>> +     if (attrs[XFRMA_SPD_IPV6_HTHRESH]) {
>> +             struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH];
>> +
>> +             if (nla_len(rta) < sizeof(*thresh6))
>> +                     return -EINVAL;
>> +             thresh6 = nla_data(rta);
>> +             if (thresh6->lbits > 128 || thresh6->rbits > 128)
>> +                     return -EINVAL;
>> +     }
>> +
>> +     if (thresh4 || thresh6) {
>> +             write_seqlock(&net->xfrm.policy_hthresh.lock);
>> +             if (thresh4) {
>> +                     net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
>> +                     net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
>> +             }
>> +             if (thresh6) {
>> +                     net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
>> +                     net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
>> +             }
>> +             write_sequnlock(&net->xfrm.policy_hthresh.lock);
>> +
>> +             xfrm_policy_hash_rebuild(net);
>> +     }
>> +
>> +     r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
>> +     if (r_skb == NULL)
>> +             return -ENOMEM;
>> +
>> +     if (build_spdinfo(r_skb, net, sportid, seq, *flags) < 0)
>> +             BUG();
>> +
>> +     return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
>
> Why do you send this information to userspace? This is a set
> operation, not a get.

You're right, I'll remove this reply message.

> The rest looks quite good, thanks!

Thanks. I'll send an update.

Christophe
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christophe Gouault Aug. 27, 2014, 3:48 p.m. UTC | #4
This patchset makes it possible to hash more policies than just
non-prefixed ones: policies whose prefix lengths are greater than or
equal to configurable thresholds are hashed as well.

These thresholds are configured via netlink message
XFRM_MSG_NEWSPDINFO, attributes XFRMA_SPD_IPV4_HTHRESH and
XFRMA_SPD_IPV6_HTHRESH.

The related iproute2 patch for configuring the thresholds is available
on demand.

Best Regards,
Christophe
----
v2:
- change configuration API from proc to netlink
v3:
- initialize xfrm_policy_hthresh lock
- remove "rebuilding SPD hash table" log
- replace deprecated hlist_add_after by hlist_add_behind
- remove netlink reply to XFRM_MSG_NEWSPDINFO request
---
 include/net/netns/xfrm.h  |  14 +++++++
 include/net/xfrm.h        |   1 +
 include/uapi/linux/xfrm.h |   7 ++++
 net/xfrm/xfrm_hash.h      |  76 +++++++++++++++++++++++++++++++-----
 net/xfrm/xfrm_policy.c    | 140 +++++++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_user.c      |  83 +++++++++++++++++++++++++++++++++++++--
 6 files changed, 302 insertions(+), 19 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 41902a8..9da7982 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -19,6 +19,15 @@  struct xfrm_policy_hash {
 	u8			sbits6;
 };
 
+struct xfrm_policy_hthresh {
+	struct work_struct	work;
+	seqlock_t		lock;
+	u8			lbits4;
+	u8			rbits4;
+	u8			lbits6;
+	u8			rbits6;
+};
+
 struct netns_xfrm {
 	struct list_head	state_all;
 	/*
@@ -45,6 +54,7 @@  struct netns_xfrm {
 	struct xfrm_policy_hash	policy_bydst[XFRM_POLICY_MAX * 2];
 	unsigned int		policy_count[XFRM_POLICY_MAX * 2];
 	struct work_struct	policy_hash_work;
+	struct xfrm_policy_hthresh policy_hthresh;
 
 
 	struct sock		*nlsk;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 721e9c3..dc4865e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1591,6 +1591,7 @@  struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark,
 struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir,
 				     u32 id, int delete, int *err);
 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
+void xfrm_policy_hash_rebuild(struct net *net);
 u32 xfrm_get_acqseq(void);
 int verify_spi_info(u8 proto, u32 min, u32 max);
 int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 25e5dd9..02d5125 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -328,6 +328,8 @@  enum xfrm_spdattr_type_t {
 	XFRMA_SPD_UNSPEC,
 	XFRMA_SPD_INFO,
 	XFRMA_SPD_HINFO,
+	XFRMA_SPD_IPV4_HTHRESH,
+	XFRMA_SPD_IPV6_HTHRESH,
 	__XFRMA_SPD_MAX
 
 #define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1)
@@ -347,6 +349,11 @@  struct xfrmu_spdhinfo {
 	__u32 spdhmcnt;
 };
 
+struct xfrmu_spdhthresh {
+	__u8 lbits;
+	__u8 rbits;
+};
+
 struct xfrm_usersa_info {
 	struct xfrm_selector		sel;
 	struct xfrm_id			id;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 312828c..c7d7a7e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -13,6 +13,8 @@ 
  *
  */
 
+#define pr_fmt(fmt) "IPsec: " fmt
+
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
@@ -566,6 +568,89 @@  static void xfrm_hash_resize(struct work_struct *work)
 	mutex_unlock(&hash_resize_mutex);
 }
 
+static void xfrm_hash_rebuild(struct work_struct *work)
+{
+	struct net *net = container_of(work, struct net,
+				       xfrm.policy_hthresh.work);
+	unsigned int hmask;
+	struct xfrm_policy *pol;
+	struct xfrm_policy *policy;
+	struct hlist_head *chain;
+	struct hlist_head *odst;
+	struct hlist_node *newpos;
+	int i;
+	int dir;
+	unsigned seq;
+	u8 lbits4, rbits4, lbits6, rbits6;
+
+	mutex_lock(&hash_resize_mutex);
+
+	/* read selector prefixlen thresholds */
+	do {
+		seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);
+
+		lbits4 = net->xfrm.policy_hthresh.lbits4;
+		rbits4 = net->xfrm.policy_hthresh.rbits4;
+		lbits6 = net->xfrm.policy_hthresh.lbits6;
+		rbits6 = net->xfrm.policy_hthresh.rbits6;
+	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
+
+	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+	pr_info("rebuilding SPD hash table: thresholds (%u,%u)(%u,%u)\n",
+		lbits4, rbits4, lbits6, rbits6);
+
+	/* reset the bydst and inexact table in all directions */
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+		hmask = net->xfrm.policy_bydst[dir].hmask;
+		odst = net->xfrm.policy_bydst[dir].table;
+		for (i = hmask; i >= 0; i--)
+			INIT_HLIST_HEAD(odst + i);
+		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+			/* dir out => dst = remote, src = local */
+			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
+			net->xfrm.policy_bydst[dir].sbits4 = lbits4;
+			net->xfrm.policy_bydst[dir].dbits6 = rbits6;
+			net->xfrm.policy_bydst[dir].sbits6 = lbits6;
+		} else {
+			/* dir in/fwd => dst = local, src = remote */
+			net->xfrm.policy_bydst[dir].dbits4 = lbits4;
+			net->xfrm.policy_bydst[dir].sbits4 = rbits4;
+			net->xfrm.policy_bydst[dir].dbits6 = lbits6;
+			net->xfrm.policy_bydst[dir].sbits6 = rbits6;
+		}
+	}
+
+	/* re-insert all policies by order of creation */
+	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		newpos = NULL;
+		chain = policy_hash_bysel(net, &policy->selector,
+					  policy->family,
+					  xfrm_policy_id2dir(policy->index));
+		hlist_for_each_entry(pol, chain, bydst) {
+			if (policy->priority >= pol->priority)
+				newpos = &pol->bydst;
+			else
+				break;
+		}
+		if (newpos)
+			hlist_add_after(newpos, &policy->bydst);
+		else
+			hlist_add_head(&policy->bydst, chain);
+	}
+
+	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+	mutex_unlock(&hash_resize_mutex);
+}
+
+void xfrm_policy_hash_rebuild(struct net *net)
+{
+	schedule_work(&net->xfrm.policy_hthresh.work);
+}
+EXPORT_SYMBOL(xfrm_policy_hash_rebuild);
+
 /* Generate new index... KAME seems to generate them ordered by cost
  * of an absolute inpredictability of ordering of rules. This will not pass. */
 static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
@@ -2870,9 +2955,14 @@  static int __net_init xfrm_policy_init(struct net *net)
 		htab->dbits6 = 128;
 		htab->sbits6 = 128;
 	}
+	net->xfrm.policy_hthresh.lbits4 = 32;
+	net->xfrm.policy_hthresh.rbits4 = 32;
+	net->xfrm.policy_hthresh.lbits6 = 128;
+	net->xfrm.policy_hthresh.rbits6 = 128;
 
 	INIT_LIST_HEAD(&net->xfrm.policy_all);
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
+	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
 	if (net_eq(net, &init_net))
 		register_netdevice_notifier(&xfrm_dev_notifier);
 	return 0;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 412d9dc..a3549fa 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -965,7 +965,9 @@  static inline size_t xfrm_spdinfo_msgsize(void)
 {
 	return NLMSG_ALIGN(4)
 	       + nla_total_size(sizeof(struct xfrmu_spdinfo))
-	       + nla_total_size(sizeof(struct xfrmu_spdhinfo));
+	       + nla_total_size(sizeof(struct xfrmu_spdhinfo))
+	       + nla_total_size(sizeof(struct xfrmu_spdhthresh))
+	       + nla_total_size(sizeof(struct xfrmu_spdhthresh));
 }
 
 static int build_spdinfo(struct sk_buff *skb, struct net *net,
@@ -974,9 +976,11 @@  static int build_spdinfo(struct sk_buff *skb, struct net *net,
 	struct xfrmk_spdinfo si;
 	struct xfrmu_spdinfo spc;
 	struct xfrmu_spdhinfo sph;
+	struct xfrmu_spdhthresh spt4, spt6;
 	struct nlmsghdr *nlh;
 	int err;
 	u32 *f;
+	unsigned lseq;
 
 	nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
 	if (nlh == NULL) /* shouldn't really happen ... */
@@ -994,9 +998,22 @@  static int build_spdinfo(struct sk_buff *skb, struct net *net,
 	sph.spdhcnt = si.spdhcnt;
 	sph.spdhmcnt = si.spdhmcnt;
 
+	do {
+		lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock);
+
+		spt4.lbits = net->xfrm.policy_hthresh.lbits4;
+		spt4.rbits = net->xfrm.policy_hthresh.rbits4;
+		spt6.lbits = net->xfrm.policy_hthresh.lbits6;
+		spt6.rbits = net->xfrm.policy_hthresh.rbits6;
+	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq));
+
 	err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc);
 	if (!err)
 		err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph);
+	if (!err)
+		err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4);
+	if (!err)
+		err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
@@ -1005,6 +1022,62 @@  static int build_spdinfo(struct sk_buff *skb, struct net *net,
 	return nlmsg_end(skb, nlh);
 }
 
+static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct sk_buff *r_skb;
+	u32 *flags = nlmsg_data(nlh);
+	u32 sportid = NETLINK_CB(skb).portid;
+	u32 seq = nlh->nlmsg_seq;
+	struct xfrmu_spdhthresh *thresh4 = NULL;
+	struct xfrmu_spdhthresh *thresh6 = NULL;
+
+	/* selector prefixlen thresholds to hash policies */
+	if (attrs[XFRMA_SPD_IPV4_HTHRESH]) {
+		struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH];
+
+		if (nla_len(rta) < sizeof(*thresh4))
+			return -EINVAL;
+		thresh4 = nla_data(rta);
+		if (thresh4->lbits > 32 || thresh4->rbits > 32)
+			return -EINVAL;
+	}
+	if (attrs[XFRMA_SPD_IPV6_HTHRESH]) {
+		struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH];
+
+		if (nla_len(rta) < sizeof(*thresh6))
+			return -EINVAL;
+		thresh6 = nla_data(rta);
+		if (thresh6->lbits > 128 || thresh6->rbits > 128)
+			return -EINVAL;
+	}
+
+	if (thresh4 || thresh6) {
+		write_seqlock(&net->xfrm.policy_hthresh.lock);
+		if (thresh4) {
+			net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
+			net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
+		}
+		if (thresh6) {
+			net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
+			net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
+		}
+		write_sequnlock(&net->xfrm.policy_hthresh.lock);
+
+		xfrm_policy_hash_rebuild(net);
+	}
+
+	r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
+	if (r_skb == NULL)
+		return -ENOMEM;
+
+	if (build_spdinfo(r_skb, net, sportid, seq, *flags) < 0)
+		BUG();
+
+	return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
+}
+
 static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
 		struct nlattr **attrs)
 {
@@ -2275,6 +2348,7 @@  static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
 	[XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
 	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
 	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
+	[XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
 	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
 };
 
@@ -2309,10 +2383,17 @@  static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
 };
 
+static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
+	[XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
+	[XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
+};
+
 static const struct xfrm_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
+	const struct nla_policy *nla_pol;
+	int nla_max;
 } xfrm_dispatch[XFRM_NR_MSGTYPES] = {
 	[XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
 	[XFRM_MSG_DELSA       - XFRM_MSG_BASE] = { .doit = xfrm_del_sa        },
@@ -2336,6 +2417,9 @@  static const struct xfrm_link {
 	[XFRM_MSG_GETAE       - XFRM_MSG_BASE] = { .doit = xfrm_get_ae  },
 	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate    },
 	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo   },
+	[XFRM_MSG_NEWSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo,
+						   .nla_pol = xfrma_spd_policy,
+						   .nla_max = XFRMA_SPD_MAX },
 	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
 };
 
@@ -2372,8 +2456,9 @@  static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 	}
 
-	err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX,
-			  xfrma_policy);
+	err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs,
+			  link->nla_max ? : XFRMA_MAX,
+			  link->nla_pol ? : xfrma_policy);
 	if (err < 0)
 		return err;