diff mbox

[net-next,3/3] net/sched: Introduce act_iptunnel

Message ID 20160822143834.32422-4-amir@vadai.me
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Amir Vadai Aug. 22, 2016, 2:38 p.m. UTC
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from a such a device

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
iptunnel action and it's arguments:

$ filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action iptunnel encap \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 include/net/tc_act/tc_iptunnel.h        |  24 +++
 include/uapi/linux/tc_act/tc_iptunnel.h |  40 +++++
 net/sched/Kconfig                       |  11 ++
 net/sched/Makefile                      |   1 +
 net/sched/act_iptunnel.c                | 292 ++++++++++++++++++++++++++++++++
 5 files changed, 368 insertions(+)
 create mode 100644 include/net/tc_act/tc_iptunnel.h
 create mode 100644 include/uapi/linux/tc_act/tc_iptunnel.h
 create mode 100644 net/sched/act_iptunnel.c

Comments

Jiri Benc Aug. 22, 2016, 5:07 p.m. UTC | #1
On Mon, 22 Aug 2016 17:38:34 +0300, Amir Vadai wrote:
> This action could be used before redirecting packets to a shared tunnel
> device, or when redirecting packets arriving from a such a device
> 
> The action will release the metadata created by the tunnel device
> (decap), or set the metadata with the specified values for encap
> operation.

I understand the motivation for the decap action. However, what would
happen if someone does not include it?

Borrowing your example from the cover letter and modifying it, what
would happen in the following case?

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
  	  enc_src_ip 11.11.0.2 \
  		enc_dst_ip 11.11.0.1 \
  		enc_key_id 11 \
  	  action mirred egress redirect dev vnet0

Thanks,

 Jiri
Shmulik Ladkani Aug. 22, 2016, 5:57 p.m. UTC | #2
Hi,

On Mon, 22 Aug 2016 17:38:34 +0300 Amir Vadai <amir@vadai.me> wrote:
> +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> +					   __be32 saddr, __be32 daddr,
> +					   __be64 key_id)
> +{
> +	struct ip_tunnel_info *tun_info;
> +	struct metadata_dst *metadata;
> +
> +	metadata = metadata_dst_alloc(0, GFP_KERNEL);
> +	if (!metadata)
> +		return ERR_PTR(-ENOMEM);
> +
> +	tun_info = &metadata->u.tun_info;
> +	tun_info->mode = IP_TUNNEL_INFO_TX;
> 
> +	ip_tunnel_key_init(&tun_info->key, saddr, daddr, 0, 0, 0, 0, 0,
> +			   key_id, 0);

Seems key.tun_flags should be armed with TUNNEL_KEY.
This will make things work with GRE as well.
Pass it in the 'tun_flags' parameter.

> +
> +	return metadata;
> +}
> +
> +static int tcf_iptunnel_init(struct net *net, struct nlattr *nla,
> +			     struct nlattr *est, struct tc_action **a,
> +			     int ovr, int bind)
> +{
> +	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
> +	struct nlattr *tb[TCA_IPTUNNEL_MAX + 1];
> +	struct metadata_dst *metadata;
> +	struct tc_iptunnel *parm;
> +	struct tcf_iptunnel *t;
> +	__be32 saddr = 0;
> +	__be32 daddr = 0;
> +	__be64 key_id = 0;
> +	int encapdecap;
> +	bool exists = false;
> +	int ret = -EINVAL;
> +	int err;
> +
> +	if (!nla)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested(tb, TCA_IPTUNNEL_MAX, nla, iptunnel_policy);
> +	if (err < 0)
> +		return err;
> +
> +	if (!tb[TCA_IPTUNNEL_PARMS])
> +		return -EINVAL;
> +	parm = nla_data(tb[TCA_IPTUNNEL_PARMS]);
> +	exists = tcf_hash_check(tn, parm->index, a, bind);
> +	if (exists && bind)
> +		return 0;
> +
> +	encapdecap = parm->t_action;
> +
> +	switch (encapdecap) {
> +	case TCA_IPTUNNEL_ACT_DECAP:
> +		break;
> +	case TCA_IPTUNNEL_ACT_ENCAP:
> +		if (tb[TCA_IPTUNNEL_ENC_IPV4_SRC])
> +			saddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_SRC]);
> +		if (tb[TCA_IPTUNNEL_ENC_IPV4_DST])
> +			daddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_DST]);
> +		if (tb[TCA_IPTUNNEL_ENC_KEY_ID])
> +			key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_IPTUNNEL_ENC_KEY_ID]));
> +
> +		if (!saddr || !daddr || !key_id) {

A zero tunnel ID is legit.

> +			ret = -EINVAL;
> +			goto err_out;
> +		}
> +
> +		metadata = iptunnel_alloc(t, saddr, daddr, key_id);
> +		if (IS_ERR(metadata)) {
> +			ret = PTR_ERR(metadata);
> +			goto err_out;
> +		}
> +
> +		break;
> +	default:
> +		goto err_out;
> +	}
> +
> +	if (!exists) {
> +		ret = tcf_hash_create(tn, parm->index, est, a,
> +				      &act_iptunnel_ops, bind, false);
> +		if (ret)
> +			return ret;
> +
> +		ret = ACT_P_CREATED;
> +	} else {
> +		tcf_hash_release(*a, bind);
> +		if (!ovr)
> +			return -EEXIST;
> +	}
> +
> +	t = to_iptunnel(*a);
> +
> +	spin_lock_bh(&t->tcf_lock);
> +
> +	t->tcf_action = parm->action;
> +
> +	t->tcft_action = encapdecap;
> +	t->tcft_enc_metadata = metadata;

Although tcft_enc_metadata is not used in TCA_IPTUNNEL_ACT_DECAP, still
prefer to nullify it instead of initializing it to stack junk.

> +
> +	spin_unlock_bh(&t->tcf_lock);
> +
> +	if (ret == ACT_P_CREATED)
> +		tcf_hash_insert(tn, *a);
> +
> +	return ret;

In the (exists && ovr) case, 'ret' seems to be left as '-EINVAL' as was
initialized. Initialize 'ret' to zero instead.

> +
> +err_out:
> +	if (exists)
> +		tcf_hash_release(*a, bind);
> +	return ret;
> +}
> +
Or Gerlitz Aug. 22, 2016, 6:15 p.m. UTC | #3
On Mon, Aug 22, 2016 at 5:38 PM, Amir Vadai <amir@vadai.me> wrote:
> This action could be used before redirecting packets to a shared tunnel
> device, or when redirecting packets arriving from a such a device

nit, add period


> The action will release the metadata created by the tunnel device
> (decap), or set the metadata with the specified values for encap
> operation.

Jiri B. -- SB pointer to the code


> +static int tcf_iptunnel(struct sk_buff *skb, const struct tc_action *a,
> +                       struct tcf_result *res)
> +{
> +       struct tcf_iptunnel *t = to_iptunnel(a);
> +       int action;
> +
> +       spin_lock(&t->tcf_lock);
> +       tcf_lastuse_update(&t->tcf_tm);
> +       bstats_update(&t->tcf_bstats, skb);
> +       action = t->tcf_action;
> +
> +       switch (t->tcft_action) {
> +       case TCA_IPTUNNEL_ACT_DECAP:
> +               skb_dst_set_noref(skb, NULL);
> +               break;
> +       case TCA_IPTUNNEL_ACT_ENCAP:
> +               skb_dst_set_noref(skb, &t->tcft_enc_metadata->dst);

Jiri B > I understand the motivation for the decap action. However, what would
Jiri B > happen if someone does not include it?

The MD set by the (say) vxlan device will not be "consumed" (cleared)
and would be keep travelling with the SKB

> +
> +               break;
> +       default:
> +               BUG();

no, please, scream if you want to go beyond warning but don't kill and don't die

> +       }
> +
> +       spin_unlock(&t->tcf_lock);
> +       return action;
> +}
Jiri Benc Aug. 22, 2016, 6:51 p.m. UTC | #4
On Mon, 22 Aug 2016 21:15:41 +0300, Or Gerlitz wrote:
> Jiri B > I understand the motivation for the decap action. However, what would
> Jiri B > happen if someone does not include it?
> 
> The MD set by the (say) vxlan device will not be "consumed" (cleared)
> and would be keep travelling with the SKB

Of course it would. That's not what I meant by the question :-)

There are three options:

1. It does not matter, as the metadata_dst will be freed anyway before
   it reaches tx path. This means we do not need the 'decap' action.

2. We may run into problems like tx path seeing the metadata_dst that
   it should not see. This means either this situation or such
   configuration must be prevented somehow.

3. The metadata_dst can reach the tx path but it doesn't matter, as it
   would just mean the packet is encapsulated into the same outer
   headers it was received with or the metadata_dst would be ignored
   (for non-tunnel interfaces).

Which one is it? Quickly looking into the code, tcf_mirred calls
dev_queue_xmit which indicates it's either 2 or 3. If it's 3., it
should be explained in the patch description (especially the non-tunnel
interface case) and documented.

 Jiri
Amir Vadai Aug. 23, 2016, 8:42 a.m. UTC | #5
On Mon, Aug 22, 2016 at 08:57:06PM +0300, Shmulik Ladkani wrote:
> Hi,
> 
> On Mon, 22 Aug 2016 17:38:34 +0300 Amir Vadai <amir@vadai.me> wrote:
> > +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> > +					   __be32 saddr, __be32 daddr,
> > +					   __be64 key_id)
> > +{
> > +	struct ip_tunnel_info *tun_info;
> > +	struct metadata_dst *metadata;
> > +
> > +	metadata = metadata_dst_alloc(0, GFP_KERNEL);
> > +	if (!metadata)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	tun_info = &metadata->u.tun_info;
> > +	tun_info->mode = IP_TUNNEL_INFO_TX;
> > 
> > +	ip_tunnel_key_init(&tun_info->key, saddr, daddr, 0, 0, 0, 0, 0,
> > +			   key_id, 0);
> 
> Seems key.tun_flags should be armed with TUNNEL_KEY.
> This will make things work with GRE as well.
> Pass it in the 'tun_flags' parameter.
ack

> 
> > +
> > +	return metadata;
> > +}
> > +
> > +static int tcf_iptunnel_init(struct net *net, struct nlattr *nla,
> > +			     struct nlattr *est, struct tc_action **a,
> > +			     int ovr, int bind)
> > +{
> > +	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
> > +	struct nlattr *tb[TCA_IPTUNNEL_MAX + 1];
> > +	struct metadata_dst *metadata;
> > +	struct tc_iptunnel *parm;
> > +	struct tcf_iptunnel *t;
> > +	__be32 saddr = 0;
> > +	__be32 daddr = 0;
> > +	__be64 key_id = 0;
> > +	int encapdecap;
> > +	bool exists = false;
> > +	int ret = -EINVAL;
> > +	int err;
> > +
> > +	if (!nla)
> > +		return -EINVAL;
> > +
> > +	err = nla_parse_nested(tb, TCA_IPTUNNEL_MAX, nla, iptunnel_policy);
> > +	if (err < 0)
> > +		return err;
> > +
> > +	if (!tb[TCA_IPTUNNEL_PARMS])
> > +		return -EINVAL;
> > +	parm = nla_data(tb[TCA_IPTUNNEL_PARMS]);
> > +	exists = tcf_hash_check(tn, parm->index, a, bind);
> > +	if (exists && bind)
> > +		return 0;
> > +
> > +	encapdecap = parm->t_action;
> > +
> > +	switch (encapdecap) {
> > +	case TCA_IPTUNNEL_ACT_DECAP:
> > +		break;
> > +	case TCA_IPTUNNEL_ACT_ENCAP:
> > +		if (tb[TCA_IPTUNNEL_ENC_IPV4_SRC])
> > +			saddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_SRC]);
> > +		if (tb[TCA_IPTUNNEL_ENC_IPV4_DST])
> > +			daddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_DST]);
> > +		if (tb[TCA_IPTUNNEL_ENC_KEY_ID])
> > +			key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_IPTUNNEL_ENC_KEY_ID]));
> > +
> > +		if (!saddr || !daddr || !key_id) {
> 
> A zero tunnel ID is legit.
ack

> 
> > +			ret = -EINVAL;
> > +			goto err_out;
> > +		}
> > +
> > +		metadata = iptunnel_alloc(t, saddr, daddr, key_id);
> > +		if (IS_ERR(metadata)) {
> > +			ret = PTR_ERR(metadata);
> > +			goto err_out;
> > +		}
> > +
> > +		break;
> > +	default:
> > +		goto err_out;
> > +	}
> > +
> > +	if (!exists) {
> > +		ret = tcf_hash_create(tn, parm->index, est, a,
> > +				      &act_iptunnel_ops, bind, false);
> > +		if (ret)
> > +			return ret;
> > +
> > +		ret = ACT_P_CREATED;
> > +	} else {
> > +		tcf_hash_release(*a, bind);
> > +		if (!ovr)
> > +			return -EEXIST;
> > +	}
> > +
> > +	t = to_iptunnel(*a);
> > +
> > +	spin_lock_bh(&t->tcf_lock);
> > +
> > +	t->tcf_action = parm->action;
> > +
> > +	t->tcft_action = encapdecap;
> > +	t->tcft_enc_metadata = metadata;
> 
> Although tcft_enc_metadata is not used in TCA_IPTUNNEL_ACT_DECAP, still
> prefer to nullify it instead of initializing it to stack junk.
good catch. strange that the compiler/sparse didn't catch it

> 
> > +
> > +	spin_unlock_bh(&t->tcf_lock);
> > +
> > +	if (ret == ACT_P_CREATED)
> > +		tcf_hash_insert(tn, *a);
> > +
> > +	return ret;
> 
> In the (exists && ovr) case, 'ret' seems to be left as '-EINVAL' as was
> initialized. Initialize 'ret' to zero instead.
another good catch - thanks.

> 
> > +
> > +err_out:
> > +	if (exists)
> > +		tcf_hash_release(*a, bind);
> > +	return ret;
> > +}
> > +
> 
>
Jamal Hadi Salim Aug. 23, 2016, 12:37 p.m. UTC | #6
On 16-08-22 10:38 AM, Amir Vadai wrote:
> This action could be used before redirecting packets to a shared tunnel
> device, or when redirecting packets arriving from a such a device
>
> The action will release the metadata created by the tunnel device
> (decap), or set the metadata with the specified values for encap
> operation.
>
> For example, the following flower filter will forward all ICMP packets
> destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
> redirecting, a metadata for the vxlan tunnel is created using the
> iptunnel action and it's arguments:
>
> $ filter add dev net0 protocol ip parent ffff: \
>     flower \
>       ip_proto 1 \
>       dst_ip 11.11.11.2 \
>     action iptunnel encap \
>       src_ip 11.11.0.1 \
>       dst_ip 11.11.0.2 \
>       id 11 \
>     action mirred egress redirect dev vxlan0

The noun "ip tunnel" is a little misleading. Unless you can use
this for other types of tunnels (ipip, etc). If this is specific
for just vxlan and metadata setting then some name like vxlanmeta
or something else that signifies both metadata de/encap + vxlan would
be helpful.


> +static int tcf_iptunnel(struct sk_buff *skb, const struct tc_action *a,
> +			struct tcf_result *res)

Can you rename this function to something more grep-able
like tcf_iptunnel_run or tcf_iptunnel_exec?
You already have a data structure called tcf_iptunnel

> +{
> +	struct tcf_iptunnel *t = to_iptunnel(a);
> +	int action;
> +
> +	spin_lock(&t->tcf_lock);
> +	tcf_lastuse_update(&t->tcf_tm);
> +	bstats_update(&t->tcf_bstats, skb);
> +	action = t->tcf_action;
> +
> +	switch (t->tcft_action) {
> +	case TCA_IPTUNNEL_ACT_DECAP:
> +		skb_dst_set_noref(skb, NULL);
> +		break;


So the real decap is going to be at the vxlan dev?


> +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> +					   __be32 saddr, __be32 daddr,
> +					   __be64 key_id)
> +{
> +	struct ip_tunnel_info *tun_info;
> +	struct metadata_dst *metadata;
> +
> +	metadata = metadata_dst_alloc(0, GFP_KERNEL);
> +	if (!metadata)
> +		return ERR_PTR(-ENOMEM);
> +
> +	tun_info = &metadata->u.tun_info;
> +	tun_info->mode = IP_TUNNEL_INFO_TX;
> +

More grep-ability tun_info sounds and i think is used by tun netdev.

Otherwise looks good (although i still think this wouldve scaled better
if you didnt depend on presence of vxlan dev).

cheers,
jamal
Amir Vadai Aug. 23, 2016, 3:28 p.m. UTC | #7
On Mon, Aug 22, 2016 at 08:51:37PM +0200, Jiri Benc wrote:
> On Mon, 22 Aug 2016 21:15:41 +0300, Or Gerlitz wrote:
> > Jiri B > I understand the motivation for the decap action. However, what would
> > Jiri B > happen if someone does not include it?
> > 
> > The MD set by the (say) vxlan device will not be "consumed" (cleared)
> > and would be keep travelling with the SKB
> 
> Of course it would. That's not what I meant by the question :-)
> 
> There are three options:
> 
> 1. It does not matter, as the metadata_dst will be freed anyway before
>    it reaches tx path. This means we do not need the 'decap' action.
> 
> 2. We may run into problems like tx path seeing the metadata_dst that
>    it should not see. This means either this situation or such
>    configuration must be prevented somehow.
> 
> 3. The metadata_dst can reach the tx path but it doesn't matter, as it
>    would just mean the packet is encapsulated into the same outer
>    headers it was received with or the metadata_dst would be ignored
>    (for non-tunnel interfaces).
> 
> Which one is it? Quickly looking into the code, tcf_mirred calls
> dev_queue_xmit which indicates it's either 2 or 3. If it's 3., it
> should be explained in the patch description (especially the non-tunnel
> interface case) and documented.
First, as you suspected it is (2) or (3). AFAIK the skb is injected by
act_mirred as is, with the metadata into the tx path.
I couldn't find a case where having the metadata on the skb matters.
Still, I would be very happy to hear what other people have to say about
it.

Anyway, this issue is orthogonal to this patchset...


> 
>  Jiri
Jiri Benc Aug. 23, 2016, 3:33 p.m. UTC | #8
On Tue, 23 Aug 2016 18:28:05 +0300, Amir Vadai wrote:
> On Mon, Aug 22, 2016 at 08:51:37PM +0200, Jiri Benc wrote:
> > 2. We may run into problems like tx path seeing the metadata_dst that
> >    it should not see. This means either this situation or such
> >    configuration must be prevented somehow.
[...]
> Anyway, this issue is orthogonal to this patchset...

Not really. If it's indeed (2) then such configuration needs to be
rejected. Or metadata_dst freed at an appropriate place. Thus it's
something that needs to be handled by this patchset before the uAPI is
set in stone.

 Jiri
Amir Vadai Aug. 23, 2016, 4:05 p.m. UTC | #9
On Tue, Aug 23, 2016 at 05:33:49PM +0200, Jiri Benc wrote:
> On Tue, 23 Aug 2016 18:28:05 +0300, Amir Vadai wrote:
> > On Mon, Aug 22, 2016 at 08:51:37PM +0200, Jiri Benc wrote:
> > > 2. We may run into problems like tx path seeing the metadata_dst that
> > >    it should not see. This means either this situation or such
> > >    configuration must be prevented somehow.
> [...]
> > Anyway, this issue is orthogonal to this patchset...
> 
> Not really. If it's indeed (2) then such configuration needs to be
> rejected. 
The configuration that needs to be rejected is when act_iptunnel is not
used. So, I guess the fix won't be part of it...

> Or metadata_dst freed at an appropriate place. Thus it's
> something that needs to be handled by this patchset before the uAPI is
> set in stone.
It is already there - user can use act_mirred and redirect skb's with
metadata since shared tunnel devices introduced.
The only thing that was added here, is to enable the user to drop the
metadata, which I think we agree is the ok.

But I agree with you, that I must understand the life cycle of the metadata and dst
better. I will try to understand it better and explain/fix accordingly.

Again, would be happy if someone will chime in and give some hints if it
was a bug, that a user could redirect skb's with metadata, or something
harmless.

Thanks,
Amir
> 
>  Jiri
Jiri Benc Aug. 23, 2016, 4:15 p.m. UTC | #10
On Tue, 23 Aug 2016 19:05:37 +0300, Amir Vadai wrote:
> It is already there - user can use act_mirred and redirect skb's with
> metadata since shared tunnel devices introduced.

You're right, I haven't thought of that.

> The only thing that was added here, is to enable the user to drop the
> metadata, which I think we agree is the ok.

Absolutely. It would be even better if the metadata could be dropped
automatically but I don't see any good place to do it.

> But I agree with you, that I must understand the life cycle of the metadata and dst
> better. I will try to understand it better and explain/fix accordingly.

Thanks!

 Jiri
Amir Vadai Aug. 23, 2016, 4:21 p.m. UTC | #11
On Tue, Aug 23, 2016 at 08:37:07AM -0400, Jamal Hadi Salim wrote:
> On 16-08-22 10:38 AM, Amir Vadai wrote:
> > This action could be used before redirecting packets to a shared tunnel
> > device, or when redirecting packets arriving from a such a device
> > 
> > The action will release the metadata created by the tunnel device
> > (decap), or set the metadata with the specified values for encap
> > operation.
> > 
> > For example, the following flower filter will forward all ICMP packets
> > destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
> > redirecting, a metadata for the vxlan tunnel is created using the
> > iptunnel action and it's arguments:
> > 
> > $ filter add dev net0 protocol ip parent ffff: \
> >     flower \
> >       ip_proto 1 \
> >       dst_ip 11.11.11.2 \
> >     action iptunnel encap \
> >       src_ip 11.11.0.1 \
> >       dst_ip 11.11.0.2 \
> >       id 11 \
> >     action mirred egress redirect dev vxlan0
> 
> The noun "ip tunnel" is a little misleading. Unless you can use
> this for other types of tunnels (ipip, etc). If this is specific
> for just vxlan and metadata setting then some name like vxlanmeta
> or something else that signifies both metadata de/encap + vxlan would
> be helpful.
Yeh, this name is not the best...
The action is not vxlan specific, it should be good for all the
ip tunnel interfaces that use metadata for the outer headers.
I will rename it to something like mdtunnel - unless someone has a
better suggestion.

> 
> 
> > +static int tcf_iptunnel(struct sk_buff *skb, const struct tc_action *a,
> > +			struct tcf_result *res)
> 
> Can you rename this function to something more grep-able
> like tcf_iptunnel_run or tcf_iptunnel_exec?
> You already have a data structure called tcf_iptunnel
ack

> 
> > +{
> > +	struct tcf_iptunnel *t = to_iptunnel(a);
> > +	int action;
> > +
> > +	spin_lock(&t->tcf_lock);
> > +	tcf_lastuse_update(&t->tcf_tm);
> > +	bstats_update(&t->tcf_bstats, skb);
> > +	action = t->tcf_action;
> > +
> > +	switch (t->tcft_action) {
> > +	case TCA_IPTUNNEL_ACT_DECAP:
> > +		skb_dst_set_noref(skb, NULL);
> > +		break;
> 
> 
> So the real decap is going to be at the vxlan dev?
yes, the action here will just cleanup after the tunnel device will peel
off the outer headers and place it in the metadata. This metadata was
used by the classifier and now can be released.

> 
> 
> > +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> > +					   __be32 saddr, __be32 daddr,
> > +					   __be64 key_id)
> > +{
> > +	struct ip_tunnel_info *tun_info;
> > +	struct metadata_dst *metadata;
> > +
> > +	metadata = metadata_dst_alloc(0, GFP_KERNEL);
> > +	if (!metadata)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	tun_info = &metadata->u.tun_info;
> > +	tun_info->mode = IP_TUNNEL_INFO_TX;
> > +
> 
> More grep-ability tun_info sounds and i think is used by tun netdev.
ack

> 
> Otherwise looks good (although i still think this wouldve scaled better
> if you didnt depend on presence of vxlan dev).
now I start to have some regrets :)
But I don't see a good enough reason to duplicate code, since I can't
point my finger on a performance problem with using the existing code.

Thanks,
Amir

> 
> cheers,
> jamal
Shmulik Ladkani Aug. 23, 2016, 6:59 p.m. UTC | #12
Hi,

On Tue, 23 Aug 2016 19:21:41 +0300 Amir Vadai <amir@vadai.me> wrote:
> On Tue, Aug 23, 2016 at 08:37:07AM -0400, Jamal Hadi Salim wrote:
> > 
> > The noun "ip tunnel" is a little misleading. Unless you can use
> > this for other types of tunnels (ipip, etc). If this is specific
> > for just vxlan and metadata setting then some name like vxlanmeta
> > or something else that signifies both metadata de/encap + vxlan would
> > be helpful.  
> Yeh, this name is not the best...
> The action is not vxlan specific, it should be good for all the
> ip tunnel interfaces that use metadata for the outer headers.
> I will rename it to something like mdtunnel - unless someone has a
> better suggestion.

Well, in bpf we have BPF_FUNC_skb_set_tunnel_key.

How about "action tunnel_key" as the noun?
Another decent alternative might be "action tunnel_info".

Not sure about the verbs though.. "action tunnel_key set/unset"?

Regards,
Shmulik
diff mbox

Patch

diff --git a/include/net/tc_act/tc_iptunnel.h b/include/net/tc_act/tc_iptunnel.h
new file mode 100644
index 000000000000..a325081478e7
--- /dev/null
+++ b/include/net/tc_act/tc_iptunnel.h
@@ -0,0 +1,24 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NET_TC_IPTUNNEL_H
+#define __NET_TC_IPTUNNEL_H
+
+#include <net/act_api.h>
+
+struct tcf_iptunnel {
+	struct tc_action	common;
+	int			tcft_action;
+	struct metadata_dst     *tcft_enc_metadata;
+};
+
+#define to_iptunnel(a) ((struct tcf_iptunnel *)a)
+
+#endif /* __NET_TC_IPTUNNEL_H */
+
diff --git a/include/uapi/linux/tc_act/tc_iptunnel.h b/include/uapi/linux/tc_act/tc_iptunnel.h
new file mode 100644
index 000000000000..a9b688c1f28b
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_iptunnel.h
@@ -0,0 +1,40 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_IPTUNNEL_H
+#define __LINUX_TC_IPTUNNEL_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_IPTUNNEL 17
+
+#define TCA_IPTUNNEL_ACT_ENCAP	1
+#define TCA_IPTUNNEL_ACT_DECAP	2
+
+struct tc_iptunnel {
+	tc_gen;
+	int t_action;
+};
+
+enum {
+	TCA_IPTUNNEL_UNSPEC,
+	TCA_IPTUNNEL_TM,
+	TCA_IPTUNNEL_PARMS,
+	TCA_IPTUNNEL_ENC_IPV4_SRC,	/* be32 */
+	TCA_IPTUNNEL_ENC_IPV4_DST,	/* be32 */
+	TCA_IPTUNNEL_ENC_KEY_ID,	/* be64 */
+	TCA_IPTUNNEL_PAD,
+	__TCA_IPTUNNEL_MAX,
+};
+
+#define TCA_IPTUNNEL_MAX (__TCA_IPTUNNEL_MAX - 1)
+
+#endif
+
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..a8a5ac4edb2e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -761,6 +761,17 @@  config NET_ACT_IFE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_ife.
 
+config NET_ACT_IPTUNNEL
+        tristate "IP tunnel manipulation"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to set/release ip tunnel metadata.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_tunnel.
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..c1287b95b574 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -22,6 +22,7 @@  obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
+obj-$(CONFIG_NET_ACT_IPTUNNEL)	+= act_iptunnel.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_iptunnel.c b/net/sched/act_iptunnel.c
new file mode 100644
index 000000000000..37640bd11b62
--- /dev/null
+++ b/net/sched/act_iptunnel.c
@@ -0,0 +1,292 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+#include <linux/tc_act/tc_iptunnel.h>
+#include <net/tc_act/tc_iptunnel.h>
+
+#define IPTUNNEL_TAB_MASK     15
+
+static int iptunnel_net_id;
+static struct tc_action_ops act_iptunnel_ops;
+
+static int tcf_iptunnel(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	struct tcf_iptunnel *t = to_iptunnel(a);
+	int action;
+
+	spin_lock(&t->tcf_lock);
+	tcf_lastuse_update(&t->tcf_tm);
+	bstats_update(&t->tcf_bstats, skb);
+	action = t->tcf_action;
+
+	switch (t->tcft_action) {
+	case TCA_IPTUNNEL_ACT_DECAP:
+		skb_dst_set_noref(skb, NULL);
+		break;
+	case TCA_IPTUNNEL_ACT_ENCAP:
+		skb_dst_set_noref(skb, &t->tcft_enc_metadata->dst);
+
+		break;
+	default:
+		BUG();
+	}
+
+	spin_unlock(&t->tcf_lock);
+	return action;
+}
+
+static const struct nla_policy iptunnel_policy[TCA_IPTUNNEL_MAX + 1] = {
+	[TCA_IPTUNNEL_PARMS]	    = { .len = sizeof(struct tc_iptunnel) },
+	[TCA_IPTUNNEL_ENC_IPV4_SRC] = { .type = NLA_U32 },
+	[TCA_IPTUNNEL_ENC_IPV4_DST] = { .type = NLA_U32 },
+	[TCA_IPTUNNEL_ENC_KEY_ID]   = { .type = NLA_U32 },
+};
+
+static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
+					   __be32 saddr, __be32 daddr,
+					   __be64 key_id)
+{
+	struct ip_tunnel_info *tun_info;
+	struct metadata_dst *metadata;
+
+	metadata = metadata_dst_alloc(0, GFP_KERNEL);
+	if (!metadata)
+		return ERR_PTR(-ENOMEM);
+
+	tun_info = &metadata->u.tun_info;
+	tun_info->mode = IP_TUNNEL_INFO_TX;
+
+	ip_tunnel_key_init(&tun_info->key, saddr, daddr, 0, 0, 0, 0, 0,
+			   key_id, 0);
+
+	return metadata;
+}
+
+static int tcf_iptunnel_init(struct net *net, struct nlattr *nla,
+			     struct nlattr *est, struct tc_action **a,
+			     int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
+	struct nlattr *tb[TCA_IPTUNNEL_MAX + 1];
+	struct metadata_dst *metadata;
+	struct tc_iptunnel *parm;
+	struct tcf_iptunnel *t;
+	__be32 saddr = 0;
+	__be32 daddr = 0;
+	__be64 key_id = 0;
+	int encapdecap;
+	bool exists = false;
+	int ret = -EINVAL;
+	int err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_IPTUNNEL_MAX, nla, iptunnel_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_IPTUNNEL_PARMS])
+		return -EINVAL;
+	parm = nla_data(tb[TCA_IPTUNNEL_PARMS]);
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	encapdecap = parm->t_action;
+
+	switch (encapdecap) {
+	case TCA_IPTUNNEL_ACT_DECAP:
+		break;
+	case TCA_IPTUNNEL_ACT_ENCAP:
+		if (tb[TCA_IPTUNNEL_ENC_IPV4_SRC])
+			saddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_SRC]);
+		if (tb[TCA_IPTUNNEL_ENC_IPV4_DST])
+			daddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_DST]);
+		if (tb[TCA_IPTUNNEL_ENC_KEY_ID])
+			key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_IPTUNNEL_ENC_KEY_ID]));
+
+		if (!saddr || !daddr || !key_id) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		metadata = iptunnel_alloc(t, saddr, daddr, key_id);
+		if (IS_ERR(metadata)) {
+			ret = PTR_ERR(metadata);
+			goto err_out;
+		}
+
+		break;
+	default:
+		goto err_out;
+	}
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_iptunnel_ops, bind, false);
+		if (ret)
+			return ret;
+
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	t = to_iptunnel(*a);
+
+	spin_lock_bh(&t->tcf_lock);
+
+	t->tcf_action = parm->action;
+
+	t->tcft_action = encapdecap;
+	t->tcft_enc_metadata = metadata;
+
+	spin_unlock_bh(&t->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+
+	return ret;
+
+err_out:
+	if (exists)
+		tcf_hash_release(*a, bind);
+	return ret;
+}
+
+static void tcf_iptunnel_release(struct tc_action *a, int bind)
+{
+	struct tcf_iptunnel *t = to_iptunnel(a);
+
+	if (t->tcft_action == TCA_IPTUNNEL_ACT_ENCAP)
+		dst_release(&t->tcft_enc_metadata->dst);
+}
+
+static int tcf_iptunnel_dump(struct sk_buff *skb, struct tc_action *a,
+			     int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_iptunnel *t = to_iptunnel(a);
+	struct tc_iptunnel opt = {
+		.index    = t->tcf_index,
+		.refcnt   = t->tcf_refcnt - ref,
+		.bindcnt  = t->tcf_bindcnt - bind,
+		.action   = t->tcf_action,
+		.t_action = t->tcft_action,
+	};
+	struct tcf_t tm;
+
+	if (nla_put(skb, TCA_IPTUNNEL_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (t->tcft_action == TCA_IPTUNNEL_ACT_ENCAP) {
+		struct ip_tunnel_key *key =
+			&t->tcft_enc_metadata->u.tun_info.key;
+		__be32 saddr = key->u.ipv4.src;
+		__be32 daddr = key->u.ipv4.dst;
+		__be32 key_id = tunnel_id_to_key32(key->tun_id);
+
+		if (nla_put_be32(skb, TCA_IPTUNNEL_ENC_IPV4_SRC, saddr) ||
+		    nla_put_be32(skb, TCA_IPTUNNEL_ENC_IPV4_DST, daddr) ||
+		    nla_put_be32(skb, TCA_IPTUNNEL_ENC_KEY_ID, key_id))
+			goto nla_put_failure;
+	}
+
+	tcf_tm_dump(&tm, &t->tcf_tm);
+	if (nla_put_64bit(skb, TCA_IPTUNNEL_TM, sizeof(tm), &tm, TCA_IPTUNNEL_PAD))
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_iptunnel_walker(struct net *net, struct sk_buff *skb,
+			       struct netlink_callback *cb, int type,
+			       const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_iptunnel_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_iptunnel_ops = {
+	.kind		=	"iptunnel",
+	.type		=	TCA_ACT_IPTUNNEL,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_iptunnel,
+	.dump		=	tcf_iptunnel_dump,
+	.init		=	tcf_iptunnel_init,
+	.cleanup	=	tcf_iptunnel_release,
+	.walk		=	tcf_iptunnel_walker,
+	.lookup		=	tcf_iptunnel_search,
+	.size		=	sizeof(struct tcf_iptunnel),
+};
+
+static __net_init int iptunnel_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
+
+	return tc_action_net_init(tn, &act_iptunnel_ops, IPTUNNEL_TAB_MASK);
+}
+
+static void __net_exit iptunnel_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations iptunnel_net_ops = {
+	.init = iptunnel_init_net,
+	.exit = iptunnel_exit_net,
+	.id   = &iptunnel_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init iptunnel_init_module(void)
+{
+	return tcf_register_action(&act_iptunnel_ops, &iptunnel_net_ops);
+}
+
+static void __exit iptunnel_cleanup_module(void)
+{
+	tcf_unregister_action(&act_iptunnel_ops, &iptunnel_net_ops);
+}
+
+module_init(iptunnel_init_module);
+module_exit(iptunnel_cleanup_module);
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");