diff mbox series

xsk: add cq event

Message ID b18c1f2cfb0c9c0b409c25f4a73248e869c8ac97.1605513087.git.xuanzhuo@linux.alibaba.com
State Superseded
Headers show
Series xsk: add cq event | expand

Commit Message

Xuan Zhuo Nov. 16, 2020, 8:10 a.m. UTC
When we have written all cq items to tx, we have to wait for a new event
based on poll to indicate that tx is writable again. But the current
writability is triggered based only on whether tx is full or not; in fact,
even when tx is not full, the user may not be able to obtain a cq item,
because it may still be occupied by the network card. In this case, we need
to know when cq becomes available, so this patch adds a socket option: when
the user enables this option using setsockopt, a readable event is
generated for all xsk sockets bound to this umem whenever cq becomes available.

I cannot find a better description for this event; I think it can also be
called 'readable', although it is indeed different from the 'readable'
event for new data. In any case, the overhead of xsk checking whether
cq or rx is readable is small.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 include/net/xdp_sock.h      |  1 +
 include/uapi/linux/if_xdp.h |  1 +
 net/xdp/xsk.c               | 28 ++++++++++++++++++++++++++++
 3 files changed, 30 insertions(+)

Comments

Denis Kirjanov Nov. 16, 2020, 9:13 a.m. UTC | #1
On 11/16/20, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> When we write all cq items to tx, we have to wait for a new event based
> on poll to indicate that it is writable. But the current writability is
> triggered based on whether tx is full or not, and In fact, when tx is
> dissatisfied, the user of cq's item may not necessarily get it, because it
> may still be occupied by the network card. In this case, we need to know
> when cq is available, so this patch adds a socket option, When the user
> configures this option using setsockopt, when cq is available, a
> readable event is generated for all xsk bound to this umem.
>
> I can't find a better description of this event,
> I think it can also be 'readable', although it is indeed different from
> the 'readable' of the new data. But the overhead of xsk checking whether
> cq or rx is readable is small.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  include/net/xdp_sock.h      |  1 +
>  include/uapi/linux/if_xdp.h |  1 +
>  net/xdp/xsk.c               | 28 ++++++++++++++++++++++++++++
>  3 files changed, 30 insertions(+)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 1a9559c..faf5b1a 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -49,6 +49,7 @@ struct xdp_sock {
>  	struct xsk_buff_pool *pool;
>  	u16 queue_id;
>  	bool zc;
> +	bool cq_event;
>  	enum {
>  		XSK_READY = 0,
>  		XSK_BOUND,
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index a78a809..2dba3cb 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
>  #define XDP_UMEM_COMPLETION_RING	6
>  #define XDP_STATISTICS			7
>  #define XDP_OPTIONS			8
> +#define XDP_CQ_EVENT			9
>
>  struct xdp_umem_reg {
>  	__u64 addr; /* Start of packet data area */
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index cfbec39..0c53403 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>
>  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
>  {
> +	struct xdp_sock *xs;
> +
>  	xskq_prod_submit_n(pool->cq, nb_entries);
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
> +		if (xs->cq_event)
> +			sock_def_readable(&xs->sk);
> +	}
> +	rcu_read_unlock();
>  }
>  EXPORT_SYMBOL(xsk_tx_completed);
>
> @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
> socket *sock,
>  			__xsk_sendmsg(sk);
>  	}
>
> +	if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
> +		mask |= EPOLLIN | EPOLLRDNORM;
> +
>  	if (xs->rx && !xskq_prod_is_empty(xs->rx))
>  		mask |= EPOLLIN | EPOLLRDNORM;
>  	if (xs->tx && !xskq_cons_is_full(xs->tx))
> @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock, int
> level, int optname,
>  		mutex_unlock(&xs->mutex);
>  		return err;
>  	}
> +	case XDP_CQ_EVENT:
> +	{
> +		int cq_event;
> +
> +		if (optlen < sizeof(cq_event))
> +			return -EINVAL;
> +		if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
> +			return -EFAULT;
> +
> +		if (cq_event)
> +			xs->cq_event = true;
> +		else
> +			xs->cq_event = false;

It's false by default, isn't it?

> +
> +		return 0;
> +	}
>  	default:
>  		break;
>  	}
> --
> 1.8.3.1
>
>
Denis Kirjanov Nov. 16, 2020, 10:13 a.m. UTC | #2
On 11/16/20, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> On Mon, 16 Nov 2020 12:13:21 +0300, Denis Kirjanov <kda@linux-powerpc.org>
> wrote:
>> On 11/16/20, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>> > When we write all cq items to tx, we have to wait for a new event based
>> > on poll to indicate that it is writable. But the current writability is
>> > triggered based on whether tx is full or not, and In fact, when tx is
>> > dissatisfied, the user of cq's item may not necessarily get it, because
>> > it
>> > may still be occupied by the network card. In this case, we need to
>> > know
>> > when cq is available, so this patch adds a socket option, When the user
>> > configures this option using setsockopt, when cq is available, a
>> > readable event is generated for all xsk bound to this umem.
>> >
>> > I can't find a better description of this event,
>> > I think it can also be 'readable', although it is indeed different from
>> > the 'readable' of the new data. But the overhead of xsk checking
>> > whether
>> > cq or rx is readable is small.
>> >
>> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> > ---
>> >  include/net/xdp_sock.h      |  1 +
>> >  include/uapi/linux/if_xdp.h |  1 +
>> >  net/xdp/xsk.c               | 28 ++++++++++++++++++++++++++++
>> >  3 files changed, 30 insertions(+)
>> >
>> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>> > index 1a9559c..faf5b1a 100644
>> > --- a/include/net/xdp_sock.h
>> > +++ b/include/net/xdp_sock.h
>> > @@ -49,6 +49,7 @@ struct xdp_sock {
>> >  	struct xsk_buff_pool *pool;
>> >  	u16 queue_id;
>> >  	bool zc;
>> > +	bool cq_event;
>> >  	enum {
>> >  		XSK_READY = 0,
>> >  		XSK_BOUND,
>> > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>> > index a78a809..2dba3cb 100644
>> > --- a/include/uapi/linux/if_xdp.h
>> > +++ b/include/uapi/linux/if_xdp.h
>> > @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
>> >  #define XDP_UMEM_COMPLETION_RING	6
>> >  #define XDP_STATISTICS			7
>> >  #define XDP_OPTIONS			8
>> > +#define XDP_CQ_EVENT			9
>> >
>> >  struct xdp_umem_reg {
>> >  	__u64 addr; /* Start of packet data area */
>> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>> > index cfbec39..0c53403 100644
>> > --- a/net/xdp/xsk.c
>> > +++ b/net/xdp/xsk.c
>> > @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>> >
>> >  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
>> >  {
>> > +	struct xdp_sock *xs;
>> > +
>> >  	xskq_prod_submit_n(pool->cq, nb_entries);
>> > +
>> > +	rcu_read_lock();
>> > +	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
>> > +		if (xs->cq_event)
>> > +			sock_def_readable(&xs->sk);
>> > +	}
>> > +	rcu_read_unlock();
>> >  }
>> >  EXPORT_SYMBOL(xsk_tx_completed);
>> >
>> > @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
>> > socket *sock,
>> >  			__xsk_sendmsg(sk);
>> >  	}
>> >
>> > +	if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
>> > +		mask |= EPOLLIN | EPOLLRDNORM;
>> > +
>> >  	if (xs->rx && !xskq_prod_is_empty(xs->rx))
>> >  		mask |= EPOLLIN | EPOLLRDNORM;
>> >  	if (xs->tx && !xskq_cons_is_full(xs->tx))
>> > @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock, int
>> > level, int optname,
>> >  		mutex_unlock(&xs->mutex);
>> >  		return err;
>> >  	}
>> > +	case XDP_CQ_EVENT:
>> > +	{
>> > +		int cq_event;
>> > +
>> > +		if (optlen < sizeof(cq_event))
>> > +			return -EINVAL;
>> > +		if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
>> > +			return -EFAULT;
>> > +
>> > +		if (cq_event)
>> > +			xs->cq_event = true;
>> > +		else
>> > +			xs->cq_event = false;
>>
>> It's false by default, isn't it?
>
> I add cq_event inside "xdp_sock", that is got by sk_alloc, this call
> sk_prot_alloc by __GFP_ZERO. So I think it is false.

Right, I meant that what's the point to set it explicitly to 'false'?

>
> Thanks.
>
>>
>> > +
>> > +		return 0;
>> > +	}
>> >  	default:
>> >  		break;
>> >  	}
>> > --
>> > 1.8.3.1
>> >
>> >
>
Denis Kirjanov Nov. 16, 2020, 10:21 a.m. UTC | #3
On 11/16/20, Denis Kirjanov <kda@linux-powerpc.org> wrote:
> On 11/16/20, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>> On Mon, 16 Nov 2020 12:13:21 +0300, Denis Kirjanov
>> <kda@linux-powerpc.org>
>> wrote:
>>> On 11/16/20, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>> > When we write all cq items to tx, we have to wait for a new event
>>> > based
>>> > on poll to indicate that it is writable. But the current writability
>>> > is
>>> > triggered based on whether tx is full or not, and In fact, when tx is
>>> > dissatisfied, the user of cq's item may not necessarily get it,
>>> > because
>>> > it
>>> > may still be occupied by the network card. In this case, we need to
>>> > know
>>> > when cq is available, so this patch adds a socket option, When the
>>> > user
>>> > configures this option using setsockopt, when cq is available, a
>>> > readable event is generated for all xsk bound to this umem.
>>> >
>>> > I can't find a better description of this event,
>>> > I think it can also be 'readable', although it is indeed different
>>> > from
>>> > the 'readable' of the new data. But the overhead of xsk checking
>>> > whether
>>> > cq or rx is readable is small.
>>> >
>>> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>>> > ---
>>> >  include/net/xdp_sock.h      |  1 +
>>> >  include/uapi/linux/if_xdp.h |  1 +
>>> >  net/xdp/xsk.c               | 28 ++++++++++++++++++++++++++++
>>> >  3 files changed, 30 insertions(+)
>>> >
>>> > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>>> > index 1a9559c..faf5b1a 100644
>>> > --- a/include/net/xdp_sock.h
>>> > +++ b/include/net/xdp_sock.h
>>> > @@ -49,6 +49,7 @@ struct xdp_sock {
>>> >  	struct xsk_buff_pool *pool;
>>> >  	u16 queue_id;
>>> >  	bool zc;
>>> > +	bool cq_event;
>>> >  	enum {
>>> >  		XSK_READY = 0,
>>> >  		XSK_BOUND,
>>> > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>>> > index a78a809..2dba3cb 100644
>>> > --- a/include/uapi/linux/if_xdp.h
>>> > +++ b/include/uapi/linux/if_xdp.h
>>> > @@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
>>> >  #define XDP_UMEM_COMPLETION_RING	6
>>> >  #define XDP_STATISTICS			7
>>> >  #define XDP_OPTIONS			8
>>> > +#define XDP_CQ_EVENT			9
>>> >
>>> >  struct xdp_umem_reg {
>>> >  	__u64 addr; /* Start of packet data area */
>>> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>> > index cfbec39..0c53403 100644
>>> > --- a/net/xdp/xsk.c
>>> > +++ b/net/xdp/xsk.c
>>> > @@ -285,7 +285,16 @@ void __xsk_map_flush(void)
>>> >
>>> >  void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
>>> >  {
>>> > +	struct xdp_sock *xs;
>>> > +
>>> >  	xskq_prod_submit_n(pool->cq, nb_entries);
>>> > +
>>> > +	rcu_read_lock();
>>> > +	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
>>> > +		if (xs->cq_event)
>>> > +			sock_def_readable(&xs->sk);
>>> > +	}
>>> > +	rcu_read_unlock();
>>> >  }
>>> >  EXPORT_SYMBOL(xsk_tx_completed);
>>> >
>>> > @@ -495,6 +504,9 @@ static __poll_t xsk_poll(struct file *file, struct
>>> > socket *sock,
>>> >  			__xsk_sendmsg(sk);
>>> >  	}
>>> >
>>> > +	if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
>>> > +		mask |= EPOLLIN | EPOLLRDNORM;
>>> > +
>>> >  	if (xs->rx && !xskq_prod_is_empty(xs->rx))
>>> >  		mask |= EPOLLIN | EPOLLRDNORM;
>>> >  	if (xs->tx && !xskq_cons_is_full(xs->tx))
>>> > @@ -882,6 +894,22 @@ static int xsk_setsockopt(struct socket *sock,
>>> > int
>>> > level, int optname,
>>> >  		mutex_unlock(&xs->mutex);
>>> >  		return err;
>>> >  	}
>>> > +	case XDP_CQ_EVENT:
>>> > +	{
>>> > +		int cq_event;
>>> > +
>>> > +		if (optlen < sizeof(cq_event))
>>> > +			return -EINVAL;
>>> > +		if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
>>> > +			return -EFAULT;
>>> > +
>>> > +		if (cq_event)
>>> > +			xs->cq_event = true;
>>> > +		else
>>> > +			xs->cq_event = false;
>>>
>>> It's false by default, isn't it?
>>
>> I add cq_event inside "xdp_sock", that is got by sk_alloc, this call
>> sk_prot_alloc by __GFP_ZERO. So I think it is false.
>
> Right, I meant that what's the point to set it explicitly to 'false'?

Nevermind, It's okay.

>
>>
>> Thanks.
>>
>>>
>>> > +
>>> > +		return 0;
>>> > +	}
>>> >  	default:
>>> >  		break;
>>> >  	}
>>> > --
>>> > 1.8.3.1
>>> >
>>> >
>>
>
Björn Töpel Nov. 16, 2020, 2:31 p.m. UTC | #4
On 2020-11-16 09:10, Xuan Zhuo wrote:
> When we write all cq items to tx, we have to wait for a new event based
> on poll to indicate that it is writable. But the current writability is
> triggered based on whether tx is full or not, and In fact, when tx is
> dissatisfied, the user of cq's item may not necessarily get it, because it
> may still be occupied by the network card. In this case, we need to know
> when cq is available, so this patch adds a socket option, When the user
> configures this option using setsockopt, when cq is available, a
> readable event is generated for all xsk bound to this umem.
> 
> I can't find a better description of this event,
> I think it can also be 'readable', although it is indeed different from
> the 'readable' of the new data. But the overhead of xsk checking whether
> cq or rx is readable is small.
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

Thanks for the patch!

I'm not a fan of having two different "readable" event (both Rx and cq).
Could you explain a bit what the use case is, so I get a better
understanding.

The Tx queues has a back-pressure mechanism, determined of the number of
elements in cq. Is it related to that?

Please explain a bit more what you're trying to solve, and maybe we can
figure out a better way forward!


Thanks!
Björn
diff mbox series

Patch

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 1a9559c..faf5b1a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -49,6 +49,7 @@  struct xdp_sock {
 	struct xsk_buff_pool *pool;
 	u16 queue_id;
 	bool zc;
+	bool cq_event;
 	enum {
 		XSK_READY = 0,
 		XSK_BOUND,
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a809..2dba3cb 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -63,6 +63,7 @@  struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_CQ_EVENT			9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cfbec39..0c53403 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -285,7 +285,16 @@  void __xsk_map_flush(void)
 
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
+	struct xdp_sock *xs;
+
 	xskq_prod_submit_n(pool->cq, nb_entries);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
+		if (xs->cq_event)
+			sock_def_readable(&xs->sk);
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(xsk_tx_completed);
 
@@ -495,6 +504,9 @@  static __poll_t xsk_poll(struct file *file, struct socket *sock,
 			__xsk_sendmsg(sk);
 	}
 
+	if (xs->cq_event && pool->cq && !xskq_prod_is_empty(pool->cq))
+		mask |= EPOLLIN | EPOLLRDNORM;
+
 	if (xs->rx && !xskq_prod_is_empty(xs->rx))
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (xs->tx && !xskq_cons_is_full(xs->tx))
@@ -882,6 +894,22 @@  static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
+	case XDP_CQ_EVENT:
+	{
+		int cq_event;
+
+		if (optlen < sizeof(cq_event))
+			return -EINVAL;
+		if (copy_from_sockptr(&cq_event, optval, sizeof(cq_event)))
+			return -EFAULT;
+
+		if (cq_event)
+			xs->cq_event = true;
+		else
+			xs->cq_event = false;
+
+		return 0;
+	}
 	default:
 		break;
 	}