Message ID | 1264537819.24933.122.camel@w-sridhar.beaverton.ibm.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, 27 Jan 2010 07:00:19 am Sridhar Samudrala wrote: > This patch adds GSO/checksum offload to af_packet sockets using > virtio_net_hdr. Based on Rusty's patch to add this support to tun. > It allows GSO/checksum offload to be enabled when using raw socket > backend with virtio_net. > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the > receive path and process/skip virtio_net_hdr in the send path. This > option is only allowed with SOCK_RAW sockets attached to ethernet > type devices. This is really MST's baby, so I'm going to defer to his wisdom on this... Cheers, Rusty. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote: > This patch adds GSO/checksum offload to af_packet sockets using > virtio_net_hdr. Based on Rusty's patch to add this support to tun. > It allows GSO/checksum offload to be enabled when using raw socket > backend with virtio_net. > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the > receive path and process/skip virtio_net_hdr in the send path. This > option is only allowed with SOCK_RAW sockets attached to ethernet > type devices. > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> So the main issue with this implemenation is that it silently fails for non-ethernet protocols. It would be better to detect unsupported protocols and return an error to user. This is same issue that was pointed out by DaveM with my earlier attempt to solve a different (related) problem: http://lkml.org/lkml/2010/1/5/474 For an incomplete prototype attempting to solve the issue in a generic way: http://lkml.org/lkml/2010/1/6/56 A couple of additional comments below. > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > index 4021d47..aa57a5f 100644 > --- a/include/linux/if_packet.h > +++ b/include/linux/if_packet.h > @@ -46,6 +46,7 @@ struct sockaddr_ll { > #define PACKET_RESERVE 12 > #define PACKET_TX_RING 13 > #define PACKET_LOSS 14 > +#define PACKET_VNET_HDR 15 > > struct tpacket_stats { > unsigned int tp_packets; > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c > index 53633c5..36d5360 100644 > --- a/net/packet/af_packet.c > +++ b/net/packet/af_packet.c > @@ -80,6 +80,8 @@ > #include <linux/init.h> > #include <linux/mutex.h> > #include <linux/if_vlan.h> > +#include <linux/virtio_net.h> > +#include <linux/if_arp.h> > > #ifdef CONFIG_INET > #include <net/inet_common.h> > @@ -193,7 +195,8 @@ struct packet_sock { > struct mutex pg_vec_lock; > unsigned int running:1, /* prot_hook is attached*/ > auxdata:1, > - origdev:1; > + origdev:1, > + vnet_hdr:1; > int ifindex; /* bound device */ > __be16 num; > struct packet_mclist *mclist; > @@ -1056,6 +1059,30 @@ out: > } > #endif > > +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, > + size_t reserve, size_t len, > + size_t linear, int noblock, > + int *err) > +{ > + struct sk_buff *skb; > + > + /* Under a page? Don't bother with paged skb. */ > + if (prepad + len < PAGE_SIZE || !linear) > + linear = len; > + > + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, > + err); > + if (!skb) > + return NULL; > + > + skb_reserve(skb, reserve); > + skb_put(skb, linear); > + skb->data_len = len - linear; > + skb->len += len - linear; > + > + return skb; > +} > + > static int packet_snd(struct socket *sock, > struct msghdr *msg, size_t len) > { > @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock, > __be16 proto; > unsigned char *addr; > int ifindex, err, reserve = 0; > + struct virtio_net_hdr vnethdr = { 0 }; > + int offset = 0; > + struct packet_sock *po = pkt_sk(sk); > > /* > * Get and verify the address. > */ > > if (saddr == NULL) { > - struct packet_sock *po = pkt_sk(sk); > - > ifindex = po->ifindex; > proto = po->num; > addr = NULL; > @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock, > if (!(dev->flags & IFF_UP)) > goto out_unlock; > > - err = -EMSGSIZE; > - if (len > dev->mtu+reserve) > - goto out_unlock; > + if (po->vnet_hdr) { > + err = -EINVAL; > + if (dev->type != ARPHRD_ETHER) > + goto out_unlock; > + > + if (len < sizeof(vnethdr)) > + goto out_unlock; > > - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), > - msg->msg_flags & MSG_DONTWAIT, &err); > + len -= sizeof(vnethdr); > + > + err = -EFAULT; > + if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov, > + sizeof(vnethdr))) > + goto out_unlock; > + > + if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && > + (vnethdr.csum_start + vnethdr.csum_offset + 2 > > + vnethdr.hdr_len)) > + vnethdr.hdr_len = vnethdr.csum_start + > + vnethdr.csum_offset + 2; > + > + err = -EINVAL; > + if (vnethdr.hdr_len > len) > + goto out_unlock; > + } else { > + err = -EMSGSIZE; > + if (len > dev->mtu+reserve) > + goto out_unlock; IMO we should always perform the length check if GSO is off. > + } > + > + err = -ENOBUFS; > + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), > + LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len, > + msg->msg_flags & MSG_DONTWAIT, &err); > if (skb == NULL) > goto out_unlock; > > - skb_reserve(skb, LL_RESERVED_SPACE(dev)); > - skb_reset_network_header(skb); > + skb_set_network_header(skb, reserve); I think the above is wrong for vlans? > > err = -EINVAL; > if (sock->type == SOCK_DGRAM && > - dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0) > + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) > goto out_free; > > /* Returns -EFAULT on error */ > - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); > + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); > if (err) > goto out_free; > > @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock, > skb->priority = sk->sk_priority; > skb->mark = sk->sk_mark; > > + if (po->vnet_hdr) { > + skb_reset_mac_header(skb); > + skb->protocol = eth_hdr(skb)->h_proto; > + Is this also broken for vlans? > + if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { > + if (!skb_partial_csum_set(skb, vnethdr.csum_start, > + vnethdr.csum_offset)) { > + err = -EINVAL; > + goto out_free; > + } > + } > + > + if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { > + switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { > + case VIRTIO_NET_HDR_GSO_TCPV4: > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; > + break; > + case VIRTIO_NET_HDR_GSO_TCPV6: > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; > + break; > + case VIRTIO_NET_HDR_GSO_UDP: > + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; > + break; > + default: > + err = -EINVAL; > + goto out_free; > + } > + > + if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) > + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; > + > + skb_shinfo(skb)->gso_size = vnethdr.gso_size; > + if (skb_shinfo(skb)->gso_size == 0) { > + err = -EINVAL; > + goto out_free; > + } > + > + /* Header must be checked, and gso_segs computed. */ > + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; > + skb_shinfo(skb)->gso_segs = 0; > + } > + > + len += sizeof(vnethdr); > + } > + > /* > * Now send it > */ > @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > struct sk_buff *skb; > int copied, err; > struct sockaddr_ll *sll; > + int vnet_hdr_len = 0; > > err = -EINVAL; > if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) > @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > if (skb == NULL) > goto out; > > + if (pkt_sk(sk)->vnet_hdr) { > + struct virtio_net_hdr vnethdr = { 0 }; > + > + vnet_hdr_len = sizeof(vnethdr); > + if ((len -= vnet_hdr_len) < 0) > + return -EINVAL; > + > + if (skb_is_gso(skb)) { > + struct skb_shared_info *sinfo = skb_shinfo(skb); > + > + /* This is a hint as to how much should be linear. */ > + vnethdr.hdr_len = skb_headlen(skb); > + vnethdr.gso_size = sinfo->gso_size; > + if (sinfo->gso_type & SKB_GSO_TCPV4) > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; > + else if (sinfo->gso_type & SKB_GSO_TCPV6) > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; > + else if (sinfo->gso_type & SKB_GSO_UDP) > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; > + else > + BUG(); Is there any chance this can get SKB_GSO_FCOE by binding to an appropriate interface? Maybe we don't want to BUG(). > + if (sinfo->gso_type & SKB_GSO_TCP_ECN) > + vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; > + } else > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; > + > + if (skb->ip_summed == CHECKSUM_PARTIAL) { > + vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; > + vnethdr.csum_start = skb->csum_start - skb_headroom(skb); > + vnethdr.csum_offset = skb->csum_offset; > + } /* else everything is zero */ > + > + if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr, > + sizeof(vnethdr)))) { > + return -EFAULT; > + } > + } > + > /* > * If the address length field is there to be filled in, we fill > * it in now. > @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > * Free or return the buffer as appropriate. Again this > * hides all the races and re-entrancy issues from us. > */ > - err = (flags&MSG_TRUNC) ? skb->len : copied; > + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); > > out_free: > skb_free_datagram(sk, skb); > @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv > po->origdev = !!val; > return 0; > } > + case PACKET_VNET_HDR: > + { > + int val; > + > + if (sock->type != SOCK_RAW) > + return -EINVAL; > + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) > + return -EBUSY; Another way to get a broken ring + vnet hdr configuration would be to enable vnet hdr first and mmap second. I think we need to guard against this as well, by checking vnet_hdr when tx/rx ring is enabled. > + if (optlen < sizeof(val)) > + return -EINVAL; > + if (copy_from_user(&val, optval, sizeof(val))) > + return -EFAULT; > + > + po->vnet_hdr = !!val; > + return 0; > + } > default: > return -ENOPROTOOPT; > } > @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, > > data = &val; > break; > + case PACKET_VNET_HDR: > + if (len > sizeof(int)) > + len = sizeof(int); > + val = po->vnet_hdr; > + > + data = &val; > + break; > #ifdef CONFIG_PACKET_MMAP > case PACKET_VERSION: > if (len > sizeof(int)) > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2010-01-27 at 13:42 +0200, Michael S. Tsirkin wrote: > On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote: > > This patch adds GSO/checksum offload to af_packet sockets using > > virtio_net_hdr. Based on Rusty's patch to add this support to tun. > > It allows GSO/checksum offload to be enabled when using raw socket > > backend with virtio_net. > > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the > > receive path and process/skip virtio_net_hdr in the send path. This > > option is only allowed with SOCK_RAW sockets attached to ethernet > > type devices. > > > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> > > So the main issue with this implemenation is that it silently fails for > non-ethernet protocols. It would be better to detect unsupported > protocols and return an error to user. Yes. I could return EINVAL or EOPNOTSUPP when trying to send/receive a packet with virtio_net_hdr on non-ethernet devices. > This is same issue that was > pointed out by DaveM with my earlier attempt to solve a different > (related) problem: > http://lkml.org/lkml/2010/1/5/474 > For an incomplete prototype attempting to solve the issue in a generic way: > http://lkml.org/lkml/2010/1/6/56 > > A couple of additional comments below. > > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > > index 4021d47..aa57a5f 100644 > > --- a/include/linux/if_packet.h > > +++ b/include/linux/if_packet.h > > @@ -46,6 +46,7 @@ struct sockaddr_ll { > > #define PACKET_RESERVE 12 > > #define PACKET_TX_RING 13 > > #define PACKET_LOSS 14 > > +#define PACKET_VNET_HDR 15 > > > > struct tpacket_stats { > > unsigned int tp_packets; > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c > > index 53633c5..36d5360 100644 > > --- a/net/packet/af_packet.c > > +++ b/net/packet/af_packet.c > > @@ -80,6 +80,8 @@ > > #include <linux/init.h> > > #include <linux/mutex.h> > > #include <linux/if_vlan.h> > > +#include <linux/virtio_net.h> > > +#include <linux/if_arp.h> > > > > #ifdef CONFIG_INET > > #include <net/inet_common.h> > > @@ -193,7 +195,8 @@ struct packet_sock { > > struct mutex pg_vec_lock; > > unsigned int running:1, /* prot_hook is attached*/ > > auxdata:1, > > - origdev:1; > > + origdev:1, > > + vnet_hdr:1; > > int ifindex; /* bound device */ > > __be16 num; > > struct packet_mclist *mclist; > > @@ -1056,6 +1059,30 @@ out: > > } > > #endif > > > > +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, > > + size_t reserve, size_t len, > > + size_t linear, int noblock, > > + int *err) > > +{ > > + struct sk_buff *skb; > > + > > + /* Under a page? Don't bother with paged skb. */ > > + if (prepad + len < PAGE_SIZE || !linear) > > + linear = len; > > + > > + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, > > + err); > > + if (!skb) > > + return NULL; > > + > > + skb_reserve(skb, reserve); > > + skb_put(skb, linear); > > + skb->data_len = len - linear; > > + skb->len += len - linear; > > + > > + return skb; > > +} > > + > > static int packet_snd(struct socket *sock, > > struct msghdr *msg, size_t len) > > { > > @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock, > > __be16 proto; > > unsigned char *addr; > > int ifindex, err, reserve = 0; > > + struct virtio_net_hdr vnethdr = { 0 }; > > + int offset = 0; > > + struct packet_sock *po = pkt_sk(sk); > > > > /* > > * Get and verify the address. > > */ > > > > if (saddr == NULL) { > > - struct packet_sock *po = pkt_sk(sk); > > - > > ifindex = po->ifindex; > > proto = po->num; > > addr = NULL; > > @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock, > > if (!(dev->flags & IFF_UP)) > > goto out_unlock; > > > > - err = -EMSGSIZE; > > - if (len > dev->mtu+reserve) > > - goto out_unlock; > > + if (po->vnet_hdr) { > > + err = -EINVAL; > > + if (dev->type != ARPHRD_ETHER) > > + goto out_unlock; > > + > > + if (len < sizeof(vnethdr)) > > + goto out_unlock; > > > > - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), > > - msg->msg_flags & MSG_DONTWAIT, &err); > > + len -= sizeof(vnethdr); > > + > > + err = -EFAULT; > > + if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov, > > + sizeof(vnethdr))) > > + goto out_unlock; > > + > > + if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && > > + (vnethdr.csum_start + vnethdr.csum_offset + 2 > > > + vnethdr.hdr_len)) > > + vnethdr.hdr_len = vnethdr.csum_start + > > + vnethdr.csum_offset + 2; > > + > > + err = -EINVAL; > > + if (vnethdr.hdr_len > len) > > + goto out_unlock; > > + } else { > > + err = -EMSGSIZE; > > + if (len > dev->mtu+reserve) > > + goto out_unlock; > > IMO we should always perform the length check if GSO is off. OK. I will fix this. > > > + } > > + > > + err = -ENOBUFS; > > + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), > > + LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len, > > + msg->msg_flags & MSG_DONTWAIT, &err); > > if (skb == NULL) > > goto out_unlock; > > > > - skb_reserve(skb, LL_RESERVED_SPACE(dev)); > > - skb_reset_network_header(skb); > > + skb_set_network_header(skb, reserve); > > I think the above is wrong for vlans? I also thought we need to address vlans here, but even tun doesn't handle this in the send routine. I submitted a patch that fixed skb_gso_segment() to handle vlan packets. This will address both tun and packet sockets. http://thread.gmane.org/gmane.linux.network/150198 With this patch, i tested vlans with both ipv4 and ipv6. > > > > > err = -EINVAL; > > if (sock->type == SOCK_DGRAM && > > - dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0) > > + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) > > goto out_free; > > > > /* Returns -EFAULT on error */ > > - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); > > + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); > > if (err) > > goto out_free; > > > > @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock, > > skb->priority = sk->sk_priority; > > skb->mark = sk->sk_mark; > > > > + if (po->vnet_hdr) { > > + skb_reset_mac_header(skb); > > + skb->protocol = eth_hdr(skb)->h_proto; > > + > > Is this also broken for vlans? Same as above. > > > + if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { > > + if (!skb_partial_csum_set(skb, vnethdr.csum_start, > > + vnethdr.csum_offset)) { > > + err = -EINVAL; > > + goto out_free; > > + } > > + } > > + > > + if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { > > + switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { > > + case VIRTIO_NET_HDR_GSO_TCPV4: > > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; > > + break; > > + case VIRTIO_NET_HDR_GSO_TCPV6: > > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; > > + break; > > + case VIRTIO_NET_HDR_GSO_UDP: > > + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; > > + break; > > + default: > > + err = -EINVAL; > > + goto out_free; > > + } > > + > > + if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) > > + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; > > + > > + skb_shinfo(skb)->gso_size = vnethdr.gso_size; > > + if (skb_shinfo(skb)->gso_size == 0) { > > + err = -EINVAL; > > + goto out_free; > > + } > > + > > + /* Header must be checked, and gso_segs computed. */ > > + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; > > + skb_shinfo(skb)->gso_segs = 0; > > + } > > + > > + len += sizeof(vnethdr); > > + } > > + > > /* > > * Now send it > > */ > > @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > struct sk_buff *skb; > > int copied, err; > > struct sockaddr_ll *sll; > > + int vnet_hdr_len = 0; > > > > err = -EINVAL; > > if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) > > @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > if (skb == NULL) > > goto out; > > > > + if (pkt_sk(sk)->vnet_hdr) { > > + struct virtio_net_hdr vnethdr = { 0 }; > > + > > + vnet_hdr_len = sizeof(vnethdr); > > + if ((len -= vnet_hdr_len) < 0) > > + return -EINVAL; > > + > > + if (skb_is_gso(skb)) { > > + struct skb_shared_info *sinfo = skb_shinfo(skb); > > + > > + /* This is a hint as to how much should be linear. */ > > + vnethdr.hdr_len = skb_headlen(skb); > > + vnethdr.gso_size = sinfo->gso_size; > > + if (sinfo->gso_type & SKB_GSO_TCPV4) > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; > > + else if (sinfo->gso_type & SKB_GSO_TCPV6) > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; > > + else if (sinfo->gso_type & SKB_GSO_UDP) > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; > > + else > > + BUG(); > > Is there any chance this can get SKB_GSO_FCOE by binding to > an appropriate interface? Maybe we don't want to BUG(). I could return -EINVAL in that case. > > > + if (sinfo->gso_type & SKB_GSO_TCP_ECN) > > + vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; > > + } else > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; > > + > > + if (skb->ip_summed == CHECKSUM_PARTIAL) { > > + vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; > > + vnethdr.csum_start = skb->csum_start - skb_headroom(skb); > > + vnethdr.csum_offset = skb->csum_offset; > > + } /* else everything is zero */ > > + > > + if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr, > > + sizeof(vnethdr)))) { > > + return -EFAULT; > > + } > > + } > > + > > /* > > * If the address length field is there to be filled in, we fill > > * it in now. > > @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > * Free or return the buffer as appropriate. Again this > > * hides all the races and re-entrancy issues from us. > > */ > > - err = (flags&MSG_TRUNC) ? skb->len : copied; > > + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); > > > > out_free: > > skb_free_datagram(sk, skb); > > @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv > > po->origdev = !!val; > > return 0; > > } > > + case PACKET_VNET_HDR: > > + { > > + int val; > > + > > + if (sock->type != SOCK_RAW) > > + return -EINVAL; > > + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) > > + return -EBUSY; > > Another way to get a broken ring + vnet hdr configuration > would be to enable vnet hdr first and mmap second. > I think we need to guard against this as well, by checking vnet_hdr > when tx/rx ring is enabled. OK. i will add a check when setting PACKET_RX_RING/TX_RING socket options. > > + if (optlen < sizeof(val)) > > + return -EINVAL; > > + if (copy_from_user(&val, optval, sizeof(val))) > > + return -EFAULT; > > + > > + po->vnet_hdr = !!val; > > + return 0; > > + } > > default: > > return -ENOPROTOOPT; > > } > > @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, > > > > data = &val; > > break; > > + case PACKET_VNET_HDR: > > + if (len > sizeof(int)) > > + len = sizeof(int); > > + val = po->vnet_hdr; > > + > > + data = &val; > > + break; > > #ifdef CONFIG_PACKET_MMAP > > case PACKET_VERSION: > > if (len > sizeof(int)) > > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 27, 2010 at 09:42:37AM -0800, Sridhar Samudrala wrote: > On Wed, 2010-01-27 at 13:42 +0200, Michael S. Tsirkin wrote: > > On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote: > > > This patch adds GSO/checksum offload to af_packet sockets using > > > virtio_net_hdr. Based on Rusty's patch to add this support to tun. > > > It allows GSO/checksum offload to be enabled when using raw socket > > > backend with virtio_net. > > > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the > > > receive path and process/skip virtio_net_hdr in the send path. This > > > option is only allowed with SOCK_RAW sockets attached to ethernet > > > type devices. > > > > > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> > > > > So the main issue with this implemenation is that it silently fails for > > non-ethernet protocols. It would be better to detect unsupported > > protocols and return an error to user. > > Yes. I could return EINVAL or EOPNOTSUPP when trying to send/receive a > packet with virtio_net_hdr on non-ethernet devices. non-ethernet *protocols* are at issue here. > > This is same issue that was > > pointed out by DaveM with my earlier attempt to solve a different > > (related) problem: > > http://lkml.org/lkml/2010/1/5/474 > > For an incomplete prototype attempting to solve the issue in a generic way: > > http://lkml.org/lkml/2010/1/6/56 > > > > A couple of additional comments below. > > > > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > > > index 4021d47..aa57a5f 100644 > > > --- a/include/linux/if_packet.h > > > +++ b/include/linux/if_packet.h > > > @@ -46,6 +46,7 @@ struct sockaddr_ll { > > > #define PACKET_RESERVE 12 > > > #define PACKET_TX_RING 13 > > > #define PACKET_LOSS 14 > > > +#define PACKET_VNET_HDR 15 > > > > > > struct tpacket_stats { > > > unsigned int tp_packets; > > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c > > > index 53633c5..36d5360 100644 > > > --- a/net/packet/af_packet.c > > > +++ b/net/packet/af_packet.c > > > @@ -80,6 +80,8 @@ > > > #include <linux/init.h> > > > #include <linux/mutex.h> > > > #include <linux/if_vlan.h> > > > +#include <linux/virtio_net.h> > > > +#include <linux/if_arp.h> > > > > > > #ifdef CONFIG_INET > > > #include <net/inet_common.h> > > > @@ -193,7 +195,8 @@ struct packet_sock { > > > struct mutex pg_vec_lock; > > > unsigned int running:1, /* prot_hook is attached*/ > > > auxdata:1, > > > - origdev:1; > > > + origdev:1, > > > + vnet_hdr:1; > > > int ifindex; /* bound device */ > > > __be16 num; > > > struct packet_mclist *mclist; > > > @@ -1056,6 +1059,30 @@ out: > > > } > > > #endif > > > > > > +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, > > > + size_t reserve, size_t len, > > > + size_t linear, int noblock, > > > + int *err) > > > +{ > > > + struct sk_buff *skb; > > > + > > > + /* Under a page? Don't bother with paged skb. */ > > > + if (prepad + len < PAGE_SIZE || !linear) > > > + linear = len; > > > + > > > + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, > > > + err); > > > + if (!skb) > > > + return NULL; > > > + > > > + skb_reserve(skb, reserve); > > > + skb_put(skb, linear); > > > + skb->data_len = len - linear; > > > + skb->len += len - linear; > > > + > > > + return skb; > > > +} > > > + > > > static int packet_snd(struct socket *sock, > > > struct msghdr *msg, size_t len) > > > { > > > @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock, > > > __be16 proto; > > > unsigned char *addr; > > > int ifindex, err, reserve = 0; > > > + struct virtio_net_hdr vnethdr = { 0 }; > > > + int offset = 0; > > > + struct packet_sock *po = pkt_sk(sk); > > > > > > /* > > > * Get and verify the address. > > > */ > > > > > > if (saddr == NULL) { > > > - struct packet_sock *po = pkt_sk(sk); > > > - > > > ifindex = po->ifindex; > > > proto = po->num; > > > addr = NULL; > > > @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock, > > > if (!(dev->flags & IFF_UP)) > > > goto out_unlock; > > > > > > - err = -EMSGSIZE; > > > - if (len > dev->mtu+reserve) > > > - goto out_unlock; > > > + if (po->vnet_hdr) { > > > + err = -EINVAL; > > > + if (dev->type != ARPHRD_ETHER) > > > + goto out_unlock; > > > + > > > + if (len < sizeof(vnethdr)) > > > + goto out_unlock; > > > > > > - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), > > > - msg->msg_flags & MSG_DONTWAIT, &err); > > > + len -= sizeof(vnethdr); > > > + > > > + err = -EFAULT; > > > + if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov, > > > + sizeof(vnethdr))) > > > + goto out_unlock; > > > + > > > + if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && > > > + (vnethdr.csum_start + vnethdr.csum_offset + 2 > > > > + vnethdr.hdr_len)) > > > + vnethdr.hdr_len = vnethdr.csum_start + > > > + vnethdr.csum_offset + 2; > > > + > > > + err = -EINVAL; > > > + if (vnethdr.hdr_len > len) > > > + goto out_unlock; > > > + } else { > > > + err = -EMSGSIZE; > > > + if (len > dev->mtu+reserve) > > > + goto out_unlock; > > > > IMO we should always perform the length check if GSO is off. > OK. I will fix this. > > > > > + } > > > + > > > + err = -ENOBUFS; > > > + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), > > > + LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len, > > > + msg->msg_flags & MSG_DONTWAIT, &err); > > > if (skb == NULL) > > > goto out_unlock; > > > > > > - skb_reserve(skb, LL_RESERVED_SPACE(dev)); > > > - skb_reset_network_header(skb); > > > + skb_set_network_header(skb, reserve); > > > > I think the above is wrong for vlans? > > I also thought we need to address vlans here, but even tun doesn't > handle this in the send routine. I submitted a patch that fixed > skb_gso_segment() to handle vlan packets. This will address both > tun and packet sockets. > http://thread.gmane.org/gmane.linux.network/150198 > > With this patch, i tested vlans with both ipv4 and ipv6. Well, there are more protocols than just vlans :) BTW, if there are dependencies between patches, you probably want to put them in a patchset so they can be judged together. > > > > > > > > err = -EINVAL; > > > if (sock->type == SOCK_DGRAM && > > > - dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0) > > > + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) > > > goto out_free; > > > > > > /* Returns -EFAULT on error */ > > > - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); > > > + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); > > > if (err) > > > goto out_free; > > > > > > @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock, > > > skb->priority = sk->sk_priority; > > > skb->mark = sk->sk_mark; > > > > > > + if (po->vnet_hdr) { > > > + skb_reset_mac_header(skb); > > > + skb->protocol = eth_hdr(skb)->h_proto; > > > + > > > > Is this also broken for vlans? > > Same as above. > > > > > + if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { > > > + if (!skb_partial_csum_set(skb, vnethdr.csum_start, > > > + vnethdr.csum_offset)) { > > > + err = -EINVAL; > > > + goto out_free; > > > + } > > > + } > > > + > > > + if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { > > > + switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { > > > + case VIRTIO_NET_HDR_GSO_TCPV4: > > > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; > > > + break; > > > + case VIRTIO_NET_HDR_GSO_TCPV6: > > > + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; > > > + break; > > > + case VIRTIO_NET_HDR_GSO_UDP: > > > + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; > > > + break; > > > + default: > > > + err = -EINVAL; > > > + goto out_free; > > > + } > > > + > > > + if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) > > > + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; > > > + > > > + skb_shinfo(skb)->gso_size = vnethdr.gso_size; > > > + if (skb_shinfo(skb)->gso_size == 0) { > > > + err = -EINVAL; > > > + goto out_free; > > > + } > > > + > > > + /* Header must be checked, and gso_segs computed. */ > > > + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; > > > + skb_shinfo(skb)->gso_segs = 0; > > > + } > > > + > > > + len += sizeof(vnethdr); > > > + } > > > + > > > /* > > > * Now send it > > > */ > > > @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > > struct sk_buff *skb; > > > int copied, err; > > > struct sockaddr_ll *sll; > > > + int vnet_hdr_len = 0; > > > > > > err = -EINVAL; > > > if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) > > > @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > > if (skb == NULL) > > > goto out; > > > > > > + if (pkt_sk(sk)->vnet_hdr) { > > > + struct virtio_net_hdr vnethdr = { 0 }; > > > + > > > + vnet_hdr_len = sizeof(vnethdr); > > > + if ((len -= vnet_hdr_len) < 0) > > > + return -EINVAL; > > > + > > > + if (skb_is_gso(skb)) { > > > + struct skb_shared_info *sinfo = skb_shinfo(skb); > > > + > > > + /* This is a hint as to how much should be linear. */ > > > + vnethdr.hdr_len = skb_headlen(skb); > > > + vnethdr.gso_size = sinfo->gso_size; > > > + if (sinfo->gso_type & SKB_GSO_TCPV4) > > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; > > > + else if (sinfo->gso_type & SKB_GSO_TCPV6) > > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; > > > + else if (sinfo->gso_type & SKB_GSO_UDP) > > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; > > > + else > > > + BUG(); > > > > Is there any chance this can get SKB_GSO_FCOE by binding to > > an appropriate interface? Maybe we don't want to BUG(). > I could return -EINVAL in that case. > > > > > > + if (sinfo->gso_type & SKB_GSO_TCP_ECN) > > > + vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; > > > + } else > > > + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; > > > + > > > + if (skb->ip_summed == CHECKSUM_PARTIAL) { > > > + vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; > > > + vnethdr.csum_start = skb->csum_start - skb_headroom(skb); > > > + vnethdr.csum_offset = skb->csum_offset; > > > + } /* else everything is zero */ > > > + > > > + if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr, > > > + sizeof(vnethdr)))) { > > > + return -EFAULT; > > > + } > > > + } > > > + > > > /* > > > * If the address length field is there to be filled in, we fill > > > * it in now. > > > @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, > > > * Free or return the buffer as appropriate. Again this > > > * hides all the races and re-entrancy issues from us. > > > */ > > > - err = (flags&MSG_TRUNC) ? skb->len : copied; > > > + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); > > > > > > out_free: > > > skb_free_datagram(sk, skb); > > > @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv > > > po->origdev = !!val; > > > return 0; > > > } > > > + case PACKET_VNET_HDR: > > > + { > > > + int val; > > > + > > > + if (sock->type != SOCK_RAW) > > > + return -EINVAL; > > > + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) > > > + return -EBUSY; > > > > Another way to get a broken ring + vnet hdr configuration > > would be to enable vnet hdr first and mmap second. > > I think we need to guard against this as well, by checking vnet_hdr > > when tx/rx ring is enabled. > > OK. i will add a check when setting PACKET_RX_RING/TX_RING socket > options. > > > + if (optlen < sizeof(val)) > > > + return -EINVAL; > > > + if (copy_from_user(&val, optval, sizeof(val))) > > > + return -EFAULT; > > > + > > > + po->vnet_hdr = !!val; > > > + return 0; > > > + } > > > default: > > > return -ENOPROTOOPT; > > > } > > > @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, > > > > > > data = &val; > > > break; > > > + case PACKET_VNET_HDR: > > > + if (len > sizeof(int)) > > > + len = sizeof(int); > > > + val = po->vnet_hdr; > > > + > > > + data = &val; > > > + break; > > > #ifdef CONFIG_PACKET_MMAP > > > case PACKET_VERSION: > > > if (len > sizeof(int)) > > > > > > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote: > > + if (po->vnet_hdr) { > + err = -EINVAL; > + if (dev->type != ARPHRD_ETHER) > + goto out_unlock; We shouldn't have Ethernet-specific code in AF_PACKET. What's more, just because the device type is Ethernet it doesn't mean that the packet is going to have an Ethernet header. Cheers,
On Fri, 2010-01-29 at 21:53 +1300, Herbert Xu wrote: > On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote: > > > > + if (po->vnet_hdr) { > > + err = -EINVAL; > > + if (dev->type != ARPHRD_ETHER) > > + goto out_unlock; > > We shouldn't have Ethernet-specific code in AF_PACKET. What's > more, just because the device type is Ethernet it doesn't mean > that the packet is going to have an Ethernet header. This check is to dis-allow processing of packets with virtio_net_hdr destined for non-ethernet devices. Is it OK if i add a check in packet_bind() to not allow binding to a non-ethernet device when PACKET_VNET_HDR option is set? I need to figure out a way to set the skb protocol correctly based on the packet. Any clues? Michael has posted a prototype patch that addresses this. Do you agree with this approach? http://lkml.org/lkml/2010/1/6/56 Did you get a chance to look at my other patch that adds a check for VLAN packets in skb_gso_segment() to address a bug when sending large VLAN packets from a guest? http://thread.gmane.org/gmane.linux.network/150198 Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 29, 2010 at 01:25:08PM -0800, Sridhar Samudrala wrote: > > This check is to dis-allow processing of packets with virtio_net_hdr > destined for non-ethernet devices. > Is it OK if i add a check in packet_bind() to not allow binding to > a non-ethernet device when PACKET_VNET_HDR option is set? > > I need to figure out a way to set the skb protocol correctly based > on the packet. Any clues? IMHO the skb protocol should be set by whatever is invoking the sendmsg call. After all they would know what the L2 header is and should be able to deduce the protocol correctly. Adding all this logic into AF_PACKET is just a hack. Cheers,
On 1/29/2010 1:36 PM, Herbert Xu wrote: > On Fri, Jan 29, 2010 at 01:25:08PM -0800, Sridhar Samudrala wrote: > >> This check is to dis-allow processing of packets with virtio_net_hdr >> destined for non-ethernet devices. >> Is it OK if i add a check in packet_bind() to not allow binding to >> a non-ethernet device when PACKET_VNET_HDR option is set? >> >> I need to figure out a way to set the skb protocol correctly based >> on the packet. Any clues? >> > IMHO the skb protocol should be set by whatever is invoking the > sendmsg call. After all they would know what the L2 header is > and should be able to deduce the protocol correctly. > In this use-case, the component that is calling sendmsg() can be either the vhost-net in host kernel or qemu. They get the buffer including the L2 header from the guest. They too need to parse the L2 header to find the protocol. The packet itself originates from the guest virtio-net driver. but when it converts the skb to a virtio-ring buffer, some of the information in the skb(including protocol and vlan info etc) is lost. I guess if we could re-design, virtio_net_hdr would include this info too. But i think it is too late to make any such changes. So either the callers of sendmsg() or packet_snd() need to parse the L2 header to get the protocol and vlan info etc. sendmsg() caller could fill in the protocol, but not the vlan info. Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 29, 2010 at 02:30:49PM -0800, Sridhar Samudrala wrote: > > In this use-case, the component that is calling sendmsg() can be either > the vhost-net in host kernel > or qemu. They get the buffer including the L2 header from the guest. > They too need to parse the L2 > header to find the protocol. So have them parse the L2 header to get the protocol. They know that the packet should contain an Ethernet header, while this knowledge would be inappropriate for af_packet in general. This is also why we call eth_type_trans in network drivers as opposed to the core. Cheers,
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h index 4021d47..aa57a5f 100644 --- a/include/linux/if_packet.h +++ b/include/linux/if_packet.h @@ -46,6 +46,7 @@ struct sockaddr_ll { #define PACKET_RESERVE 12 #define PACKET_TX_RING 13 #define PACKET_LOSS 14 +#define PACKET_VNET_HDR 15 struct tpacket_stats { unsigned int tp_packets; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 53633c5..36d5360 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -80,6 +80,8 @@ #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> +#include <linux/virtio_net.h> +#include <linux/if_arp.h> #ifdef CONFIG_INET #include <net/inet_common.h> @@ -193,7 +195,8 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned int running:1, /* prot_hook is attached*/ auxdata:1, - origdev:1; + origdev:1, + vnet_hdr:1; int ifindex; /* bound device */ __be16 num; struct packet_mclist *mclist; @@ -1056,6 +1059,30 @@ out: } #endif +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, + size_t reserve, size_t len, + size_t linear, int noblock, + int *err) +{ + struct sk_buff *skb; + + /* Under a page? Don't bother with paged skb. */ + if (prepad + len < PAGE_SIZE || !linear) + linear = len; + + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, + err); + if (!skb) + return NULL; + + skb_reserve(skb, reserve); + skb_put(skb, linear); + skb->data_len = len - linear; + skb->len += len - linear; + + return skb; +} + static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock, __be16 proto; unsigned char *addr; int ifindex, err, reserve = 0; + struct virtio_net_hdr vnethdr = { 0 }; + int offset = 0; + struct packet_sock *po = pkt_sk(sk); /* * Get and verify the address. */ if (saddr == NULL) { - struct packet_sock *po = pkt_sk(sk); - ifindex = po->ifindex; proto = po->num; addr = NULL; @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock, if (!(dev->flags & IFF_UP)) goto out_unlock; - err = -EMSGSIZE; - if (len > dev->mtu+reserve) - goto out_unlock; + if (po->vnet_hdr) { + err = -EINVAL; + if (dev->type != ARPHRD_ETHER) + goto out_unlock; + + if (len < sizeof(vnethdr)) + goto out_unlock; - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), - msg->msg_flags & MSG_DONTWAIT, &err); + len -= sizeof(vnethdr); + + err = -EFAULT; + if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov, + sizeof(vnethdr))) + goto out_unlock; + + if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (vnethdr.csum_start + vnethdr.csum_offset + 2 > + vnethdr.hdr_len)) + vnethdr.hdr_len = vnethdr.csum_start + + vnethdr.csum_offset + 2; + + err = -EINVAL; + if (vnethdr.hdr_len > len) + goto out_unlock; + } else { + err = -EMSGSIZE; + if (len > dev->mtu+reserve) + goto out_unlock; + } + + err = -ENOBUFS; + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), + LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len, + msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb_reset_network_header(skb); + skb_set_network_header(skb, reserve); err = -EINVAL; if (sock->type == SOCK_DGRAM && - dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0) + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) goto out_free; /* Returns -EFAULT on error */ - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + if (po->vnet_hdr) { + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + + if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (!skb_partial_csum_set(skb, vnethdr.csum_start, + vnethdr.csum_offset)) { + err = -EINVAL; + goto out_free; + } + } + + if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + break; + case VIRTIO_NET_HDR_GSO_UDP: + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + break; + default: + err = -EINVAL; + goto out_free; + } + + if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + skb_shinfo(skb)->gso_size = vnethdr.gso_size; + if (skb_shinfo(skb)->gso_size == 0) { + err = -EINVAL; + goto out_free; + } + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + len += sizeof(vnethdr); + } + /* * Now send it */ @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; int copied, err; struct sockaddr_ll *sll; + int vnet_hdr_len = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) goto out; + if (pkt_sk(sk)->vnet_hdr) { + struct virtio_net_hdr vnethdr = { 0 }; + + vnet_hdr_len = sizeof(vnethdr); + if ((len -= vnet_hdr_len) < 0) + return -EINVAL; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + vnethdr.hdr_len = skb_headlen(skb); + vnethdr.gso_size = sinfo->gso_size; + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; + else + BUG(); + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnethdr.csum_start = skb->csum_start - skb_headroom(skb); + vnethdr.csum_offset = skb->csum_offset; + } /* else everything is zero */ + + if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr, + sizeof(vnethdr)))) { + return -EFAULT; + } + } + /* * If the address length field is there to be filled in, we fill * it in now. @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ - err = (flags&MSG_TRUNC) ? skb->len : copied; + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->origdev = !!val; return 0; } + case PACKET_VNET_HDR: + { + int val; + + if (sock->type != SOCK_RAW) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->vnet_hdr = !!val; + return 0; + } default: return -ENOPROTOOPT; } @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, data = &val; break; + case PACKET_VNET_HDR: + if (len > sizeof(int)) + len = sizeof(int); + val = po->vnet_hdr; + + data = &val; + break; #ifdef CONFIG_PACKET_MMAP case PACKET_VERSION: if (len > sizeof(int))
This patch adds GSO/checksum offload to af_packet sockets using virtio_net_hdr. Based on Rusty's patch to add this support to tun. It allows GSO/checksum offload to be enabled when using raw socket backend with virtio_net. Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the receive path and process/skip virtio_net_hdr in the send path. This option is only allowed with SOCK_RAW sockets attached to ethernet type devices. Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html