Message ID | 20091014143415.GA29937@redhat.com |
---|---|
State | New |
Headers | show |
Or Gerlitz wrote: > Add raw network backend option which uses a packet socket to provide > raw networking access. Once the socket is opened it's bound to a > provided host interface, such that packets received on the interface > are delivered to the VM and packets sent by the VM are sent to the > interface. > > This is functionally similar to the existing pcap network > backend, with the same advantages and problems. > Differences from pcap: > - can get an open socket from the monitor, > which allows running without NET_ADMIN priviledges > - support iovec sends with writev, saving one data copy > - one less dependency on an external library > - we have access to the underlying file descriptor > which makes it possible to connect to vhost net > - don't support polling all interfaces, always bind to a specific one > Networking is probably the area in qemu that users most frequently stumble with. The most common problems are: 1) slirp does not behave how they think it should (icmp doesn't work, guest isn't accessable from host) 2) it's difficult to figure out which backend behaves the way they want (socket vs. vde vs. tap) 3) when they figure out they need tap, tap is difficult to setup The problem with introducing a raw backend (or a pcap backend) is that it makes #2 even worse because now a user has to figure out whether they need raw/pcap or tap. But most troubling, it introduces another issue: 4) raw does not behave how they think it should because guest<->host networking does not work bidirectionally So unless there's an extremely compelling reason to have this, I'd rather not introduce this complexity. NB, I see this as a problem with vhost_net too if #4 is also true in that context. > Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > --- > hw/virtio-net.c | 3 +- > net.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > qemu-options.hx | 4 + > 3 files changed, 198 insertions(+), 1 deletions(-) > > diff --git a/hw/virtio-net.c b/hw/virtio-net.c > index 1ac05a2..95d9f93 100644 > --- a/hw/virtio-net.c > +++ b/hw/virtio-net.c > @@ -545,7 +545,8 @@ static ssize_t virtio_net_receive2(VLANClientState *vc, const uint8_t *buf, size > virtqueue_pop(n->rx_vq, &elem) == 0) { > if (i == 0) > return -1; > - fprintf(stderr, "virtio-net truncating packet\n"); > + fprintf(stderr, "virtio-net truncating packet. offset %zd size %zd\n", > + offset, size); > exit(1); > } > This doesn't belong here. > diff --git a/net.c b/net.c > index d93eaef..1e0e874 100644 > --- a/net.c > +++ b/net.c > @@ -93,6 +93,9 @@ > #endif > #endif > > +#include <netpacket/packet.h> > +#include <net/ethernet.h> > + > This is certainly missing guards. > #if defined(__OpenBSD__) > #include <util.h> > #endif > @@ -1860,6 +1863,158 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, > > #endif /* !_WIN32 */ > > +typedef struct RAWState { > + VLANClientState *vc; > + int fd; > + uint8_t buf[4096]; > + int promisc; > +} RAWState; > + > +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) > +{ > + int fd, ret; > + struct ifreq req; > + struct sockaddr_ll lladdr; > + > + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > + if (fd < 0) > + fprintf(stderr, "packet socket failed\n"); > CodingStyle Also, this error checking should use the monitor error reporting framework. And falling through with fd=-1 certainly means we'll SEGV or worse further down. > + memset(&req, 0, sizeof(req)); > + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); > Would be better to just use snprintf > + ret = ioctl(fd, SIOCGIFINDEX, &req); > + if (ret < 0) > + fprintf(stderr, "SIOCGIFINDEX failed\n"); > + > + memset(&lladdr, 0, sizeof(lladdr)); > + lladdr.sll_family = AF_PACKET; > + lladdr.sll_protocol = htons(ETH_P_ALL); > + lladdr.sll_ifindex = req.ifr_ifindex; > + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); > + if (ret < 0) > + fprintf(stderr, "bind failed\n"); > > Error checking is bad here. > + /* set iface to promiscuous mode (packets sent to the VM MAC) */ > + if (promisc) { > + ret = ioctl(fd, SIOCGIFFLAGS, &req); > + if (ret < 0) > + perror("SIOCGIFFLAGS failed\n"); > + req.ifr_flags |= IFF_PROMISC; > + ret = ioctl(fd, SIOCSIFFLAGS, &req); > + if (ret < 0) > + fprintf(stderr, "SIOCSIFFLAGS to promiscous failed\n"); > + } > I suspect these ioctls are Linux specific. > + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); > + if (ret < 0) > + fprintf(stderr, "O_NONBLOCK set failed\n"); > + > + return fd; > +} > + > +static void raw_cleanup(VLANClientState *vc) > +{ > + struct ifreq req; > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + if (s->promisc) { > + ioctl(s->fd, SIOCGIFFLAGS, &req); > + req.ifr_flags &= ~IFF_PROMISC; > + ioctl(s->fd, SIOCSIFFLAGS, &req); > + } > + close(s->fd); > + qemu_free(s); > +} > + > +static void raw_send(void *opaque); > + > +static int raw_can_send(void *opaque) > +{ > + RAWState *s = opaque; > + > + return qemu_can_send_packet(s->vc); > +} > + > +static void raw_send_completed(VLANClientState *vc, ssize_t len) > +{ > + RAWState *s = vc->opaque; > + > + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > +} > + > +static void raw_send(void *opaque) > +{ > + RAWState *s = opaque; > + int size; > + > + do { > + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > + if (size <= 0) > + break; > Need to handle EINTR. > + size = qemu_send_packet_async(s->vc, s->buf, size, > + raw_send_completed); > + if (size == 0) > + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > + > + } while (size > 0); > +} > + > +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, > + int iovcnt) > +{ > + ssize_t len; > + RAWState *s = vc->opaque; > + > + do { > + len = writev(s->fd, iov, iovcnt); > + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); > Spinning on EAGAIN is certainly wrong. > +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, > + const char *name, const char *ifname, > + int promisc, int fd) > +{ > + RAWState *s; > + > + s = qemu_mallocz(sizeof(RAWState)); > + > + if (fd == -1) { > + s->fd = net_raw_fd_init(mon, ifname, promisc); > + s->promisc = promisc; > + } else > + s->fd = fd; > + > + fcntl(s->fd, F_SETFL, O_NONBLOCK); > For net_raw_fd_int, you've already set O_NONBLOCK but you're also removing any other flags that have been set which is probably wrong for a passed in fd. > + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, > diff --git a/qemu-options.hx b/qemu-options.hx > index bde3e3f..0d5440f 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -825,6 +825,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n" > #endif > #endif > + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > + " bound the host network interface to VLAN 'n' in a raw manner:\n" > + " packets received on the interface are delivered to the vlan and\n" > + " packets delivered on the vlan are sent to the interface\n" > "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > " connect the vlan 'n' to another VLAN using a socket connection\n" > "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" > Needs documentation. Regards, Anthony Liguori
Anthony Liguori wrote: > Or Gerlitz wrote: > >Add raw network backend option which uses a packet socket to provide > >raw networking access. Once the socket is opened it's bound to a > >provided host interface, such that packets received on the interface > >are delivered to the VM and packets sent by the VM are sent to the > >interface. > > > >This is functionally similar to the existing pcap network > >backend, with the same advantages and problems. > >Differences from pcap: > >- can get an open socket from the monitor, > > which allows running without NET_ADMIN priviledges > >- support iovec sends with writev, saving one data copy > >- one less dependency on an external library > >- we have access to the underlying file descriptor > > which makes it possible to connect to vhost net > >- don't support polling all interfaces, always bind to a specific one > > > > Networking is probably the area in qemu that users most frequently > stumble with. The most common problems are: > > 1) slirp does not behave how they think it should (icmp doesn't work, > guest isn't accessable from host) > 2) it's difficult to figure out which backend behaves the way they want > (socket vs. vde vs. tap) > 3) when they figure out they need tap, tap is difficult to setup Worse, tap is impossible to setup properly with things like network-manager. > The problem with introducing a raw backend (or a pcap backend) is that > it makes #2 even worse because now a user has to figure out whether they > need raw/pcap or tap. But most troubling, it introduces another issue: > > 4) raw does not behave how they think it should because guest<->host > networking does not work bidirectionally I suspect user expectations are quite commonly: - guest<->host networking works - guest<->host's network works, directly or through host NAT - guest IP address is either private (inside the host) or on the same network as the host, according to some switch. Imho, there is only one right place to fix this, and it's by adding a feature to the host. Either modifying host packet socket, or modifying the tap+bridge combination. Neither tap nor pcap/raw works particularly well except in static IP configurations, and qemu cannot realistically work around the host configuration difficulties. > So unless there's an extremely compelling reason to have this, I'd > rather not introduce this complexity. NB, I see this as a problem with > vhost_net too if #4 is also true in that context. It'd be great if vhost_net doesn't have the configuration problems of tap or pcap/raw. If it does have the same problems, it's a natural place to fix them. I haven't looked at vhost_net yet. -- Jamie
On Wed, Oct 14, 2009 at 09:46:33AM -0500, Anthony Liguori wrote: > Or Gerlitz wrote: >> Add raw network backend option which uses a packet socket to provide >> raw networking access. Once the socket is opened it's bound to a >> provided host interface, such that packets received on the interface >> are delivered to the VM and packets sent by the VM are sent to the >> interface. >> >> This is functionally similar to the existing pcap network >> backend, with the same advantages and problems. >> Differences from pcap: >> - can get an open socket from the monitor, >> which allows running without NET_ADMIN priviledges >> - support iovec sends with writev, saving one data copy >> - one less dependency on an external library >> - we have access to the underlying file descriptor >> which makes it possible to connect to vhost net >> - don't support polling all interfaces, always bind to a specific one >> > > Networking is probably the area in qemu that users most frequently > stumble with. The most common problems are: > > 1) slirp does not behave how they think it should (icmp doesn't work, > guest isn't accessable from host) > 2) it's difficult to figure out which backend behaves the way they want > (socket vs. vde vs. tap) > 3) when they figure out they need tap, tap is difficult to setup tap does not need any setup. problem is, bridge needs setup. > The problem with introducing a raw backend (or a pcap backend) > is that > it makes #2 even worse because now a user has to figure out whether they > need raw/pcap or tap. But most troubling, it introduces another issue: > > 4) raw does not behave how they think it should because guest<->host > networking does not work bidirectionally OTOH icmp works fine, so we are not worse off than with slirp. > So unless there's an extremely compelling reason to have this, I work with remote machines all the time, having to fiddle with host bridge/network setup means I always risk losing the only way to admin the machine. So it's slirp or raw for me. If I'm the only one like this, I can keep maintaining this patch, but I doubt it. I consider this a compelling reason. > I'd rather not introduce this complexity. Does another option really add that much complexity? We add options all the time ... > NB, I see this as a problem with > vhost_net too if #4 is also true in that context. It's up to user whether to connect vhost net to tap or socket. I haven't posted userspace code to connect vhost to tap yet but I will RSN. >> Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com> >> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> >> --- >> hw/virtio-net.c | 3 +- >> net.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> qemu-options.hx | 4 + >> 3 files changed, 198 insertions(+), 1 deletions(-) >> >> diff --git a/hw/virtio-net.c b/hw/virtio-net.c >> index 1ac05a2..95d9f93 100644 >> --- a/hw/virtio-net.c >> +++ b/hw/virtio-net.c >> @@ -545,7 +545,8 @@ static ssize_t virtio_net_receive2(VLANClientState *vc, const uint8_t *buf, size >> virtqueue_pop(n->rx_vq, &elem) == 0) { >> if (i == 0) >> return -1; >> - fprintf(stderr, "virtio-net truncating packet\n"); >> + fprintf(stderr, "virtio-net truncating packet. offset %zd size %zd\n", >> + offset, size); >> exit(1); >> } >> > > This doesn't belong here. >> diff --git a/net.c b/net.c >> index d93eaef..1e0e874 100644 >> --- a/net.c >> +++ b/net.c >> @@ -93,6 +93,9 @@ >> #endif >> #endif >> +#include <netpacket/packet.h> >> +#include <net/ethernet.h> >> + >> > > This is certainly missing guards. > >> #if defined(__OpenBSD__) >> #include <util.h> >> #endif >> @@ -1860,6 +1863,158 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, >> #endif /* !_WIN32 */ >> +typedef struct RAWState { >> + VLANClientState *vc; >> + int fd; >> + uint8_t buf[4096]; >> + int promisc; >> +} RAWState; >> + >> +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) >> +{ >> + int fd, ret; >> + struct ifreq req; >> + struct sockaddr_ll lladdr; >> + >> + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); >> + if (fd < 0) >> + fprintf(stderr, "packet socket failed\n"); >> > > CodingStyle > > Also, this error checking should use the monitor error reporting > framework. And falling through with fd=-1 certainly means we'll SEGV or > worse further down. > >> + memset(&req, 0, sizeof(req)); >> + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); >> > > Would be better to just use snprintf > >> + ret = ioctl(fd, SIOCGIFINDEX, &req); >> + if (ret < 0) >> + fprintf(stderr, "SIOCGIFINDEX failed\n"); >> + >> + memset(&lladdr, 0, sizeof(lladdr)); >> + lladdr.sll_family = AF_PACKET; >> + lladdr.sll_protocol = htons(ETH_P_ALL); >> + lladdr.sll_ifindex = req.ifr_ifindex; >> + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); >> + if (ret < 0) >> + fprintf(stderr, "bind failed\n"); >> >> > > Error checking is bad here. > >> + /* set iface to promiscuous mode (packets sent to the VM MAC) */ >> + if (promisc) { >> + ret = ioctl(fd, SIOCGIFFLAGS, &req); >> + if (ret < 0) >> + perror("SIOCGIFFLAGS failed\n"); >> + req.ifr_flags |= IFF_PROMISC; >> + ret = ioctl(fd, SIOCSIFFLAGS, &req); >> + if (ret < 0) >> + fprintf(stderr, "SIOCSIFFLAGS to promiscous failed\n"); >> + } >> > > I suspect these ioctls are Linux specific. > >> + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); >> + if (ret < 0) >> + fprintf(stderr, "O_NONBLOCK set failed\n"); >> + >> + return fd; >> +} >> + >> +static void raw_cleanup(VLANClientState *vc) >> +{ >> + struct ifreq req; >> + RAWState *s = vc->opaque; >> + >> + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); >> + if (s->promisc) { >> + ioctl(s->fd, SIOCGIFFLAGS, &req); >> + req.ifr_flags &= ~IFF_PROMISC; >> + ioctl(s->fd, SIOCSIFFLAGS, &req); >> + } >> + close(s->fd); >> + qemu_free(s); >> +} >> + >> +static void raw_send(void *opaque); >> + >> +static int raw_can_send(void *opaque) >> +{ >> + RAWState *s = opaque; >> + >> + return qemu_can_send_packet(s->vc); >> +} >> + >> +static void raw_send_completed(VLANClientState *vc, ssize_t len) >> +{ >> + RAWState *s = vc->opaque; >> + >> + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); >> +} >> + >> +static void raw_send(void *opaque) >> +{ >> + RAWState *s = opaque; >> + int size; >> + >> + do { >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); >> + if (size <= 0) >> + break; >> > > Need to handle EINTR. > >> + size = qemu_send_packet_async(s->vc, s->buf, size, >> + raw_send_completed); >> + if (size == 0) >> + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); >> + >> + } while (size > 0); >> +} >> + >> +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, >> + int iovcnt) >> +{ >> + ssize_t len; >> + RAWState *s = vc->opaque; >> + >> + do { >> + len = writev(s->fd, iov, iovcnt); >> + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); >> > > Spinning on EAGAIN is certainly wrong. > >> +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, >> + const char *name, const char *ifname, >> + int promisc, int fd) >> +{ >> + RAWState *s; >> + >> + s = qemu_mallocz(sizeof(RAWState)); >> + >> + if (fd == -1) { >> + s->fd = net_raw_fd_init(mon, ifname, promisc); >> + s->promisc = promisc; >> + } else >> + s->fd = fd; >> + >> + fcntl(s->fd, F_SETFL, O_NONBLOCK); >> > > For net_raw_fd_int, you've already set O_NONBLOCK but you're also > removing any other flags that have been set which is probably wrong for > a passed in fd. > >> + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, >> diff --git a/qemu-options.hx b/qemu-options.hx >> index bde3e3f..0d5440f 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -825,6 +825,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, >> " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n" >> #endif >> #endif >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" >> + " packets received on the interface are delivered to the vlan and\n" >> + " packets delivered on the vlan are sent to the interface\n" >> "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" >> " connect the vlan 'n' to another VLAN using a socket connection\n" >> "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" >> > > Needs documentation. > > Regards, > > Anthony Liguori
On Wed, Oct 14, 2009 at 05:24:52PM +0200, Michael S. Tsirkin wrote: > On Wed, Oct 14, 2009 at 09:46:33AM -0500, Anthony Liguori wrote: > > Or Gerlitz wrote: > >> Add raw network backend option which uses a packet socket to provide > >> raw networking access. Once the socket is opened it's bound to a > >> provided host interface, such that packets received on the interface > >> are delivered to the VM and packets sent by the VM are sent to the > >> interface. > >> > >> This is functionally similar to the existing pcap network > >> backend, with the same advantages and problems. > >> Differences from pcap: > >> - can get an open socket from the monitor, > >> which allows running without NET_ADMIN priviledges > >> - support iovec sends with writev, saving one data copy > >> - one less dependency on an external library > >> - we have access to the underlying file descriptor > >> which makes it possible to connect to vhost net > >> - don't support polling all interfaces, always bind to a specific one > >> > > > > Networking is probably the area in qemu that users most frequently > > stumble with. The most common problems are: > > > > 1) slirp does not behave how they think it should (icmp doesn't work, > > guest isn't accessable from host) > > 2) it's difficult to figure out which backend behaves the way they want > > (socket vs. vde vs. tap) > > 3) when they figure out they need tap, tap is difficult to setup > > tap does not need any setup. problem is, bridge needs setup. > tap need IP assigning. > > The problem with introducing a raw backend (or a pcap backend) > > is that > > it makes #2 even worse because now a user has to figure out whether they > > need raw/pcap or tap. But most troubling, it introduces another issue: > > > > 4) raw does not behave how they think it should because guest<->host > > networking does not work bidirectionally > > OTOH icmp works fine, so we are not worse off than with slirp. > > > So unless there's an extremely compelling reason to have this, > > I work with remote machines all the time, having to fiddle with host > bridge/network setup means I always risk losing the only way to admin > the machine. So it's slirp or raw for me. If I'm the only one like > this, I can keep maintaining this patch, but I doubt it. > You are not the only one, but slirp works fine for my purposes. > I consider this a compelling reason. > > > I'd rather not introduce this complexity. > > Does another option really add that much complexity? > We add options all the time ... > > > NB, I see this as a problem with > > vhost_net too if #4 is also true in that context. > > It's up to user whether to connect vhost net to tap or socket. > I haven't posted userspace code to connect vhost to tap yet > but I will RSN. > > >> Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com> > >> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > >> --- > >> hw/virtio-net.c | 3 +- > >> net.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> qemu-options.hx | 4 + > >> 3 files changed, 198 insertions(+), 1 deletions(-) > >> > >> diff --git a/hw/virtio-net.c b/hw/virtio-net.c > >> index 1ac05a2..95d9f93 100644 > >> --- a/hw/virtio-net.c > >> +++ b/hw/virtio-net.c > >> @@ -545,7 +545,8 @@ static ssize_t virtio_net_receive2(VLANClientState *vc, const uint8_t *buf, size > >> virtqueue_pop(n->rx_vq, &elem) == 0) { > >> if (i == 0) > >> return -1; > >> - fprintf(stderr, "virtio-net truncating packet\n"); > >> + fprintf(stderr, "virtio-net truncating packet. offset %zd size %zd\n", > >> + offset, size); > >> exit(1); > >> } > >> > > > > This doesn't belong here. > >> diff --git a/net.c b/net.c > >> index d93eaef..1e0e874 100644 > >> --- a/net.c > >> +++ b/net.c > >> @@ -93,6 +93,9 @@ > >> #endif > >> #endif > >> +#include <netpacket/packet.h> > >> +#include <net/ethernet.h> > >> + > >> > > > > This is certainly missing guards. > > > >> #if defined(__OpenBSD__) > >> #include <util.h> > >> #endif > >> @@ -1860,6 +1863,158 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, > >> #endif /* !_WIN32 */ > >> +typedef struct RAWState { > >> + VLANClientState *vc; > >> + int fd; > >> + uint8_t buf[4096]; > >> + int promisc; > >> +} RAWState; > >> + > >> +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) > >> +{ > >> + int fd, ret; > >> + struct ifreq req; > >> + struct sockaddr_ll lladdr; > >> + > >> + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); > >> + if (fd < 0) > >> + fprintf(stderr, "packet socket failed\n"); > >> > > > > CodingStyle > > > > Also, this error checking should use the monitor error reporting > > framework. And falling through with fd=-1 certainly means we'll SEGV or > > worse further down. > > > >> + memset(&req, 0, sizeof(req)); > >> + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); > >> > > > > Would be better to just use snprintf > > > >> + ret = ioctl(fd, SIOCGIFINDEX, &req); > >> + if (ret < 0) > >> + fprintf(stderr, "SIOCGIFINDEX failed\n"); > >> + > >> + memset(&lladdr, 0, sizeof(lladdr)); > >> + lladdr.sll_family = AF_PACKET; > >> + lladdr.sll_protocol = htons(ETH_P_ALL); > >> + lladdr.sll_ifindex = req.ifr_ifindex; > >> + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); > >> + if (ret < 0) > >> + fprintf(stderr, "bind failed\n"); > >> > >> > > > > Error checking is bad here. > > > >> + /* set iface to promiscuous mode (packets sent to the VM MAC) */ > >> + if (promisc) { > >> + ret = ioctl(fd, SIOCGIFFLAGS, &req); > >> + if (ret < 0) > >> + perror("SIOCGIFFLAGS failed\n"); > >> + req.ifr_flags |= IFF_PROMISC; > >> + ret = ioctl(fd, SIOCSIFFLAGS, &req); > >> + if (ret < 0) > >> + fprintf(stderr, "SIOCSIFFLAGS to promiscous failed\n"); > >> + } > >> > > > > I suspect these ioctls are Linux specific. > > > >> + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); > >> + if (ret < 0) > >> + fprintf(stderr, "O_NONBLOCK set failed\n"); > >> + > >> + return fd; > >> +} > >> + > >> +static void raw_cleanup(VLANClientState *vc) > >> +{ > >> + struct ifreq req; > >> + RAWState *s = vc->opaque; > >> + > >> + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > >> + if (s->promisc) { > >> + ioctl(s->fd, SIOCGIFFLAGS, &req); > >> + req.ifr_flags &= ~IFF_PROMISC; > >> + ioctl(s->fd, SIOCSIFFLAGS, &req); > >> + } > >> + close(s->fd); > >> + qemu_free(s); > >> +} > >> + > >> +static void raw_send(void *opaque); > >> + > >> +static int raw_can_send(void *opaque) > >> +{ > >> + RAWState *s = opaque; > >> + > >> + return qemu_can_send_packet(s->vc); > >> +} > >> + > >> +static void raw_send_completed(VLANClientState *vc, ssize_t len) > >> +{ > >> + RAWState *s = vc->opaque; > >> + > >> + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); > >> +} > >> + > >> +static void raw_send(void *opaque) > >> +{ > >> + RAWState *s = opaque; > >> + int size; > >> + > >> + do { > >> + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); > >> + if (size <= 0) > >> + break; > >> > > > > Need to handle EINTR. > > > >> + size = qemu_send_packet_async(s->vc, s->buf, size, > >> + raw_send_completed); > >> + if (size == 0) > >> + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); > >> + > >> + } while (size > 0); > >> +} > >> + > >> +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, > >> + int iovcnt) > >> +{ > >> + ssize_t len; > >> + RAWState *s = vc->opaque; > >> + > >> + do { > >> + len = writev(s->fd, iov, iovcnt); > >> + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); > >> > > > > Spinning on EAGAIN is certainly wrong. > > > >> +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, > >> + const char *name, const char *ifname, > >> + int promisc, int fd) > >> +{ > >> + RAWState *s; > >> + > >> + s = qemu_mallocz(sizeof(RAWState)); > >> + > >> + if (fd == -1) { > >> + s->fd = net_raw_fd_init(mon, ifname, promisc); > >> + s->promisc = promisc; > >> + } else > >> + s->fd = fd; > >> + > >> + fcntl(s->fd, F_SETFL, O_NONBLOCK); > >> > > > > For net_raw_fd_int, you've already set O_NONBLOCK but you're also > > removing any other flags that have been set which is probably wrong for > > a passed in fd. > > > >> + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, > >> diff --git a/qemu-options.hx b/qemu-options.hx > >> index bde3e3f..0d5440f 100644 > >> --- a/qemu-options.hx > >> +++ b/qemu-options.hx > >> @@ -825,6 +825,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > >> " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n" > >> #endif > >> #endif > >> + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" > >> + " bound the host network interface to VLAN 'n' in a raw manner:\n" > >> + " packets received on the interface are delivered to the vlan and\n" > >> + " packets delivered on the vlan are sent to the interface\n" > >> "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > >> " connect the vlan 'n' to another VLAN using a socket connection\n" > >> "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n" > >> > > > > Needs documentation. > > > > Regards, > > > > Anthony Liguori > -- Gleb.
Jamie Lokier wrote: > Anthony Liguori wrote: > >> Or Gerlitz wrote: >> >>> Add raw network backend option which uses a packet socket to provide >>> raw networking access. Once the socket is opened it's bound to a >>> provided host interface, such that packets received on the interface >>> are delivered to the VM and packets sent by the VM are sent to the >>> interface. >>> >>> This is functionally similar to the existing pcap network >>> backend, with the same advantages and problems. >>> Differences from pcap: >>> - can get an open socket from the monitor, >>> which allows running without NET_ADMIN priviledges >>> - support iovec sends with writev, saving one data copy >>> - one less dependency on an external library >>> - we have access to the underlying file descriptor >>> which makes it possible to connect to vhost net >>> - don't support polling all interfaces, always bind to a specific one >>> >>> >> Networking is probably the area in qemu that users most frequently >> stumble with. The most common problems are: >> >> 1) slirp does not behave how they think it should (icmp doesn't work, >> guest isn't accessable from host) >> 2) it's difficult to figure out which backend behaves the way they want >> (socket vs. vde vs. tap) >> 3) when they figure out they need tap, tap is difficult to setup >> > > Worse, tap is impossible to setup properly with things like > network-manager. > This is being fixed. > I suspect user expectations are quite commonly: > > - guest<->host networking works > - guest<->host's network works, directly or through host NAT > - guest IP address is either private (inside the host) > or on the same network as the host, according to some switch. > > Imho, there is only one right place to fix this, and it's by adding a > feature to the host. Either modifying host packet socket, or > modifying the tap+bridge combination. > > Neither tap nor pcap/raw works particularly well except in static IP > configurations, and qemu cannot realistically work around the > host configuration difficulties. > The fact that network manager does work well with bridged interfaces is a network manager bug. It's getting fixed so in the near future, tap will satisfy all of these requirements. > It'd be great if vhost_net doesn't have the configuration problems of > tap or pcap/raw. If it does have the same problems, it's a natural > place to fix them. I haven't looked at vhost_net yet. > > -- Jamie > Regards, Anthony Liguori
On Wed, Oct 14, 2009 at 04:14:06PM +0100, Jamie Lokier wrote: > Anthony Liguori wrote: > > Or Gerlitz wrote: > > >Add raw network backend option which uses a packet socket to provide > > >raw networking access. Once the socket is opened it's bound to a > > >provided host interface, such that packets received on the interface > > >are delivered to the VM and packets sent by the VM are sent to the > > >interface. > > > > > >This is functionally similar to the existing pcap network > > >backend, with the same advantages and problems. > > >Differences from pcap: > > >- can get an open socket from the monitor, > > > which allows running without NET_ADMIN priviledges > > >- support iovec sends with writev, saving one data copy > > >- one less dependency on an external library > > >- we have access to the underlying file descriptor > > > which makes it possible to connect to vhost net > > >- don't support polling all interfaces, always bind to a specific one > > > > > > > Networking is probably the area in qemu that users most frequently > > stumble with. The most common problems are: > > > > 1) slirp does not behave how they think it should (icmp doesn't work, > > guest isn't accessable from host) > > 2) it's difficult to figure out which backend behaves the way they want > > (socket vs. vde vs. tap) > > 3) when they figure out they need tap, tap is difficult to setup > > Worse, tap is impossible to setup properly with things like > network-manager. > > > The problem with introducing a raw backend (or a pcap backend) is that > > it makes #2 even worse because now a user has to figure out whether they > > need raw/pcap or tap. But most troubling, it introduces another issue: > > > > 4) raw does not behave how they think it should because guest<->host > > networking does not work bidirectionally > > I suspect user expectations are quite commonly: > > - guest<->host networking works > - guest<->host's network works, directly or through host NAT > - guest IP address is either private (inside the host) > or on the same network as the host, according to some switch. > > Imho, there is only one right place to fix this, and it's by adding a > feature to the host. Either modifying host packet socket, or > modifying the tap+bridge combination. Long term, I agree. The patch is just using available interfaces. > Neither tap nor pcap/raw works particularly well except in static IP > configurations, Why don't they? I have a dhcp server happily assigning IPs to all my guests with raw. > and qemu cannot realistically work around the > host configuration difficulties. > > > So unless there's an extremely compelling reason to have this, I'd > > rather not introduce this complexity. NB, I see this as a problem with > > vhost_net too if #4 is also true in that context. > > It'd be great if vhost_net doesn't have the configuration problems of > tap or pcap/raw. > If it does have the same problems, it's a natural > place to fix them. > I haven't looked at vhost_net yet. > > -- Jamie vhost_net is just a character device to accelerate virtio. It currently binds to tap or raw socket and so has the same limitations. Once "bridge sockets" are available we can bind to that.
On Wed, Oct 14, 2009 at 10:58:22AM -0500, Anthony Liguori wrote: > Jamie Lokier wrote: >> Anthony Liguori wrote: >> >>> Or Gerlitz wrote: >>> >>>> Add raw network backend option which uses a packet socket to provide >>>> raw networking access. Once the socket is opened it's bound to a >>>> provided host interface, such that packets received on the interface >>>> are delivered to the VM and packets sent by the VM are sent to the >>>> interface. >>>> >>>> This is functionally similar to the existing pcap network >>>> backend, with the same advantages and problems. >>>> Differences from pcap: >>>> - can get an open socket from the monitor, >>>> which allows running without NET_ADMIN priviledges >>>> - support iovec sends with writev, saving one data copy >>>> - one less dependency on an external library >>>> - we have access to the underlying file descriptor >>>> which makes it possible to connect to vhost net >>>> - don't support polling all interfaces, always bind to a specific one >>>> >>> Networking is probably the area in qemu that users most frequently >>> stumble with. The most common problems are: >>> >>> 1) slirp does not behave how they think it should (icmp doesn't work, >>> guest isn't accessable from host) >>> 2) it's difficult to figure out which backend behaves the way they >>> want (socket vs. vde vs. tap) >>> 3) when they figure out they need tap, tap is difficult to setup >> >> Worse, tap is impossible to setup properly with things like >> network-manager. >> > > This is being fixed. I keep hearing this. But why wait? Some users can be helped by raw today. Those that can't will have to wait for a better bridge or spend time setting it up properly. > >> I suspect user expectations are quite commonly: >> >> - guest<->host networking works >> - guest<->host's network works, directly or through host NAT >> - guest IP address is either private (inside the host) >> or on the same network as the host, according to some switch. >> >> Imho, there is only one right place to fix this, and it's by adding a >> feature to the host. Either modifying host packet socket, or >> modifying the tap+bridge combination. >> >> Neither tap nor pcap/raw works particularly well except in static IP >> configurations, and qemu cannot realistically work around the >> host configuration difficulties. >> > > The fact that network manager does work well with bridged interfaces is > a network manager bug. Network manager was just one example. > It's getting fixed so in the near future, tap > will satisfy all of these requirements. >> It'd be great if vhost_net doesn't have the configuration problems of >> tap or pcap/raw. If it does have the same problems, it's a natural >> place to fix them. I haven't looked at vhost_net yet. >> >> -- Jamie >> > > Regards, > > Anthony Liguori
On Wed, Oct 14, 2009 at 09:46:33AM -0500, Anthony Liguori wrote: > 1) slirp does not behave how they think it should (icmp doesn't work, > guest isn't accessable from host) > 2) it's difficult to figure out which backend behaves the way they want > (socket vs. vde vs. tap) > 3) when they figure out they need tap, tap is difficult to setup This is more or less what we have with sound too, by the way: multiple backends, not one of which works well in all situations. Yes, it would be nice to have a single backend solving all problems, and people are working on it, but I don't think this is an argument that new partial solutions are unacceptable.
Michael S. Tsirkin wrote: > > The fact that network manager does work well with bridged interfaces is > > a network manager bug. > > Network manager was just one example. Even if network manager and all other configuration programs are fixed to handle a bridge, it's still quite fiddly to configure them. I find I have to use - bridges to an uplink interface for some VMs (guest has IP on same network as host, fixed host interfaces, typically in servers) - bridges attached to changing uplink for some VMs (laptop, where the main internet uplink changes between eth0/wlan0/ppp0/bnep0 according to available signals) - no bridge at all (just routing tap, used on all sorts of machines) for VMs needing to run on host-local IPs and talk to the host, and maybe NAT to the internet - bridges not attached to any host interface, just multiple tap interfaces, used to connect guests with host-local IPs that must see each other's broadcasts The corresponding iptables is rather tricky too. > > It's getting fixed so in the near future, tap will satisfy all of > > these requirements. That will be really nice if it does. -- Jamie
On Wed, Oct 14, 2009 at 05:54:47PM +0100, Jamie Lokier wrote: > Michael S. Tsirkin wrote: > > > The fact that network manager does work well with bridged interfaces is > > > a network manager bug. > > > > Network manager was just one example. > > Even if network manager and all other configuration programs are fixed > to handle a bridge, it's still quite fiddly to configure them. > > I find I have to use > > - bridges to an uplink interface for some VMs (guest has IP on same > network as host, fixed host interfaces, typically in servers) Another similar setup of interest is when you have dhcp server and want both VMs and host to get IP from there. > - bridges attached to changing uplink for some VMs (laptop, where > the main internet uplink changes between eth0/wlan0/ppp0/bnep0 > according to available signals) > > - no bridge at all (just routing tap, used on all sorts of machines) > for VMs needing to run on host-local IPs and talk to the host, > and maybe NAT to the internet > > - bridges not attached to any host interface, just multiple tap > interfaces, used to connect guests with host-local IPs that must > see each other's broadcasts > > The corresponding iptables is rather tricky too. Another issue is that host will often lose networking while you fiddle with bridge connected to an uplink. So doing this if your only way to access the box is through the network is tricky. > > > It's getting fixed so in the near future, tap will satisfy all of > > > these requirements. > > That will be really nice if it does. > > -- Jamie
Michael S. Tsirkin wrote: > I keep hearing this. But why wait? Some users can be helped > by raw today. Those that can't will have to wait for a better bridge > or spend time setting it up properly. > Because for every user who understands the limitations of the raw socket interface and is willing to accept them in order to avoid setting up a bridge, there will be a dozen who's experience will be worse because they tried the raw socket interface and it behaved in a way that they didn't expect. We aren't talking about adding a feature here. We're talking about adding something that's supposed to be "easier" but I'm arguing that it's not easier unless you're a kernel hacker and deeply understand the limitations of the implementation. Regards, Anthony Liguori
On Wed, Oct 14, 2009 at 01:36:33PM -0500, Anthony Liguori wrote: > We're talking about > adding something that's supposed to be "easier" but I'm arguing that > it's not easier unless you're a kernel hacker I guess it's hard for a kernel hacker to argue :) > and deeply understand the > limitations of the implementation. If we address host to guest, it'd become worthwhile then?
Gleb Natapov wrote:
> tap need IP assigning.
Hi Gleb, please correct me if I'm wrong, but the way we use tap here is attach it to a bridge from the one hand, and have Qemu write/read into/from it Ethernet frames from the other hand. This means that even the MAC address of tap isn't used, surely not an IP address.
Or.
On Thu, Oct 15, 2009 at 09:29:34AM +0200, Or Gerlitz wrote: > Gleb Natapov wrote: > > tap need IP assigning. > > Hi Gleb, please correct me if I'm wrong, but the way we use tap here is attach it to a bridge from the one hand, and have Qemu write/read into/from it Ethernet frames from the other hand. This means that even the MAC address of tap isn't used, surely not an IP address. > Yes, you are right. I don't use tap usually so I don't remember. What I really mean is that guest needs IP assigning, so this means I need external DHCP setup. Slirp has (almost) everything you ever need from networking and with zero config: dhcp address assigning, NATing your traffic and as a bonus no need to run as root. Hard to beat that if you don't need performance. Raw requires root and external dhcp so it lose two point to slirp and it doesn't win any for my use case. But we have a lot of crap^W features in qemu anyway so don't see why not add this too. -- Gleb.
Anthony Liguori wrote: > for every user who understands the limitations of the raw socket > interface and is willing to accept them in order to avoid setting up a > bridge, there will be a dozen who's experience will be worse because > they tried the raw socket interface and it behaved in a way that they didn't expect. limitations can be documented > We aren't talking about adding a feature here. We're talking about > adding something that's supposed to be "easier" but I'm arguing that > it's not easier unless you're a kernel hacker and deeply understand the > limitations of the implementation. Anthony, Its surely a feature, you bypass the bridge/tap and go directly to the NIC. Using sr-iov/vf based nic or macvlan, its software counterpart is simply as running /sbin/ip to create the nic in the macvlan case, or reading a short memo in the sr-iov case. Nothing is kernel hacking, e.g this is being done now all over the place by application engineers of the 10g nic vendors, and as eventually all of them are going to support sr-iov in the coming year. I believe that the Qemu developer community should also look into this near future and allow the adoption of a feature that allows for much less state-full configuration compared to a bridge/tap one and provides better performance. The VMs would get their IP from a DHCP server residing on the network and not the host, this should be acceptable for many users. Or. Or. Or.
Gleb Natapov wrote:
> Hard to beat that if you don't need performance
Raw comes to provide performance... think of it as a lightweight vhost
Or.
diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 1ac05a2..95d9f93 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -545,7 +545,8 @@ static ssize_t virtio_net_receive2(VLANClientState *vc, const uint8_t *buf, size virtqueue_pop(n->rx_vq, &elem) == 0) { if (i == 0) return -1; - fprintf(stderr, "virtio-net truncating packet\n"); + fprintf(stderr, "virtio-net truncating packet. offset %zd size %zd\n", + offset, size); exit(1); } diff --git a/net.c b/net.c index d93eaef..1e0e874 100644 --- a/net.c +++ b/net.c @@ -93,6 +93,9 @@ #endif #endif +#include <netpacket/packet.h> +#include <net/ethernet.h> + #if defined(__OpenBSD__) #include <util.h> #endif @@ -1860,6 +1863,158 @@ static TAPState *net_tap_init(VLANState *vlan, const char *model, #endif /* !_WIN32 */ +typedef struct RAWState { + VLANClientState *vc; + int fd; + uint8_t buf[4096]; + int promisc; +} RAWState; + +static int net_raw_fd_init(Monitor *mon, const char *ifname, int promisc) +{ + int fd, ret; + struct ifreq req; + struct sockaddr_ll lladdr; + + fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (fd < 0) + fprintf(stderr, "packet socket failed\n"); + + memset(&req, 0, sizeof(req)); + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); + ret = ioctl(fd, SIOCGIFINDEX, &req); + if (ret < 0) + fprintf(stderr, "SIOCGIFINDEX failed\n"); + + memset(&lladdr, 0, sizeof(lladdr)); + lladdr.sll_family = AF_PACKET; + lladdr.sll_protocol = htons(ETH_P_ALL); + lladdr.sll_ifindex = req.ifr_ifindex; + ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)); + if (ret < 0) + fprintf(stderr, "bind failed\n"); + + /* set iface to promiscuous mode (packets sent to the VM MAC) */ + if (promisc) { + ret = ioctl(fd, SIOCGIFFLAGS, &req); + if (ret < 0) + perror("SIOCGIFFLAGS failed\n"); + req.ifr_flags |= IFF_PROMISC; + ret = ioctl(fd, SIOCSIFFLAGS, &req); + if (ret < 0) + fprintf(stderr, "SIOCSIFFLAGS to promiscous failed\n"); + } + + ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); + if (ret < 0) + fprintf(stderr, "O_NONBLOCK set failed\n"); + + return fd; +} + +static void raw_cleanup(VLANClientState *vc) +{ + struct ifreq req; + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + if (s->promisc) { + ioctl(s->fd, SIOCGIFFLAGS, &req); + req.ifr_flags &= ~IFF_PROMISC; + ioctl(s->fd, SIOCSIFFLAGS, &req); + } + close(s->fd); + qemu_free(s); +} + +static void raw_send(void *opaque); + +static int raw_can_send(void *opaque) +{ + RAWState *s = opaque; + + return qemu_can_send_packet(s->vc); +} + +static void raw_send_completed(VLANClientState *vc, ssize_t len) +{ + RAWState *s = vc->opaque; + + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); +} + +static void raw_send(void *opaque) +{ + RAWState *s = opaque; + int size; + + do { + size = recv(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC); + if (size <= 0) + break; + + size = qemu_send_packet_async(s->vc, s->buf, size, + raw_send_completed); + if (size == 0) + qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL); + + } while (size > 0); +} + +static ssize_t raw_receive_iov(VLANClientState *vc, const struct iovec *iov, + int iovcnt) +{ + ssize_t len; + RAWState *s = vc->opaque; + + do { + len = writev(s->fd, iov, iovcnt); + } while (len == -1 && (errno == EINTR || errno == EAGAIN)); + + return len; +} + +static ssize_t raw_receive(VLANClientState *vc, const uint8_t *buf, size_t size) +{ + struct iovec iov[1]; + + iov[0].iov_base = (char *)buf; + iov[0].iov_len = size; + + return raw_receive_iov(vc, iov, 1); +} + +static int net_raw_init(Monitor *mon, VLANState *vlan, const char *model, + const char *name, const char *ifname, + int promisc, int fd) +{ + RAWState *s; + + s = qemu_mallocz(sizeof(RAWState)); + + if (fd == -1) { + s->fd = net_raw_fd_init(mon, ifname, promisc); + s->promisc = promisc; + } else + s->fd = fd; + + fcntl(s->fd, F_SETFL, O_NONBLOCK); + + s->vc = qemu_new_vlan_client(vlan, model, name, NULL, raw_receive, + raw_receive_iov, raw_cleanup, s); + qemu_set_fd_handler2(s->fd, raw_can_send, raw_send, NULL, s); + + if (fd == -1) + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: ifname=%s, promisc=%d", ifname, promisc); + else + snprintf(s->vc->info_str, sizeof(s->vc->info_str), + "raw: fd=%d", fd); + + vlan->nb_host_devs++; + return 0; +} + #if defined(CONFIG_VDE) typedef struct VDEState { VLANClientState *vc; @@ -2625,6 +2780,23 @@ static int net_init_nic(QemuOpts *opts, Monitor *mon) return idx; } +static int net_init_raw(QemuOpts *opts, Monitor *mon) +{ + VLANState *vlan; + int fd = -1; + vlan = qemu_find_vlan(qemu_opt_get_number(opts, "vlan", 0), 1); + if (qemu_opt_get(opts, "fd")) { + fd = net_handle_fd_param(mon, qemu_opt_get(opts, "fd")); + if (fd < 0) + return -EINVAL; + } + return net_raw_init(mon, vlan, "raw", + qemu_opt_get(opts, "name"), + qemu_opt_get(opts, "ifname"), + qemu_opt_get_bool(opts, "promisc", 0), + fd); +} + static int net_init_slirp_configs(const char *name, const char *value, void *opaque) { struct slirp_config_str *config; @@ -3133,6 +3305,26 @@ static struct { }, { /* end of list */ } }, + }, { + .type = "raw", + .init = net_init_raw, + .desc = { + NET_COMMON_PARAMS_DESC, + { + .name = "fd", + .type = QEMU_OPT_STRING, + .help = "file descriptor of an already opened raw socket", + }, { + .name = "ifname", + .type = QEMU_OPT_STRING, + .help = "interface name", + }, { + .name = "promisc", + .type = QEMU_OPT_BOOL, + .help = "enable promiscious mode at startup", + }, + { /* end of list */ } + }, #ifdef CONFIG_VDE }, { .type = "vde", diff --git a/qemu-options.hx b/qemu-options.hx index bde3e3f..0d5440f 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -825,6 +825,10 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n" #endif #endif + "-net raw[,vlan=n][,name=str],ifname=name[,promisc=m]\n" + " bound the host network interface to VLAN 'n' in a raw manner:\n" + " packets received on the interface are delivered to the vlan and\n" + " packets delivered on the vlan are sent to the interface\n" "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" " connect the vlan 'n' to another VLAN using a socket connection\n" "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port]\n"